In [1]:
from google.colab import files
import pandas as pd
from sklearn.svm import SVC
import numpy as np

In [2]:
def compute_sample_weights(df, label_col='label', taxa_col='taxa', meiosis_taxa=None):
    """
    Compute per-sample weights that upweight meiosis proteins from selected taxa.

    Rows whose label equals 1 (meiosis) AND whose taxon is in ``meiosis_taxa``
    receive the global class-balance weight ``n_non_meiosis / n_meiosis``;
    every other row receives 1.0.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        label_col (str): Name of the label column (1 = meiosis, 0 = non-meiosis).
        taxa_col (str): Name of the column with taxonomic group info.
        meiosis_taxa (list or set): Taxa considered for upweighting. Defaults to
            {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}.

    Returns:
        pd.Series: Float sample weights aligned with df rows (same index as df).
        If df contains no label==1 rows (including an empty df), all weights
        are 1.0 instead of raising ZeroDivisionError.
    """
    if meiosis_taxa is None:
        meiosis_taxa = {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}

    labels = df[label_col]
    is_meiosis = labels == 1
    n_non_meiosis = int((labels == 0).sum())
    n_meiosis = int(is_meiosis.sum())

    # Global class weight. With no positive rows nothing can be upweighted,
    # so fall back to a neutral weight rather than dividing by zero.
    global_weight = n_non_meiosis / n_meiosis if n_meiosis else 1.0

    # Vectorized equivalent of the per-row check:
    #   label == 1 and taxon in meiosis_taxa
    upweight = is_meiosis & df[taxa_col].isin(list(meiosis_taxa))

    # Work on the raw boolean array so a non-unique df.index cannot trigger
    # index alignment; the result is re-attached to df.index afterwards.
    weights = np.where(upweight.to_numpy(), float(global_weight), 1.0)
    return pd.Series(weights, index=df.index)


In [144]:
# Prompt for file(s) via google.colab's `files.upload()` and keep the call's
# return value in `uploaded`.
uploaded=files.upload()

In [143]:
# Second upload prompt (google.colab `files.upload()`); overwrites `uploaded`.
# NOTE(review): the recorded output shows mrmr_selected_train_rec8_aa_100.csv
# being saved, while the training-data cell reads
# mrmr_selected_train_dmc1_aa_100.csv — make sure the file that cell expects
# has actually been uploaded before running it.
uploaded=files.upload()

Saving mrmr_selected_train_rec8_aa_100.csv to mrmr_selected_train_rec8_aa_100.csv


In [363]:
# ---- 2. Load Training Data ----
# Read the training table from CSV.
df_train = pd.read_csv("mrmr_selected_train_dmc1_aa_100.csv")

# Class labels and taxonomic group are taken from their named columns.
y_train = df_train['label'].values
taxa_train = df_train['taxa'].values

# Feature matrix: every column except the first one and the last two.
# NOTE(review): presumably column 0 is the protein ID and the last two columns
# are 'label' and 'taxa' — confirm against the CSV layout.
feature_frame = df_train.iloc[:, 1:-2]
X_train = feature_frame.values

# Per-row weights used when fitting the classifier.
sample_weights = compute_sample_weights(df_train, label_col='label', taxa_col='taxa')

In [364]:
# SVM hyper-parameters consumed by the training cell (keys 'C' and 'gamma').
best_params = dict(C=1, gamma=0.001)

In [371]:
# ---- 1. Load protist data ---- entamoeba, plasmodium, trypanosoma
df_protist = pd.read_csv("selected_dmc1_trypanosoma_100.csv")  # replace with your actual filename
protist_ids = df_protist.iloc[:, 0].values                      # protein IDs (first column)
X_protist = df_protist.iloc[:, 1:].values                      # feature matrix (assumed scaled)

# Fail fast, before the fit, if the protist table does not have the same
# number of feature columns as the training matrix.
if X_protist.shape[1] != X_train.shape[1]:
    raise ValueError(
        f"Feature count mismatch: training has {X_train.shape[1]} features, "
        f"protist data has {X_protist.shape[1]}"
    )

# ---- 2. Train SVM on all training data ----
# best_params may arrive as a tuple whose first element is the parameter dict;
# unwrap it so the key lookups below work either way.
if isinstance(best_params, tuple):
    best_params = best_params[0]

PRED_THRESHOLD = 0.95   # probability cut-off for the hard label; adjust as needed
RANDOM_STATE = 42       # fixes the shuffling behind SVC's probability estimates

model = SVC(
    kernel='rbf',
    C=best_params['C'],
    gamma=best_params['gamma'],
    probability=True,
    random_state=RANDOM_STATE,  # without it, predict_proba can differ between runs
)
model.fit(X_train, y_train, sample_weight=sample_weights)

# ---- 3. Predict probabilities for protist proteins ----
# Look up the predict_proba column for class 1 ("meiosis") from classes_
# instead of assuming it sits at position 1.
meiosis_col = list(model.classes_).index(1)
probs = model.predict_proba(X_protist)[:, meiosis_col]  # probability of class "meiosis"
preds = (probs >= PRED_THRESHOLD).astype(int)

In [372]:
# ---- 4. Save results ----
# Probability cut-off for the exported "hits" table and the printed count.
# The hits file name below ("prob85plus") encodes the same value, so change
# both together.
HIT_THRESHOLD = 0.85

# One row per protist protein. PredictedLabel is the hard label held in
# `preds` (computed in the prediction cell with its own, stricter cut-off);
# MeiosisProbability is the raw class probability held in `probs`.
df_results = pd.DataFrame({
    "ID": protist_ids,
    "PredictedLabel": preds,
    "MeiosisProbability": probs
})

# Save full predictions
df_results.to_csv("trypanosoma_dmc1_predictions_100.csv", index=False)

# Save only predictions with probability >= HIT_THRESHOLD, most confident first
df_hits = (
    df_results[df_results["MeiosisProbability"] >= HIT_THRESHOLD]
    .sort_values("MeiosisProbability", ascending=False)
)
df_hits.to_csv("dmc1_trypanosoma_hits_prob85plus_100.csv", index=False)

# ✅ Print how many proteins passed the hits threshold. The message now reports
# the threshold that is actually used for the count (it previously said 0.95
# while counting at 0.85).
print(f"Number of predicted meiosis proteins (threshold {HIT_THRESHOLD}): {len(df_hits)}")
print("✅ Protist scan complete. Results saved.")


Number of predicted meiosis proteins (threshold 0.95): 19
✅ Protist scan complete. Results saved.


In [373]:
# Hand the hits CSV written by the save-results cell to google.colab's
# `files.download()` so it can be fetched from the Colab runtime.
files.download("dmc1_trypanosoma_hits_prob85plus_100.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Export the high-confidence subset: rows whose meiosis probability is at
# least 0.85, ordered from most to least confident.
# NOTE(review): this reads whatever `df_results` currently holds in the kernel;
# confirm it comes from the entamoeba/MND1 scan before writing this file.
high_conf_mask = df_results["MeiosisProbability"] >= 0.85
high_conf_hits = df_results[high_conf_mask].sort_values("MeiosisProbability", ascending=False)
high_conf_hits.to_csv("mnd1_entamoeba_hits_prob85plus_50.csv", index=False)

In [198]:
# Hand the entamoeba/MND1 hits CSV written by the preceding cell to
# google.colab's `files.download()` so it can be fetched from the Colab runtime.
files.download("mnd1_entamoeba_hits_prob85plus_50.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>