In [None]:
def compute_sample_weights(df, label_col='label', taxa_col='taxa', meiosis_taxa=None):
    """
    Compute per-row sample weights that upweight meiosis proteins from selected taxa.

    A row gets the global class ratio ``n_non_meiosis / n_meiosis`` when its label
    equals 1 AND its taxon is in ``meiosis_taxa``. Every other row (label 0, or
    label 1 from a taxon outside ``meiosis_taxa``) gets a weight of 1.0.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        label_col (str): Name of the label column (1 = meiosis, 0 = non-meiosis).
        taxa_col (str): Name of the column with taxonomic group info.
        meiosis_taxa (list or set): Taxa eligible for upweighting. Defaults to
            {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}.

    Returns:
        pd.Series: Float sample weights aligned with df rows (same index as df).
            When df has no row labelled 1 the class ratio is undefined, so every
            weight is 1.0.
    """
    if meiosis_taxa is None:
        meiosis_taxa = {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}

    labels = df[label_col]
    is_meiosis = labels == 1
    n_meiosis = int(is_meiosis.sum())
    n_non_meiosis = int((labels == 0).sum())

    # Start from the neutral weight; only eligible positives are changed below.
    weights = pd.Series(1.0, index=df.index, dtype='float64')
    if n_meiosis == 0:
        # No positives: nothing to upweight and the ratio would divide by zero.
        return weights

    global_weight = n_non_meiosis / n_meiosis  # global class weight

    # Use a positional boolean mask so a non-unique index cannot break alignment.
    upweighted = (is_meiosis & df[taxa_col].isin(meiosis_taxa)).to_numpy()
    weights.loc[upweighted] = global_weight
    return weights

In [1]:
# Hyperparameters recorded from the best trained model.
best_params = dict(
    hidden_dim=128,
    dropout=0.3,
    lr=0.001,
    batch_size=32,
    num_layers=2,
    activation='relu',
    weight_decay=0,
    optimizer='adamW',
)

In [203]:
from google.colab import files
# uploaded=files.upload()

In [204]:
# uploaded=files.upload()

In [205]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ---- 1. Sample Weight Function ----
def compute_sample_weights(df, label_col='label', taxa_col='taxa', meiosis_taxa=None):
    """
    Compute sample weights to handle class imbalance, upweighting known meiosis taxa.

    Rows with label 1 whose taxon is in ``meiosis_taxa`` get the class ratio
    ``n_non_meiosis / n_meiosis``; all other rows get 1.0.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        label_col (str): Name of the label column (1 = meiosis, 0 = non-meiosis).
        taxa_col (str): Name of the column with taxonomic group info.
        meiosis_taxa (list or set): Taxa eligible for upweighting. Defaults to
            {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}.

    Returns:
        pd.Series: Float sample weights sharing df's index. If df has no row
            labelled 1, the ratio is undefined and every weight is 1.0.
    """
    if meiosis_taxa is None:
        meiosis_taxa = {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}

    labels = df[label_col]
    is_meiosis = labels == 1
    n_meiosis = int(is_meiosis.sum())
    n_non_meiosis = int((labels == 0).sum())

    # Neutral weight everywhere; eligible positives are overwritten below.
    weights = pd.Series(1.0, index=df.index, dtype='float64')
    if n_meiosis == 0:
        # Avoid dividing by zero when there are no positive samples.
        return weights

    global_weight = n_non_meiosis / n_meiosis  # Upweight positive samples

    # Positional boolean mask: safe even when df.index contains duplicates.
    upweighted = (is_meiosis & df[taxa_col].isin(meiosis_taxa)).to_numpy()
    weights.loc[upweighted] = global_weight
    return weights

# ---- 2. Load Training Data ----
df_train = pd.read_csv("mrmr_selected_train_spo11_aa_50.csv")

# Targets and taxonomic group are looked up by column name.
y_train = df_train['label'].values
taxa_train = df_train['taxa'].values

# Features are taken by position: column 0 and the last two columns are skipped
# (presumably an ID column and the label/taxa columns — TODO confirm against the CSV).
X_train = df_train.iloc[:, 1:-2].values

# Per-sample loss weights as a float32 tensor, in the same row order as df_train.
sample_weights_series = compute_sample_weights(df_train, label_col='label', taxa_col='taxa')
sample_weights = torch.tensor(sample_weights_series.values, dtype=torch.float32)

# ---- 3. Torch Dataset ----
class ProteinDataset(Dataset):
    """Dataset yielding (features, label, sample weight) triples.

    ``X`` is converted to a float32 tensor and ``y`` to an int64 tensor;
    ``weights`` is stored exactly as passed in and indexed per item.
    """

    def __init__(self, X, y, weights):
        self.weights = weights
        self.y = torch.tensor(y, dtype=torch.long)
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        # One sample per row of the feature tensor.
        return self.X.shape[0]

    def __getitem__(self, index):
        features = self.X[index]
        label = self.y[index]
        weight = self.weights[index]
        return features, label, weight

# ---- 4. MLP Model ----
class MLPClassifier(nn.Module):
    """Fully connected classifier that emits two logits per sample.

    Architecture: ``Linear(input_dim, hidden_dim)``, then ``num_layers - 1``
    repeats of ``[activation, Dropout, Linear(hidden_dim, hidden_dim)]``, then
    ``[activation, Dropout, Linear(hidden_dim, 2)]``.

    Parameters:
        input_dim (int): Number of input features.
        hidden_dim (int): Width of every hidden layer.
        num_layers (int): Number of hidden layers; values below 1 behave like 1.
        dropout (float): Dropout probability applied after each activation.
        activation (str): Name of a ``torch.nn`` activation class such as
            'ReLU' or 'Tanh'. An exact attribute name of ``torch.nn`` is used
            as-is; otherwise the name is matched case-insensitively against
            the activation classes (so 'relu', 'tanh', 'gelu' work). Names that
            still cannot be resolved fall back to ReLU with a warning.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout, activation):
        super().__init__()
        make_activation = self._resolve_activation(activation)

        # Each position gets its own activation instance so that parametric
        # activations are not accidentally shared between layers.
        layers = [nn.Linear(input_dim, hidden_dim)]
        for _ in range(num_layers - 1):
            layers += [make_activation(), nn.Dropout(dropout), nn.Linear(hidden_dim, hidden_dim)]
        layers += [make_activation(), nn.Dropout(dropout), nn.Linear(hidden_dim, 2)]
        self.model = nn.Sequential(*layers)

    @staticmethod
    def _resolve_activation(name):
        """Return a zero-argument factory for the activation called ``name``."""
        import warnings  # local import: only needed on the fallback path

        # Exact torch.nn attribute name: same lookup as before.
        if hasattr(nn, name):
            return getattr(nn, name)

        # Case-insensitive match, restricted to torch.nn's activation classes.
        wanted = name.lower()
        for attr, obj in vars(nn).items():
            if (
                attr.lower() == wanted
                and isinstance(obj, type)
                and issubclass(obj, nn.Module)
                and obj.__module__ == "torch.nn.modules.activation"
            ):
                try:
                    obj()
                except TypeError:
                    # Needs constructor arguments; use the fallback instead.
                    break
                return obj

        warnings.warn(
            f"Unknown activation {name!r}; falling back to ReLU.",
            stacklevel=3,
        )
        return nn.ReLU

    def forward(self, x):
        """Return the raw class logits for ``x``."""
        return self.model(x)

# ---- 5. Model Setup ----
# Hyperparameters used for this training run.
best_params = dict(
    hidden_dim=128,
    dropout=0.3,
    num_layers=2,
    activation='ReLU',
    lr=0.001,
    weight_decay=0.0001,
    batch_size=32,
)

# Build the network; its input width follows the training feature matrix.
model = MLPClassifier(
    input_dim=X_train.shape[1],
    hidden_dim=best_params['hidden_dim'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout'],
    activation=best_params['activation'],
)
model = model.to(device)

# reduction='none' keeps one loss value per sample so per-sample weights can be applied.
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=best_params['lr'],
    weight_decay=best_params['weight_decay'],
)

train_dataset = ProteinDataset(X_train, y_train, sample_weights)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=best_params['batch_size'],
)

# ---- 6. Training Loop ----
n_epochs = 50
for epoch in range(n_epochs):
    model.train()
    total_loss = 0  # running sum of the weighted batch losses for this epoch
    for batch_x, batch_y, batch_weights in train_loader:
        # Move the batch onto the same device as the model.
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        batch_weights = batch_weights.to(device)

        # Forward pass: per-sample losses scaled by their weights, then averaged.
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        weighted_loss = torch.mean(loss * batch_weights)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        weighted_loss.backward()
        optimizer.step()

        total_loss += weighted_loss.item()
#    print(f"Epoch {epoch+1}/{n_epochs} - Loss: {total_loss:.4f}")

In [201]:
# ---- 7. Predict Protist Candidates ---- entamoeba, trypanosoma, plasmodium
threshold = 0.95  # Minimum meiosis probability required for a positive call

df_protist = pd.read_csv("selected_spo11_entamoeba_50.csv")

# Column 0 is kept as the identifiers; every remaining column is used as a feature.
protist_ids = df_protist.iloc[:, 0].values
X_protist = df_protist.iloc[:, 1:].values
X_protist_tensor = torch.tensor(X_protist, dtype=torch.float32, device=device)

# Put the model in evaluation mode before scoring.
model.eval()
# with torch.no_grad():
#     logits = model(X_protist_tensor)
#     probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()  # Probability of class 1 (meiosis)

# Apply custom threshold to assign labels
# preds = (probs >= threshold).astype(int)

# Save results
# df_results = pd.DataFrame({
#    "ID": protist_ids,
#    "PredictedLabel": preds,
#    "MeiosisProbability": probs
#})

# df_results.to_csv("protist_spo11_entamoeba_50_predictions_mlp.csv", index=False)

# Save top-scoring predictions, regardless of threshold
# df_results.sort_values("MeiosisProbability", ascending=False).head(50).to_csv("top_spo11_entamoeba_hits_mlp_50.csv", index=False)

# print(f"✅ Protist scan complete with threshold {threshold}. Results saved.")
# print(f"Number of predicted meiosis proteins: {(preds == 1).sum()}")
