In [233]:
import numpy as np
import pandas as pd
from google.colab import files
from joblib import Parallel, delayed
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import ParameterGrid, StratifiedKFold, cross_validate
from sklearn.svm import SVC

In [234]:
def compute_sample_weights(df, label_col='label', taxa_col='taxa', meiosis_taxa=None):
    """
    Compute per-row sample weights that upweight meiosis proteins from selected taxa.

    Rows whose label equals 1 and whose taxa value is in ``meiosis_taxa`` receive
    the global class ratio (number of label-0 rows / number of label-1 rows).
    Every other row receives a weight of 1.0.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        label_col (str): Name of the label column (1 = meiosis, 0 = non-meiosis).
        taxa_col (str): Name of the column with taxonomic group info.
        meiosis_taxa (list or set): Taxa considered for upweighting. Defaults to
            {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}.

    Returns:
        pd.Series: Float sample weights aligned with df rows (same index).
    """
    if meiosis_taxa is None:
        meiosis_taxa = {'chordates', 'arthropods', 'fungi', 'plants', 'other animals'}

    labels = df[label_col]
    is_meiosis = labels == 1
    n_meiosis = int(is_meiosis.sum())
    n_non_meiosis = int((labels == 0).sum())

    # Without any positive row the ratio is undefined, and no row could receive
    # it anyway, so use a neutral weight instead of dividing by zero.
    global_weight = n_non_meiosis / n_meiosis if n_meiosis else 1.0

    # Vectorized replacement for the row-by-row loop: a single boolean mask
    # selects the rows that get the class-ratio weight.
    upweighted = is_meiosis & df[taxa_col].isin(list(meiosis_taxa))
    weights = np.where(upweighted, global_weight, 1.0)

    return pd.Series(weights, index=df.index, dtype=float)


In [32]:
def tune_rbf_svm(X_train, y_train, sample_weights=None, param_grid=None,
                 n_splits=10, random_state=42, n_jobs=-1):
    """
    Grid-search C and gamma for an RBF-kernel SVC using stratified K-fold F1.

    Each parameter combination is scored with a manual cross-validation loop so
    that per-sample weights can be sliced to the training part of every fold.
    Combinations are evaluated in parallel with joblib.

    Parameters:
        X_train (np.ndarray): Training features (n_samples, n_features).
            pandas objects are converted to NumPy arrays.
        y_train (np.ndarray): Labels (0/1)
        sample_weights (np.ndarray or pd.Series): Optional sample weights, one
            per training row in row order. When omitted, the SVC is built with
            class_weight='balanced' instead.
        param_grid (dict): Hyperparameter grid with 'C' and 'gamma' entries
        n_splits (int): CV folds
        random_state (int): Random seed
        n_jobs (int): CPU cores for parallelization (-1 = all)

    Returns:
        dict: Best parameters {'C': ..., 'gamma': ...}
        float: Best F1 score (mean over folds)

    Raises:
        ValueError: If sample_weights does not have one entry per label.
    """
    if param_grid is None:
        param_grid = {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'gamma': [1e-4, 1e-3, 1e-2, 0.1, 1, 10, 'scale', 'auto'],
        }

    # StratifiedKFold yields positional indices. Convert pandas inputs so the
    # `[idx]` lookups below are positional as well; on a pd.Series they would
    # be label-based and could pick the wrong rows or raise KeyError.
    if hasattr(X_train, "to_numpy"):
        X_train = X_train.to_numpy()
    y_train = np.asarray(y_train)
    if sample_weights is not None:
        sample_weights = np.asarray(sample_weights)
        if len(sample_weights) != len(y_train):
            raise ValueError(
                f"sample_weights has {len(sample_weights)} entries but "
                f"y_train has {len(y_train)}"
            )

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    best_score = -np.inf
    best_params = {}

    def evaluate_params_safe(params):
        """Return (mean F1 over the CV folds, params) for one grid point."""
        f1s = []
        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train[train_idx], X_train[val_idx]
            y_tr, y_val = y_train[train_idx], y_train[val_idx]

            # Only the training rows are weighted; validation F1 is unweighted.
            w_tr = sample_weights[train_idx] if sample_weights is not None else None

            model = SVC(
                kernel='rbf',
                C=params['C'],
                gamma=params['gamma'],
                random_state=random_state,
                # Explicit sample weights replace the automatic class balancing.
                class_weight='balanced' if sample_weights is None else None
            )

            model.fit(X_tr, y_tr, sample_weight=w_tr)
            y_pred = model.predict(X_val)
            f1s.append(f1_score(y_val, y_pred))

        avg_f1 = np.mean(f1s)
        print(f"C={params['C']:.3f}, gamma={params['gamma']}, F1={avg_f1:.4f}")
        return avg_f1, params

    results = Parallel(n_jobs=n_jobs)(
        delayed(evaluate_params_safe)(params)
        for params in ParameterGrid(param_grid)
    )

    # Strict '>' keeps the first grid point among ties.
    for score, params in results:
        if score > best_score:
            best_score = score
            best_params = params

    print(f"\n🎯 Best: C={best_params['C']}, gamma={best_params['gamma']}, F1={best_score:.4f}")
    return best_params, best_score


In [314]:
# NOTE(review): later cells read `df` and `sample_weights`, but the only
# assignments visible in this notebook are the two commented-out lines below.
# On a fresh kernel (Restart & Run All) those names are therefore undefined.
# Re-enable these lines, or load the training frame elsewhere, before running on.
# NOTE(review): this path names "rec8_esm_100" while the test data is read from
# 'mrmr_selected_test_spo11_aa_50.csv' — confirm the two files are a matching pair.
# df = pd.read_csv("mrmr_selected_train_rec8_esm_100.csv")
# sample_weights=compute_sample_weights(df)

In [317]:
# Training frame layout: first column is the ID, the last two columns are
# taxa and label, and everything in between is a feature.
id_col, taxa_col, label_col = df.columns[0], df.columns[-2], df.columns[-1]
feature_cols = df.columns[1:-2]  # Exclude ID, taxa, label

# Model inputs as NumPy arrays; the taxa column stays a pandas Series.
taxa = df[taxa_col]
y_train = df[label_col].values
X_train = df[feature_cols].values

In [318]:
# NOTE(review): `best_params` is read when the final model is built, but this
# commented-out call is its only visible assignment, so a fresh kernel has no
# `best_params`. Re-enable it (or load stored parameters) before evaluating.
# best_params, best_score = tune_rbf_svm(X_train=X_train, y_train=y_train, sample_weights=sample_weights)

In [321]:
# Load test data
df_test = pd.read_csv('mrmr_selected_test_spo11_aa_50.csv')

# Extract columns by position: ID first, features in the middle, then the
# last two columns (the training frame uses the same [1:-2] feature slice).
ids = df_test.iloc[:, 0]              # ID column
X_test = df_test.iloc[:, 1:-2].values # Feature matrix (already scaled, per the original note)
y_true = df_test.iloc[:, -1].values   # Label column (last column)

# Fail early with a readable message if train and test were built from
# different feature selections.
assert X_test.shape[1] == X_train.shape[1], (
    f"test set has {X_test.shape[1]} features but the training set has "
    f"{X_train.shape[1]}"
)

# Train model on full training data using best hyperparameters
if isinstance(best_params, tuple):  # Unpack if needed
    best_params = best_params[0]

model = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
model.fit(X_train, y_train, sample_weight=sample_weights)

# Predict. This line must run: the metrics below read `y_pred`, and without it
# they either raise NameError or silently score predictions left in the kernel.
y_pred = model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print metrics
print(f"✅ Accuracy:  {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall:    {recall:.4f}")
print(f"✅ F1 Score:  {f1:.4f}")