In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression, Lasso
import numpy as np

In [28]:
# --- Configuration for loading and splitting ---------------------------------
DATA_PATH = '/mnt/ncshare/ozkilim/BRCA/data/tasks/combined_genomic_plat_responce.csv'
TRAIN_FRACTION = 0.9  # share of case_id groups assigned to the training set
SPLIT_SEED = 42       # fixed seed so the train/test split is identical on every run

df = pd.read_csv(DATA_PATH)  # Or load your DataFrame by other means

feature_cols = ["Signature.1", "Signature.2", "Signature.3", "Signature.5", "Signature.8", "Signature.13", "Microhomology2", "Microhomology2ratio", "Del/ins-ratio", "Del10-ratio", "HRD-LOH", "Telomeric.AI", "LST", "DBS2", "DBS4", "DBS5", "DBS6", "DBS9", "SBS1", "SBS2", "SBS3", "SBS5", "SBS8", "SBS13", "SBS18", "SBS26", "SBS35", "SBS38", "SBS39", "SBS40", "SBS41", "ID1", "ID2", "ID4", "ID8", "HRDetect"]

# Split by case rather than by row: every row that shares a case_id is kept on
# the same side of the split, so no case contributes to both train and test.
grouped = df.groupby('case_id')
groups = [grouped.get_group(x) for x in grouped.groups]

# Shuffle the groups with a seeded generator (reproducible, unlike the global
# unseeded np.random state).
split_rng = np.random.default_rng(SPLIT_SEED)
groups = [groups[i] for i in split_rng.permutation(len(groups))]

# Split the groups: TRAIN_FRACTION for training, the remainder for testing
split_index = int(len(groups) * TRAIN_FRACTION)
train_groups = groups[:split_index]
test_groups = groups[split_index:]

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not train_groups or not test_groups:
    raise ValueError(
        f"Cannot split {len(groups)} case_id group(s) with TRAIN_FRACTION={TRAIN_FRACTION}: "
        "one side of the split would be empty."
    )

# Concatenate the groups back into DataFrames
train_df = pd.concat(train_groups)
test_df = pd.concat(test_groups)

# Sanity checks: no case leaks across the split and no grouped row was lost.
assert set(train_df['case_id']).isdisjoint(test_df['case_id'])
assert len(train_df) + len(test_df) == sum(len(g) for g in groups)

X_train = train_df[feature_cols]
y_train = train_df['DSS']
X_test = test_df[feature_cols]
y_test = test_df['DSS']


In [29]:
def train_evaluate_model(model, X_train, y_train, X_test, y_test, show_predictions=True):
    """Fit ``model`` on the training data and report AUC and F1 on the test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features and labels used for scoring.
    show_predictions : bool, default True
        When True, print the raw array of predicted labels before the metrics
        (the original behaviour). Pass False to keep the output compact.

    Returns
    -------
    dict
        ``{'model': <class name>, 'auc': <float>, 'f1': <float>}`` so callers
        can collect and compare results instead of reading them off stdout.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class score.
    # NOTE(review): this assumes a binary target -- confirm for 'DSS'.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_proba)
    if show_predictions:
        print(y_pred)
    f1 = f1_score(y_test, y_pred)
    model_name = model.__class__.__name__
    print(f'Model: {model_name}')
    print(f'AUC Score: {auc_score}')
    print(f'F1 Score: {f1}\n')
    return {'model': model_name, 'auc': auc_score, 'f1': f1}

# Seed passed to every estimator that accepts random_state, so repeated runs of
# this cell report the same scores.
MODEL_SEED = 42

# List of models to train
models = [
    RandomForestClassifier(random_state=MODEL_SEED),
    # With probability=True, SVC's probability calibration shuffles the data
    # internally, so it needs a seed too.
    SVC(probability=True, random_state=MODEL_SEED),
    GradientBoostingClassifier(random_state=MODEL_SEED),
    KNeighborsClassifier(),
    LogisticRegression(),
    # Lasso is intentionally left out: it is a regressor with no predict_proba,
    # which train_evaluate_model calls.
]

# Train and evaluate each model
for model in models:
    train_evaluate_model(model, X_train, y_train, X_test, y_test)

[0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
Model: RandomForestClassifier
AUC Score: 0.49455337690631807
F1 Score: 0.45

[0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1.]
Model: SVC
AUC Score: 0.49455337690631807
F1 Score: 0.5

[1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
Model: GradientBoostingClassifier
AUC Score: 0.381263616557