In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
import os 
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from sklearn.ensemble import VotingClassifier
import json

### Train on proteomic data (60 signature proteins), TCGA test. Test on HGSOC.

In [2]:
# Tag appended to the name of the results JSON file written at the end of the notebook.
tissue_type = "primary"

# Table holding one row per slide; presumably the merged HGSOC + TCGA cohorts
# (inferred from the file name -- confirm against the data README).
merged_csv_path = "/mnt/ncshare/ozkilim/BRCA/data/tasks/HGSOC_TCGA_merged_all.csv"
df = pd.read_csv(merged_csv_path, header=0, low_memory=False)

# Quick look at the target column.
df["label"]

0      0
1      0
2      1
3      1
4      1
      ..
691    1
692    1
693    1
694    1
695    1
Name: label, Length: 696, dtype: int64

In [3]:
# The 60 overlapping signature proteins; these names are used as the feature
# columns selected from `df` when fitting and scoring the models.
prots = [
    'RAB25', 'BCL2L1', 'HADH', 'NFKB2', 'COX7A2', 'COX7C',
    'TPMT', 'GOLPH3L', 'LTA4H', 'COX6C', 'IDH1', 'YWHAG',
    'S100A10', 'COX6A1', 'NDUFB3', 'TGM2', 'CDKN1B', 'NFKB1',
    'CAMK2D', 'IL4I1', 'FDX1', 'VCAM1', 'ATM', 'NCAPH2',
    'ABCB8', 'IDI1', 'PLIN2', 'ATP6V1D', 'GPX4', 'CA2',
    'RELA', 'GLUD1', 'TOP3B', 'RPS6KB2', 'KEAP1', 'LGALS1',
    'MTDH', 'AIFM1', 'RHOA', 'CASP7', 'PTGES2', 'TFRC',
    'CHUK', 'GPX1', 'PDK1', 'STAT3', 'PECR', 'TALDO1',
    'XIAP', 'ACADSB', 'CPOX', 'ARNT', 'BIRC2', 'ACOT7',
    'HACL1', 'MYD88', 'EGFR', 'RIPK1', 'NBN', 'LDHA',
]


In [4]:


def get_rf_auc_list(split_folder_path, random_state=None):
    """Fit a voting ensemble once per split file and collect the ROC AUCs.

    For every ``.csv`` file in ``split_folder_path`` the function selects rows
    of the module-level ``df`` by ``slide_id``, fits a ``VotingRegressor``
    (random forest + elastic net + XGBoost) on the ``prots`` feature columns
    against ``label``, and scores its continuous predictions with
    ``roc_auc_score``.

    The split roles are deliberately swapped: the model is fitted on the slides
    listed in the split file's ``"test"`` column and evaluated on the slides
    listed in its ``"train"`` column. The ``"val"`` column is not used.

    Parameters
    ----------
    split_folder_path : str
        Directory containing the split CSV files. Each file must provide
        ``"train"`` and ``"test"`` columns of slide identifiers.
    random_state : int or None, optional
        Seed forwarded to the random forest and XGBoost regressors. The default
        ``None`` leaves them unseeded, which matches the previous behaviour.

    Returns
    -------
    list of float
        One ROC AUC per split file, ordered by sorted file name.
    """
    rf_test_aucs = []

    # os.listdir yields entries in arbitrary order; sorting keeps entry i of
    # the returned list tied to the same split file on every run and machine.
    for filename in sorted(os.listdir(split_folder_path)):
        if not filename.endswith('.csv'):
            continue

        split_df = pd.read_csv(os.path.join(split_folder_path, filename))

        # Intentional role swap: fit on the split's "test" slides and evaluate
        # on its "train" slides.
        train_df = df[df['slide_id'].isin(split_df["test"].to_list())]
        test_df = df[df['slide_id'].isin(split_df["train"].to_list())]

        X_train, y_train = train_df[prots], train_df['label']
        X_test, y_test = test_df[prots], test_df['label']

        # Average of three regressors fitted on the labels; the averaged
        # output is used as a ranking score for the AUC, not as a class label.
        ensemble = VotingRegressor(
            estimators=[
                ('random_forest', RandomForestRegressor(random_state=random_state)),
                ('elastic_net', ElasticNet()),
                ('xgboost', XGBRegressor(random_state=random_state)),
            ]
        )
        ensemble.fit(X_train, y_train)

        scores = ensemble.predict(X_test)
        rf_test_aucs.append(roc_auc_score(y_test, scores))

    return rf_test_aucs


# Score the ensemble on the TCGA primary split folder.
split_folder_path = '/mnt/ncshare/ozkilim/BRCA/splits/TCGA_primary'
primary_aucs = get_rf_auc_list(split_folder_path)
print(primary_aucs)

# Same procedure on the TCGA metastatic split folder.
split_folder_path = '/mnt/ncshare/ozkilim/BRCA/splits/TCGA_metastatic'  # working very poorly here...?
metastatic_aucs = get_rf_auc_list(split_folder_path)
print(metastatic_aucs)

# Despite its name, this dict carries the AUC lists of both split folders.
metastatic_results = {
    "primary_aucs": primary_aucs,
    "metastatic_aucs": metastatic_aucs,
}

# Persist both AUC lists; the file name is suffixed with `tissue_type`.
results_json_path = (
    '../results_analysis/TCGA_train_HGSOC_Test_classical_omics_results_'
    + tissue_type
    + '.json'
)
with open(results_json_path, 'w') as json_file:
    json.dump(metastatic_results, json_file)



[0.7100547536087606, 0.7033527696793003, 0.6703081232492998, 0.6690514043455219, 0.664147993746743]
[0.5418935902806871, 0.533922697368421, 0.5753164556962025, 0.4875998318621269, 0.5301668806161746]
