In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, roc_auc_score
!pip install xgboost
import numpy as np
import os 
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from sklearn.ensemble import VotingClassifier
import json



### Test proteomic data with 3-way (train/val/test) splits

In [2]:
# Tissue cohort under analysis; this string is spliced into the data file name below.
tissue_type = "metastatic"

# Load the task table for the chosen tissue type (the file name indicates it
# holds protein and phospho data). The first row is used as the header.
df = pd.read_csv(
    "/mnt/ncshare/ozkilim/BRCA/data/tasks/HGSOC_multimodal_" + tissue_type + "_protein_and_phospho.csv",
    header=0,
    low_memory=False,
)

# Display the label column as the cell output.
df["label"]

0      0
1      0
2      1
3      1
4      1
      ..
169    0
170    1
171    1
172    0
173    0
Name: label, Length: 174, dtype: int64

In [3]:
# Empty placeholder list; nothing in this cell adds to it.
omics_features = []

# Read the "Phospho_predictors" sheet of the workbook and keep its
# "Phospho predictors" column as a plain Python list.
phospo_prots = pd.read_excel(
    "/mnt/ncshare/ozkilim/BRCA/data/HGSOC_processed_data/mmc3.xlsx",
    sheet_name="Phospho_predictors",
)
phospho_features = phospo_prots["Phospho predictors"].tolist()

# Hand-picked protein names grouped under a category label.
# The dict's insertion order determines the order of the flattened list below.
protein_categories = {
    "Drug Metabolism & Biological Oxidation": ["TPMT"],
    "Hemostasis": ["CARMIL1", "CCDC167"],
    "Metabolic": [
        "TALDO1", "COX7A2", "LGALS1", "S100A10", "ACADSB", "COX6C", "COX7C",
        "CA2", "GPX1", "GPX4", "LDHA", "NDUFB3", "ATP6V1D", "ACOT7", "HACL1",
        "CPOX", "PTGES2", "GLUD1", "COX6A1", "LTA4H", "CASP7", "IL4I1", "PECR",
        "YWHAG", "IDI1", "AIFM1", "NBN", "HADH", "PLIN2", "FDX1", "NCAPH2", "IDH1", "ABCB8",
    ],
    "Hypoxia": [
        "TGM2", "SENP1", "RAB25", "CDKN1B", "EGFR", "CDKN1A", "RHOA", "NFKB1",
        "PDK1", "RPS6KB2", "TFRC", "STAT3", "ARNT", "CAMK2D",
    ],
    "NF-kB": [
        "RELA", "ATM", "BCL2L1", "BIRC2", "VCAM1", "NFKB2", "KEAP1", "RIPK1", "MTDH",
        "CHUK", "MYD88", "GOLPH3L", "TOP3B", "XIAP",
    ],
    # "phospho": phospho_features
}

# Flatten the per-category lists into a single list of protein names,
# keeping category order and the order within each category.
prots = [
    protein
    for category_members in protein_categories.values()
    for protein in category_members
]

print(prots)

['TPMT', 'CARMIL1', 'CCDC167', 'TALDO1', 'COX7A2', 'LGALS1', 'S100A10', 'ACADSB', 'COX6C', 'COX7C', 'CA2', 'GPX1', 'GPX4', 'LDHA', 'NDUFB3', 'ATP6V1D', 'ACOT7', 'HACL1', 'CPOX', 'PTGES2', 'GLUD1', 'COX6A1', 'LTA4H', 'CASP7', 'IL4I1', 'PECR', 'YWHAG', 'IDI1', 'AIFM1', 'NBN', 'HADH', 'PLIN2', 'FDX1', 'NCAPH2', 'IDH1', 'ABCB8', 'TGM2', 'SENP1', 'RAB25', 'CDKN1B', 'EGFR', 'CDKN1A', 'RHOA', 'NFKB1', 'PDK1', 'RPS6KB2', 'TFRC', 'STAT3', 'ARNT', 'CAMK2D', 'RELA', 'ATM', 'BCL2L1', 'BIRC2', 'VCAM1', 'NFKB2', 'KEAP1', 'RIPK1', 'MTDH', 'CHUK', 'MYD88', 'GOLPH3L', 'TOP3B', 'XIAP']


In [4]:
# Re-run, this time using the protein subset and the split files.


def get_rf_auc_list(split_folder_path, random_state=None):
    """Fit the voting ensemble on every split in a folder and collect test ROC AUCs.

    For each ``.csv`` file in ``split_folder_path``, the file's ``train`` and
    ``test`` columns are read as lists of ``slide_id`` values. The matching
    rows of the module-level ``df`` are used to fit a ``VotingRegressor``
    (random forest + elastic net + XGBoost regressors) on the ``prots``
    feature columns with ``label`` as the target. The continuous predictions
    on the test rows are scored with ``roc_auc_score``.

    Despite the function name, the fitted model is the three-member ensemble,
    not a lone random forest.

    Parameters
    ----------
    split_folder_path : str
        Directory containing one CSV file per split. Files without a
        ``.csv`` suffix are ignored.
    random_state : int or None, optional
        Seed forwarded to the random forest and XGBoost members so that runs
        can be reproduced. The default ``None`` leaves them unseeded, which
        matches the previous behaviour.

    Returns
    -------
    list of float
        One test ROC AUC per split file, ordered by file name.

    Notes
    -----
    Errors raised by ``roc_auc_score`` (for example when the test rows of a
    split cannot be scored) are not caught here and propagate to the caller.
    """
    test_aucs = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # position of each AUC in the result reproducible across runs/machines.
    for filename in sorted(os.listdir(split_folder_path)):
        if not filename.endswith('.csv'):
            continue

        split_df = pd.read_csv(os.path.join(split_folder_path, filename))

        # Drop missing entries before matching: isin() treats NaN as equal to
        # NaN, so a NaN in a split column (presumably padding when the sets
        # differ in length) would otherwise pull any row with a missing
        # slide_id into both the train and the test subset.
        train_ids = split_df["train"].dropna().to_list()
        test_ids = split_df["test"].dropna().to_list()

        train_df = df[df['slide_id'].isin(train_ids)]
        test_df = df[df['slide_id'].isin(test_ids)]

        # Features are the selected protein columns; the target is the label column.
        X_train = train_df[prots]
        y_train = train_df['label']
        X_test = test_df[prots]
        y_test = test_df['label']

        # Averaging ensemble of three regressors fitted on the label.
        ensemble = VotingRegressor(
            estimators=[
                ('random_forest', RandomForestRegressor(random_state=random_state)),
                ('elastic_net', ElasticNet()),
                ('xgboost', XGBRegressor(random_state=random_state)),
            ]
        )
        ensemble.fit(X_train, y_train)

        # The regressors output continuous scores, which are passed to
        # roc_auc_score directly as the ranking scores.
        test_scores = ensemble.predict(X_test)
        test_aucs.append(roc_auc_score(y_test, test_scores))

    return test_aucs


# (result-key prefix, split-folder prefix) for each cohort, in the order the
# results are written to the JSON file.
site_runs = [
    ("UAB_aucs_", '/mnt/ncshare/ozkilim/BRCA/splits/UAB_'),
    ("FHCRC_aucs_", '/mnt/ncshare/ozkilim/BRCA/splits/FHCRC_'),
    ("Mayo_aucs_", '/mnt/ncshare/ozkilim/BRCA/splits/Mayo_'),
]

# Evaluate every cohort's split folder for the current tissue type.
metastatic_results = {}
for key_prefix, folder_prefix in site_runs:
    split_folder_path = folder_prefix + tissue_type
    metastatic_results[key_prefix + tissue_type] = get_rf_auc_list(split_folder_path)

# Keep the per-cohort lists available under their individual names as well.
UAB_aucs, FHCRC_aucs, Mayo_aucs = metastatic_results.values()

# Persist the AUC lists as JSON, one key per cohort.
results_path = '../results_analysis/classical_omics_results_' + tissue_type + '.json'
with open(results_path, 'w') as json_file:
    json.dump(metastatic_results, json_file)