In [43]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import torch as pt
import csv
import pickle #to save notebook at sessions


#from Bojar lab format
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

#set path for pickles to be saved in
pickle_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/python pickles/'

# model evaluation function
def model_evaluation(model, x, y):
#     print(f"Accuracy for 'PHA-L high' class: {100*(model.score(x[y==1], y[y==1])):>4f}%")
#     print(f"Accuracy for 'PHA-L low' class: {100*(model.score(x[y==0], y[y==0])):>4f}%")
#     print(f"Overall accuracy: {100*(model.score(x, y)):>4f}%")
    
    model_predict = model.predict(x)
    model_predict_prob = model.predict_proba(x)
    
    high = f"{100*(model.score(x[y==1], y[y==1])):>4f}%"
    low = f'{100*(model.score(x[y==0], y[y==0])):>4f}%'
    total = f'{100*(model.score(x, y)):>4f}%'

#     print(f"Average loss: {log_loss(y, model_predict_prob):>4f}")
#     print(f"ROC Curve AUC: {roc_auc_score(y, model_predict):>4f}")
#     print(f"F1 score: {f1_score(y, model_predict):>4f}")
    return high, low, total

# Testing LN data on TIL model

In [54]:
## load pickles of the TIL models 

#TIL all 
pickle_in = open(pickle_path + "TILmodel_all_robust.pkl","rb")
all_TILmodel = pickle.load(pickle_in)

# CD8 naive like
pickle_in = open(pickle_path + "CD8_NaiveLike_df_TILmodel.pkl","rb")
CD8_NaiveLike_df_TILmodel = pickle.load(pickle_in)

# CD8 effector memory like
pickle_in = open(pickle_path + "CD8_EffectorMemory_df_TILmodel.pkl","rb")
CD8_EffectorMemory_df_TILmodel = pickle.load(pickle_in)

# Th1
pickle_in = open(pickle_path + "Th1_df_TILmodel.pkl","rb")
Th1_df_TILmodel = pickle.load(pickle_in)

# CD8_EarlyActiv_df_TILmodel
pickle_in = open(pickle_path + "CD8_EarlyActiv_df_TILmodel.pkl","rb")
CD8_EarlyActiv_df_TILmodel = pickle.load(pickle_in)

# Treg_df_TILmodel
pickle_in = open(pickle_path + "Treg_df_TILmodel.pkl","rb")
Treg_df_TILmodel = pickle.load(pickle_in)

# CD8_Tex_df_TILmodel
pickle_in = open(pickle_path + "CD8_Tex_df_TILmodel.pkl","rb")
CD8_Tex_df_TILmodel = pickle.load(pickle_in)

# CD4_NaiveLike_df_TILmodel
pickle_in = open(pickle_path + "CD4_NaiveLike_df_TILmodel.pkl","rb")
CD4_NaiveLike_df_TILmodel = pickle.load(pickle_in)

# Tfh_df_TILmodel
pickle_in = open(pickle_path + "Tfh_df_TILmodel.pkl","rb")
Tfh_df_TILmodel = pickle.load(pickle_in)

# CD8_Tpex_df_TILmodel
pickle_in = open(pickle_path + "CD8_Tpex_df_TILmodel.pkl","rb")
CD8_Tpex_df_TILmodel = pickle.load(pickle_in)

#All TIL model
pickle_in = open(pickle_path + 'TILmodel_all_robust.pkl',"rb")
TILmodel_all_robust = pickle.load(pickle_in)

In [59]:
#pickle in LN data 

#full data
pickle_in = open(pickle_path +"robust_LN.pkl","rb")
robust_LN = pickle.load(pickle_in)

#split data 
pickle_in = open(pickle_path + "glycoscored_dict_LNsub.pkl","rb")
glycoscored_dict = pickle.load(pickle_in)

#remove housekeeping genes from full data 
housekeeping_list = ['Ahsa1', 'Api5', 'Atp6v1e1', 'Bcap31', 'Cops6', 'Csnk2b', 'Eif3i', 'Eif4g2', 'Gdi2', 'Hnrnpf', 
                     'Hnrnph1', 'Hnrnph2', 'Ilf2', 'Dnajc5', 'Ncl', 'Otub1', 'Pdap1', 'Polr2f', 'Rhoa', 'Srp14', 
                     'Srrm1', 'Timm44', 'Ttc1', 'Ywhab', 'Pdcd6']
cols_to_remove = set(robust_LN.columns).intersection(housekeeping_list)
robust_LN.drop(columns=cols_to_remove, inplace=True)

#add full data to dictionary containing split data
glycoscored_dict['all_LN_df'] = robust_LN #add all T-cells info into dictionary containing split dfs
data_names = list(glycoscored_dict.keys())

#initialize df to contain test info
cols = {"Model": [], "Input Data": [], "PHA-L high accuracy": [], "PHA-L low accuracy": [], 
        "Overall accuracy": [], 'Dimensions':[]}
total_dfs = pd.DataFrame(columns = cols)

In [60]:
'''
ITeratively test and make dataframe containing accuracy values of each model on each data type

input data: 
- glycoTIL_Z which is a dataframe of normalized counts of all T-cells from TIL
- glycoscored_dicts, a dictionary containing split normalized dfs
- models whose names are in all_models list 

output:
- dataframe containing columns/info for 

'''

def subtype_modeltesting(model, model_name):
    for name in data_names:
        ## generate dataframe from dictionary key
        sub_df = glycoscored_dict[name]
        
        ## test random forest
        #y: PHA-L score array
        y = sub_df['PHA-L'].values 
        encoder = LabelEncoder()
        y_binary = encoder.fit_transform(y)

        #X: glycogene transcript data array
        x = sub_df.iloc[:, :-3].values

        #test model
        high, low, total = model_evaluation(model, x, y_binary)

        #save all the info into dataframe
        #get dimension 
        dim = sub_df.shape
        total_dfs.loc[len(total_dfs.index)] = [model_name, name, high, low, total, dim]

In [61]:
subtype_modeltesting(CD8_NaiveLike_df_TILmodel, 'CD8_NaiveLike_df_TILmodel')
subtype_modeltesting(CD8_EffectorMemory_df_TILmodel, 'CD8_EffectorMemory_df_TILmodel')
subtype_modeltesting(Th1_df_TILmodel, 'Th1_df_TILmodel')
subtype_modeltesting(CD8_EarlyActiv_df_TILmodel, 'CD8_EarlyActiv_df_TILmodel')
subtype_modeltesting(Treg_df_TILmodel, 'Treg_df_TILmodel')
subtype_modeltesting(CD8_Tex_df_TILmodel, 'CD8_Tex_df_TILmodel')
subtype_modeltesting(CD4_NaiveLike_df_TILmodel, 'CD4_NaiveLike_df_TILmodel')
subtype_modeltesting(Tfh_df_TILmodel, 'Tfh_df_TILmodel')
subtype_modeltesting(CD8_Tpex_df_TILmodel, 'CD8_Tpex_df_TILmodel')
subtype_modeltesting(TILmodel_all_robust, 'all_TILmodel')

In [62]:
total_dfs.rename(columns = {'Input Data':'Input LN Data'}, inplace = True)
total_dfs = total_dfs.sort_values(by='Overall accuracy', ascending = False)
output_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/output/TIL output/'
total_dfs.to_csv(output_path+'LN data on TIL model accuracy.csv', index=False)

In [63]:
total_dfs

Unnamed: 0,Model,Input LN Data,PHA-L high accuracy,PHA-L low accuracy,Overall accuracy,Dimensions
93,all_TILmodel,CD8_Tpex_df,88.888889%,94.117647%,91.428571%,"(35, 239)"
82,CD8_Tpex_df_TILmodel,Treg_df,89.592760%,88.636364%,89.115646%,"(441, 239)"
42,Treg_df_TILmodel,Treg_df,89.592760%,88.636364%,89.115646%,"(441, 239)"
92,all_TILmodel,Treg_df,89.140271%,87.272727%,88.208617%,"(441, 239)"
52,CD8_Tex_df_TILmodel,Treg_df,73.303167%,98.636364%,85.941043%,"(441, 239)"
...,...,...,...,...,...,...
58,CD8_Tex_df_TILmodel,Tfh_df,9.090909%,95.238095%,51.162791%,"(43, 239)"
67,CD4_NaiveLike_df_TILmodel,CD4_NaiveLike_df,55.241935%,45.454545%,50.408163%,"(490, 239)"
57,CD8_Tex_df_TILmodel,CD4_NaiveLike_df,1.209677%,100.000000%,50.000000%,"(490, 239)"
64,CD4_NaiveLike_df_TILmodel,Th1_df,55.849057%,42.424242%,49.149338%,"(529, 239)"


# Testing TIL data on LN model

In [49]:
## load pickles of the LN models 

#LN all 
pickle_in = open(pickle_path + "LNmodel_all_robust.pkl","rb")
all_LNmodel = pickle.load(pickle_in)

# CD8 naive like
pickle_in = open(pickle_path + "CD8_NaiveLike_df_LNmodel.pkl","rb")
CD8_NaiveLike_df_LNmodel = pickle.load(pickle_in)

# CD8 effector memory like
pickle_in = open(pickle_path + "CD8_EffectorMemory_df_LNmodel.pkl","rb")
CD8_EffectorMemory_df_LNmodel = pickle.load(pickle_in)

# Th1
pickle_in = open(pickle_path + "Th1_df_LNmodel.pkl","rb")
Th1_df_LNmodel = pickle.load(pickle_in)

# CD8_EarlyActive
pickle_in = open(pickle_path + "CD8_EarlyActiv_df_LNmodel.pkl","rb")
CD8_EarlyActiv_df_LNmodel = pickle.load(pickle_in)

# Treg
pickle_in = open(pickle_path + "Treg_df_LNmodel.pkl","rb")
Treg_df_LNmodel = pickle.load(pickle_in)

# CD8_Tex
pickle_in = open(pickle_path + "CD8_Tex_df_LNmodel.pkl","rb")
CD8_Tex_df_LNmodel = pickle.load(pickle_in)

# CD4_NaiveLike
pickle_in = open(pickle_path + "CD4_NaiveLike_df_LNmodel.pkl","rb")
CD4_NaiveLike_df_LNmodel = pickle.load(pickle_in)

# Tfh
pickle_in = open(pickle_path + "Tfh_df_LNmodel.pkl","rb")
Tfh_df_LNmodel = pickle.load(pickle_in)

# CD8_Tpex
pickle_in = open(pickle_path + "CD8_Tpex_df_LNmodel.pkl","rb")
CD8_Tpex_df_LNmodel = pickle.load(pickle_in)


In [50]:
#pickle in the TIL data 

#full data
pickle_in = open(pickle_path +"robust_TIL.pkl","rb")
robust_TIL = pickle.load(pickle_in)

#split data 
pickle_in = open(pickle_path + "glycoscored_dict_TILsub.pkl","rb")
glycoscored_dict = pickle.load(pickle_in)

#remove housekeeping genes from full data 
housekeeping_list = ['Ahsa1', 'Api5', 'Atp6v1e1', 'Bcap31', 'Cops6', 'Csnk2b', 'Eif3i', 'Eif4g2', 'Gdi2', 'Hnrnpf', 
                     'Hnrnph1', 'Hnrnph2', 'Ilf2', 'Dnajc5', 'Ncl', 'Otub1', 'Pdap1', 'Polr2f', 'Rhoa', 'Srp14', 
                     'Srrm1', 'Timm44', 'Ttc1', 'Ywhab', 'Pdcd6']
cols_to_remove = set(robust_TIL.columns).intersection(housekeeping_list)
robust_TIL.drop(columns=cols_to_remove, inplace=True)

#add full data to dictionary containing split data
glycoscored_dict['all_TILs_df'] = robust_TIL #add all T-cells info into dictionary containing split dfs
data_names = list(glycoscored_dict.keys())

#initialize df to contain test info
cols = {"Model": [], "Input Data": [], "PHA-L high accuracy": [], "PHA-L low accuracy": [], 
        "Overall accuracy": [], 'Dimensions':[]}
total_dfs = pd.DataFrame(columns = cols)

In [51]:
subtype_modeltesting(CD8_NaiveLike_df_LNmodel, 'CD8_NaiveLike_df_LNmodel')
subtype_modeltesting(CD8_EffectorMemory_df_LNmodel, 'CD8_EffectorMemory_df_LNmodel')
subtype_modeltesting(Th1_df_LNmodel, 'Th1_df_LNmodel')
subtype_modeltesting(CD8_EarlyActiv_df_LNmodel, 'CD8_EarlyActiv_df_LNmodel')
subtype_modeltesting(Treg_df_LNmodel, 'Treg_df_LNmodel')
subtype_modeltesting(CD8_Tex_df_LNmodel, 'CD8_Tex_df_LNmodel')
subtype_modeltesting(CD4_NaiveLike_df_LNmodel, 'CD4_NaiveLike_df_TILmodel')
subtype_modeltesting(Tfh_df_LNmodel, 'Tfh_df_LNmodel')
subtype_modeltesting(CD8_Tpex_df_LNmodel, 'CD8_Tpex_df_LNmodel')
subtype_modeltesting(all_LNmodel, 'all_LNLmodel')

In [53]:
total_dfs.rename(columns = {'Input Data':'Input TIL Data'}, inplace = True)
total_dfs=total_dfs.sort_values(by='Overall accuracy', ascending = False)
output_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/output/LN output/'
total_dfs.to_csv(output_path+'TIL data on LN model accuracy.csv', index=False)

In [42]:
total_dfs.sort_values(by='Overall accuracy', ascending = False)

Unnamed: 0,Model,Input TIL Data,PHA-L high accuracy,PHA-L low accuracy,Overall accuracy,Dimensions
94,Treg_df_LNmodel,Treg_df_LN,80.366226%,90.573770%,85.451761%,"(1959, 239)"
184,Treg_df_LNmodel,Treg_df_LN,80.366226%,90.573770%,85.451761%,"(1959, 239)"
284,Treg_df_LNmodel,Treg_df_LN,80.366226%,90.573770%,85.451761%,"(1959, 239)"
44,Treg_df_LNmodel,Treg_df_LN,80.366226%,90.573770%,85.451761%,"(1959, 239)"
324,CD8_Tpex_df_LNmodel,Treg_df_LN,71.820956%,93.340164%,82.542113%,"(1959, 239)"
...,...,...,...,...,...,...
16,CD8_EffectorMemory_df_LNmodel,CD4_NaiveLike_df_LN,4.477612%,93.939394%,48.872180%,"(133, 239)"
156,CD8_EffectorMemory_df_LNmodel,CD4_NaiveLike_df_LN,4.477612%,93.939394%,48.872180%,"(133, 239)"
206,CD4_NaiveLike_df_TILmodel,CD4_NaiveLike_df_LN,10.447761%,86.363636%,48.120301%,"(133, 239)"
306,CD4_NaiveLike_df_TILmodel,CD4_NaiveLike_df_LN,10.447761%,86.363636%,48.120301%,"(133, 239)"
