In [39]:
import pandas as pd
import numpy as np
import scipy as sp
import csv
import pickle #to save notebook at sessions
import matplotlib.pyplot as plt
import ast 

from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

#from Bojar lab format
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

# Directories intended for saved pickles and other outputs.
# NOTE(review): these are absolute, machine-specific paths, and the visible
# cells read/write their pickles and CSV with bare relative filenames rather
# than under these directories — confirm whether they are still needed.
pickle_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/python pickles/'
output_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/output/'

# Defining key functions


In [40]:
# Hyperparameter search space handed to RandomizedSearchCV.

# Forest sizes to try: 200, 300, ..., 700 trees.
n_estimators = list(range(200, 800, 100))
# Depth caps to try: 10, 20, 30, 40, plus None (no cap on tree depth).
max_depth = [*range(10, 50, 10), None]
# Smallest number of samples a node needs before it may be split.
min_samples_split = [2, 5]
# Smallest number of samples allowed in a leaf.
min_samples_leaf = [1, 2]
# Parameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [41]:
def model_evaluation(model, x, y):
    """Evaluate a fitted binary classifier on one labelled dataset.

    Args:
        model: fitted classifier providing ``predict`` and ``predict_proba``.
        x: feature matrix accepted by ``model.predict``.
        y: true labels, encoded as 0 ('PHA-L low') and 1 ('PHA-L high').

    Returns:
        Tuple ``(high, low, total, avg_loss, roc_auc, f1)`` where
        - high, low, total: accuracy on the ``y == 1`` samples, the ``y == 0``
          samples and all samples, each formatted as a percentage string
          ("nan%" if the corresponding subset is empty);
        - avg_loss: log loss of the predicted class probabilities;
        - roc_auc: ROC AUC of the predicted probability of class 1;
        - f1: F1 score of the hard predictions.
    """
    y = np.asarray(y)
    model_predict = np.asarray(model.predict(x))
    model_predict_prob = model.predict_proba(x)

    # Reuse one set of predictions for all three accuracies instead of calling
    # model.score() three times, which would re-run prediction each time.
    correct = model_predict == y

    def as_percent(hits):
        return f"{100 * np.mean(hits):>4f}%"

    high = as_percent(correct[y == 1])
    low = as_percent(correct[y == 0])
    total = as_percent(correct)

    avg_loss = log_loss(y, model_predict_prob)
    # ROC AUC ranks samples by score, so it must be given the probability of
    # the positive class (column 1); hard 0/1 predictions reduce the curve to
    # a single operating point and understate the model's ranking ability.
    roc_auc = roc_auc_score(y, model_predict_prob[:, 1])
    f1 = f1_score(y, model_predict)
    return high, low, total, avg_loss, roc_auc, f1

In [42]:
def _split_features_labels(frame):
    """Split one dataframe into (feature DataFrame, 'PHA-L' label array)."""
    labels = frame['PHA-L'].values
    # 'PHA-L' is the target and 'Type'/'Biotin' are not used as features;
    # each is dropped only if present.
    features = frame.drop(columns=['PHA-L', 'Type', 'Biotin'], errors='ignore')
    return features, labels


def train_and_evaluate(dictionary, index, name, df, boo, i=42):
    """Train a random forest on one split, test it on the others, and log the result.

    Hyperparameters are tuned with RandomizedSearchCV over the module-level
    ``random_grid`` (this takes a while to run), and the best estimator is
    evaluated with ``model_evaluation`` on the held-out splits.

    Input:
    - dictionary: dict of dataframes, each with a binary 'PHA-L' column,
      optional 'Type' and 'Biotin' columns, and gene transcript columns
    - index: key of the dataframe to train on (e.g. '1', '2', '3' as strings);
      every other dataframe in the dictionary is concatenated into the test set
    - name: model name (str); also the stem of the pickle file ``name + '.pkl'``
    - df: dataframe that receives one appended row of model statistics
      (mutated in place)
    - boo: True to save the best estimator as a pickle, False to skip saving
    - i: random state used for both the random forest and the randomized
      search (default 42)

    Output:
    - high, low, total: accuracy strings of the best estimator on the test set
    - mod: the best estimator
    - feature_importances: dataframe of features sorted by importance
      (columns 'feature', 'importance', 'combo')

    Raises:
    - ValueError: if the dictionary holds no dataframe other than ``index``
    """
    # Training set: the selected split.
    train_features, y_train = _split_features_labels(dictionary[index])
    x_train = train_features.values

    # Test set: every split that was not used for training.
    held_out = [frame for key, frame in dictionary.items() if key != index]
    if not held_out:
        raise ValueError("dictionary must contain at least one dataframe besides the training split")
    test_features, y_test = _split_features_labels(pd.concat(held_out))
    x_test = test_features.values

    # Encode labels using the classes seen in the training split.
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Seed the forest as well as the search; seeding only the search leaves
    # the fitted trees (and every reported metric) different on each run.
    model = RandomForestClassifier(random_state=i)
    model_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                      n_iter=20, cv=5, verbose=5, random_state=i, n_jobs=-1)
    model_random.fit(x_train, y_train)
    mod = model_random.best_estimator_

    if boo:
        # The with-block closes the file; no explicit close() is needed.
        with open(name + '.pkl', 'wb') as f:
            pickle.dump(mod, f)

    high, low, total, avg_loss, roc_auc, f1 = model_evaluation(mod, x_test, y_test)

    # Rank features by importance and build a "feature: importance" label.
    feature_importances = pd.DataFrame({'feature': train_features.columns,
                                        'importance': mod.feature_importances_})
    feature_importances = feature_importances.sort_values('importance', ascending=False)
    feature_importances['combo'] = (feature_importances['feature'] + ': '
                                    + feature_importances['importance'].round(4).astype(str))

    # NOTE(review): 26 features are recorded although the callers label the
    # column 'Top 5 features'; presumably ~10% of the feature count — confirm.
    n_top = 26
    top10p = str(list(feature_importances.head(n_top)['combo']))

    # Append this model's statistics as a new row of the comparison dataframe.
    df.loc[len(df.index)] = [name, high, low, total, top10p, avg_loss, roc_auc, f1]

    return high, low, total, mod, feature_importances

# # load a pickled model via:
# with open(pickle_path + "TILmodel_all_robust.pkl", "rb") as pickle_in:
#     TILmodel_all_robust = pickle.load(pickle_in)

# Random Forest Classifier using all T-cells and all genes
Referenced:
https://github.com/BojarLab/scGlycomics_b16_branching/blob/main/Random%20Forest%20-%20Apr%208%202022%20-%20RQ.ipynb

The full dataframe, containing all genes and all T cells, was split by cell into three equally sized datasets.
For each third in turn, a model was trained on that third and tested on the remaining two thirds.

In [11]:
# Load the pickled TIL and LN split dictionaries. Per the author's notes the
# pickled data contains:
# - robust normalized gene expression counts
# - glycoscores
# - all genes + type + biotin + L-PHA in columns
# - cell barcodes of T-cells in rows
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Context managers make sure each file handle is closed after loading.
with open("TILglyconorm_split.pkl", "rb") as pickle_in:
    TILglyconorm_split = pickle.load(pickle_in)

with open("LNglyconorm_split.pkl", "rb") as pickle_in:
    LNglyconorm_split = pickle.load(pickle_in)

In [39]:
#10595 cells for LNs
#9824 cells for TILs

In [None]:
# Empty comparison table; one row of statistics is appended per trained model.
cols = {
    column_name: []
    for column_name in (
        "Model",
        "PHA-L high accuracy",
        "PHA-L low accuracy",
        "Overall accuracy",
        "Top 5 features",
        "Average loss",
        "ROC AUC score",
        "F1 score",
    )
}
full_comp = pd.DataFrame(columns=list(cols))

## TILs

In [25]:
# Train on TIL split '1', test on the other splits; stats are appended to full_comp.
high, low, total, mod_TIL1, TILfeats_1 = train_and_evaluate(
    dictionary=TILglyconorm_split,
    index='1',
    name='allgenes_splitTIL_1',
    df=full_comp,
    boo=True,
)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.916 total time=  25.5s
[CV 4/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.845 total time=  28.8s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.902 total time=  25.1s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.827 total time=  29.4s
[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.898 total time=  30.8s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.905 total time=  28.7s
[CV 2/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.920 total time=  39.5s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.850 total time=  24.9s
[CV 1/5] END max_depth=30, min_samples_leaf=1, min_sampl

In [26]:
# Persist the split-1 TIL model and its feature-importance table.
# Each with-block closes its file on exit, so no explicit close() is needed.
with open('fullTILmod_1.pkl', 'wb') as f:
    pickle.dump(mod_TIL1, f)
with open('fullTILfeats_1.pkl', 'wb') as f:
    pickle.dump(TILfeats_1, f)

In [27]:
# Train on TIL split '2', test on the other splits; stats are appended to full_comp.
high, low, total, mod_TIL2, TILfeats_2 = train_and_evaluate(
    dictionary=TILglyconorm_split,
    index='2',
    name='allgenes_splitTIL_2',
    df=full_comp,
    boo=True,
)

[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.899 total time=  41.0s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.838 total time=  22.7s
[CV 3/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.902 total time=  24.5s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.893 total time=  49.4s
[CV 4/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=400;, score=0.848 total time=  27.4s
[CV 2/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=700;, score=0.920 total time=  45.4s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.904 total time=  23.3s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.920 total time=  48.0s
[CV 5/5] END max_depth=20, min_samples_leaf=2, min



[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.887 total time=  32.3s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.842 total time=  29.1s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.891 total time=  25.8s
[CV 1/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.888 total time=  22.6s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.894 total time=  41.2s
[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.882 total time=  31.8s
[CV 4/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.893 total time=  30.7s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.917 total time=  29.1s
[CV 5/5] END max_depth=20, min_samples_leaf=1, min_sampl

In [28]:
# Persist the split-2 TIL model and its feature-importance table.
# Each with-block closes its file on exit, so no explicit close() is needed.
with open('fullTILmod_2.pkl', 'wb') as f:
    pickle.dump(mod_TIL2, f)
with open('fullTILfeats_2.pkl', 'wb') as f:
    pickle.dump(TILfeats_2, f)

In [29]:
# Train on TIL split '3', test on the other splits; stats are appended to full_comp.
high, low, total, mod_TIL3, TILfeats_3 = train_and_evaluate(
    dictionary=TILglyconorm_split,
    index='3',
    name='allgenes_splitTIL_3',
    df=full_comp,
    boo=True,
)

[CV 5/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=400;, score=0.854 total time=  27.2s
[CV 3/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=700;, score=0.890 total time=  49.7s
[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.893 total time=  24.7s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.928 total time=  30.2s
[CV 5/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.853 total time=  24.9s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.899 total time=  37.4s
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.896 total time=  49.5s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.887 tot



[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.877 total time=  30.9s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.922 total time=  25.1s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.837 total time=  30.6s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.885 total time=  30.7s
[CV 1/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.884 total time=  23.3s
[CV 5/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.848 total time=  47.8s
[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.888 total time=  26.6s
[CV 5/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=700;, score=0.853 total time=  56.2s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_sampl

In [25]:
# Save the ROC AUC column of the LN comparison table as a plain list.
# NOTE(review): `fullLN_comp` is not defined in any visible cell (the results
# table created earlier is `full_comp`) — confirm which frame is intended.
fullLN_ROCs = list(fullLN_comp['ROC AUC score'])
# The with-block closes the file on exit, so no explicit close() is needed.
with open('fullLN_ROCs.pkl', 'wb') as f:
    pickle.dump(fullLN_ROCs, f)

## LNs

In [43]:
# Train on LN split '1', test on the other splits; stats are appended to full_comp.
high, low, total, mod_LN1, LNfeats_1 = train_and_evaluate(
    dictionary=LNglyconorm_split,
    index='1',
    name='allgenes_splitLN_1',
    df=full_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.901 total time=  40.0s
[CV 5/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.827 total time=  39.3s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.863 total time= 1.0min
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.890 total time=  44.5s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.905 total time=  39.3s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.879 total time=  59.8s
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.903 total time=  44.7s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.810 total time=  45.6s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_sampl

IOStream.flush timed out


[CV 1/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.903 total time=  56.8s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.842 total time=  36.1s
[CV 4/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.857 total time=  41.0s
[CV 3/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.898 total time=  57.5s
[CV 4/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.868 total time= 1.0min
[CV 5/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.829 total time=  42.2s
[CV 5/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.832 total time=  57.2s
[CV 4/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.876 total time=  37.9s
[CV 3/5] END max_depth=40, min_samples_leaf=1, min_sampl

IOStream.flush timed out


[CV 2/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.877 total time=  34.7s
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.900 total time=  22.2s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.809 total time=  22.1s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.908 total time=  34.2s
[CV 4/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.842 total time=  22.0s
[CV 3/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.907 total time=  58.5s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.879 total time=  46.1s
[CV 4/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.866 total time= 1.0min
[CV 3/5] END max_depth=None, min_samples_leaf=1, min

In [44]:
# Persist the split-1 LN model and its feature-importance table.
# Each with-block closes its file on exit, so no explicit close() is needed.
with open('fullLNmod_1.pkl', 'wb') as f:
    pickle.dump(mod_LN1, f)

with open('fullLNfeats_1.pkl', 'wb') as f:
    pickle.dump(LNfeats_1, f)

In [45]:
# Train on LN split '2', test on the other splits; stats are appended to full_comp.
high, low, total, mod_LN2, LNfeats_2 = train_and_evaluate(
    dictionary=LNglyconorm_split,
    index='2',
    name='allgenes_splitLN_2',
    df=full_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


IOStream.flush timed out


[CV 4/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.870 total time=  36.2s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.898 total time=  35.0s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.860 total time=  22.0s
[CV 1/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.898 total time=  58.9s
[CV 5/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.839 total time=  59.5s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.868 total time=  46.2s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.871 total time=  39.5s
[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.901 total time= 1.0min
[CV 5/5] END max_depth=20, min_samples_leaf=2, min_s

IOStream.flush timed out


[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.819 total time=  42.8s
[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.871 total time=  35.9s
[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.898 total time=  41.2s
[CV 2/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.864 total time=  40.8s
[CV 5/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.830 total time=  41.1s
[CV 4/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=300;, score=0.864 total time=  37.2s
[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.900 total time=  41.8s
[CV 2/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.873 total time=  57.1s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_sampl

IOStream.flush timed out


[CV 5/5] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.806 total time=  21.6s
[CV 4/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=400;, score=0.859 total time=  42.7s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.893 total time=  48.6s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.873 total time=  25.3s
[CV 1/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=700;, score=0.904 total time= 1.1min
[CV 1/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.903 total time=  35.2s
[CV 5/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.844 total time=  34.9s


IOStream.flush timed out


[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.904 total time=  35.6s
[CV 5/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=700;, score=0.829 total time= 1.1min
[CV 4/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.859 total time=  34.8s
[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=0.866 total time=  35.3s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.857 total time=  21.7s
[CV 1/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.903 total time=  58.4s
[CV 5/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.836 total time= 1.0min
[CV 2/5] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=300;, score=0.880 total time=  34.6s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min

In [46]:
# Persist the split-2 LN model and its feature-importance table.
# Each with-block closes its file on exit, so no explicit close() is needed.
with open('fullLNmod_2.pkl', 'wb') as f:
    pickle.dump(mod_LN2, f)

with open('fullLNfeats_2.pkl', 'wb') as f:
    pickle.dump(LNfeats_2, f)

In [47]:
# Train on LN split '3', test on the other splits; stats are appended to full_comp.
high, low, total, mod_LN3, LNfeats_3 = train_and_evaluate(
    dictionary=LNglyconorm_split,
    index='3',
    name='allgenes_splitLN_3',
    df=full_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.888 total time=  22.6s
[CV 2/5] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.871 total time=  59.6s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.904 total time=  45.2s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400;, score=0.849 total time=  45.1s




[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.908 total time=  39.1s
[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.900 total time= 1.0min
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.874 total time=  38.9s
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.907 total time= 1.0min
[CV 3/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400;, score=0.908 total time=  39.8s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=700;, score=0.907 total time= 1.0min
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.904 total time=  43.4s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=600;, score=0.781 total time=  42.8s
[CV 5/5] END max_depth=20, min_samples_leaf=2, min_sampl

IOStream.flush timed out


[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=600;, score=0.904 total time=  42.9s
[CV 2/5] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.905 total time=  53.6s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.901 total time= 1.2min
[CV 1/5] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.900 total time=  21.5s
[CV 4/5] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.867 total time=  53.7s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.914 total time= 1.3min
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=700;, score=0.815 total time= 1.2min
[CV 1/5] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=0.901 total time=  53.9s
[CV 5/5] END max_depth=20, min_samples_leaf=2, min

In [48]:
# Persist the split-3 LN model and its feature-importance table.
# Each with-block closes its file on exit, so no explicit close() is needed.
with open('fullLNmod_3.pkl', 'wb') as f:
    pickle.dump(mod_LN3, f)

with open('fullLNfeats_3.pkl', 'wb') as f:
    pickle.dump(LNfeats_3, f)

# Random Forest Classifier using all T-cells but only glycogenes, with and without housekeeping genes
Referenced:
https://github.com/BojarLab/scGlycomics_b16_branching/blob/main/Random%20Forest%20-%20Apr%208%202022%20-%20RQ.ipynb

The full dataframe, containing all genes and all T cells, was split by cell into three equally sized datasets.
Each third was filtered for just the 214 glycogenes.
For each third in turn, a model was trained on that third and tested on the remaining two thirds.

In [57]:
# Load the glycosorted, normalized split dictionaries from pickle (saved from
# "glycogene filtering raw.ipynb"). Per the author's notes the dataframes have:
# - gene names as columns (241 glycogenes, housekeeping incl.) + columns for
#   type, biotin, PHA-L = 244 columns total
# - cell barcodes as row index, for cells identified as T-cells via the
#   ProjecTILs package in R
# - gene columns NORMALIZED via robust scaler, + 3 columns for info
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Context managers make sure each file handle is closed after loading.
with open("glycoTIL_normscored_split.pkl", "rb") as pickle_in:
    glycoTIL_normscored_split = pickle.load(pickle_in)

with open("glycoLN_normscored_split.pkl", "rb") as pickle_in:
    glycoLN_normscored_split = pickle.load(pickle_in)

In [62]:
# make dataframe comparing effect of housekeeping genes on prediction via glycogenes
cols = {
    column_name: []
    for column_name in (
        "Model",
        "PHA-L high accuracy",
        "PHA-L low accuracy",
        "Overall accuracy",
        "Top 5 features",
        "Average loss",
        "ROC AUC score",
        "F1 score",
    )
}
# Empty table; one row of statistics is appended per glycogene model.
hk_comp = pd.DataFrame(columns=list(cols))

In [81]:
# Remove housekeeping genes
housekeeping_list = ['Ahsa1', 'Api5', 'Atp6v1e1', 'Bcap31', 'Cops6', 'Csnk2b', 'Eif3i', 'Eif4g2', 'Gdi2', 'Hnrnpf', 
                     'Hnrnph1', 'Hnrnph2', 'Ilf2', 'Dnajc5', 'Ncl', 'Otub1', 'Pdap1', 'Polr2f', 'Rhoa', 'Srp14', 
                     'Srrm1', 'Timm44', 'Ttc1', 'Ywhab', 'Pdcd6']


def _without_housekeeping(split):
    """Return a new dict of dataframes with housekeeping and duplicate columns removed.

    The input dictionary and its dataframes are left untouched.
    """
    cleaned = {}
    for key, frame in split.items():
        cols_to_remove = frame.columns.intersection(housekeeping_list)
        # Non-mutating drop: dict.copy() is only a shallow copy, so dropping
        # in place would also strip these genes from the source dictionary's
        # dataframes and destroy the "with housekeeping" data.
        trimmed = frame.drop(columns=cols_to_remove)  # per the original note: 216 glycogenes remain
        # Keep only the first occurrence of any duplicated column name.
        cleaned[key] = trimmed.loc[:, ~trimmed.columns.duplicated()]
    return cleaned


glycoTIL_normscored_split_nohk = _without_housekeeping(glycoTIL_normscored_split)
glycoLN_normscored_split_nohk = _without_housekeeping(glycoLN_normscored_split)

In [84]:
# Glycogene-only model: train on TIL split '1'; stats are appended to hk_comp.
high, low, total, mod_glycoTIL1, glycoTILfeats_1 = train_and_evaluate(
    dictionary=glycoTIL_normscored_split_nohk,
    index='1',
    name='glyco_splitTIL_1',
    df=hk_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [85]:
# Glycogene-only model: train on TIL split '2'; stats are appended to hk_comp.
high, low, total, mod_glycoTIL2, glycoTILfeats_2 = train_and_evaluate(
    dictionary=glycoTIL_normscored_split_nohk,
    index='2',
    name='glyco_splitTIL_2',
    df=hk_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [86]:
# Glycogene-only model: train on TIL split '3'; stats are appended to hk_comp.
high, low, total, mod_glycoTIL3, glycoTILfeats_3 = train_and_evaluate(
    dictionary=glycoTIL_normscored_split_nohk,
    index='3',
    name='glyco_splitTIL_3',
    df=hk_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [87]:
# Glycogene-only model: train on LN split '1'; stats are appended to hk_comp.
high, low, total, mod_glycoLN1, glycoLNfeats_1 = train_and_evaluate(
    dictionary=glycoLN_normscored_split_nohk,
    index='1',
    name='glyco_splitLN_1',
    df=hk_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [88]:
# Glycogene-only model: train on LN split '2'; stats are appended to hk_comp.
high, low, total, mod_glycoLN2, glycoLNfeats_2 = train_and_evaluate(
    dictionary=glycoLN_normscored_split_nohk,
    index='2',
    name='glyco_splitLN_2',
    df=hk_comp,
    boo=True,
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [89]:
# Glycogene-only model: train on LN split '3'; stats are appended to hk_comp.
# The model is bound to mod_glycoLN3 (matching glycoLNfeats_3) so that it no
# longer overwrites the split-1 model held in mod_glycoLN1.
high, low, total, mod_glycoLN3, glycoLNfeats_3 = train_and_evaluate(glycoLN_normscored_split_nohk, '3',
                                                                    'glyco_splitLN_3', hk_comp, True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [91]:
# Pickle the model and feature-importance dataframe of the highest performing
# split for both TIL and LN.
# Each with-block closes its file on exit, so no explicit close() is needed.

# TILs
with open('mod_glycoTIL1.pkl', 'wb') as f:
    pickle.dump(mod_glycoTIL1, f)

with open('glycoTILfeats_1.pkl', 'wb') as f:
    pickle.dump(glycoTILfeats_1, f)

# LN
with open('mod_glycoLN2.pkl', 'wb') as f:
    pickle.dump(mod_glycoLN2, f)

with open('glycoLNfeats_2.pkl', 'wb') as f:
    pickle.dump(glycoLNfeats_2, f)

In [92]:
# Write the glycogene model comparison table to CSV, omitting the row index.
hk_comp.to_csv(
    'updatedTILandLNglyco_nohk.csv',
    index=False,
)