In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as plt
import torch as pt
import csv
import pickle #to save notebook at sessions


#from Bojar lab format
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

#set path for pickles to be saved in
pickle_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/python pickles/'

# Random Forest Clustering for combo, separated by different normalization methods (T-cells from LN and TILs) and compare the different normalization methods

## Set up:
- open pickle
- define model_evaluation
- set parameters

In [8]:
# open raw combo dataframe
#load updated df from pickle
pickle_in = open(pickle_path +"combo_raw.pkl","rb")
combo_raw = pickle.load(pickle_in)

In [4]:
def model_evaluation(model, x, y):
    print(f"Accuracy for 'PHA-L high' class: {100*(model.score(x[y==1], y[y==1])):>4f}%")
    print(f"Accuracy for 'PHA-L low' class: {100*(model.score(x[y==0], y[y==0])):>4f}%")
    print(f"Overall accuracy: {100*(model.score(x, y)):>4f}%")
    high_accuracy = 100*(model.score(x[y==1], y[y==1]))
    low_accuracy = 100*(model.score(x[y==0], y[y==0]))
    overall_accuracy = 100*(model.score(x, y))
        
    model_predict = model.predict(x)
    model_predict_prob = model.predict_proba(x)

    print(f"Average loss: {log_loss(y, model_predict_prob):>4f}")
    print(f"ROC Curve AUC: {roc_auc_score(y, model_predict):>4f}")
    print(f"F1 score: {f1_score(y, model_predict):>4f}")
    return high_accuracy, low_accuracy, overall_accuracy

In [5]:
# Parameters for grid search

# Number of trees in random forest
n_estimators = [int(x) for x in np.arange(200, 800, step=100)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.arange(10, 50, step=10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder


## Split up combo by T-cell subtype (make dictionary)

In [9]:
'''
Now want to split up glycosorted dataframe into sub-dataframes for
each of the different T-cell subtypes. TO do this, this code block takes in the glycosorted
dataframe data as the input and spits out a dictionaryt containing:
- t-cell type as key 
- matrix containing raw expression data for dataframe as values

'''
# Get list of T-cell subtypes for which to make a dataframe for TILs
tcell_subtypes = combo_raw['Type'].unique()

#make dataframe names for later access
df_names = [i+'_df' for i in list(tcell_subtypes)]

# Make copy of original df just in case
split_df = combo_raw.copy()

combotcell_dfs = {}

# Make separate dataframe containing data for each t-cell subtype
for cell_type, name in zip(tcell_subtypes, df_names):
    combotcell_dfs[name] = split_df[split_df['Type'] == cell_type]

### Standard scaler

In [16]:
# make dataframe to store subtype's overall accuracy scores 
cols = {"Cell type": [], "PHA-L high accuracy": [], "PHA-L low accuracy": [], 
        "Overall accuracy": [], 'Dimensions':[], 'Top 5 features':[]}
standard_accuracy = pd.DataFrame(columns = cols)

# for loop to add values for each t-cell subtype into dataframe
subtype_names = list(combotcell_dfs.keys())

for name in subtype_names:
    '''
    Generate training, validation, and test set from original full df using scikit learn 
    TIL ver.
    '''
    ## generate dataframe from dictionary key
    sub_df = combotcell_dfs[name]
    
    ###NORMALIZE DATA using robust scaler
    
    ### NORMALIZE COLUMNS ###
    toy_df = sub_df.transpose()
    toy_df2 = toy_df.iloc[:-4] #takes out the non-gene expression columns
    cols_to_standardize = toy_df2.columns    
    scaler = StandardScaler()
    toy_df2[cols_to_standardize] = scaler.fit_transform(toy_df2[cols_to_standardize])

    ### ADD BACK Cell type, Biotin values, and L-PHA scores ###
    scores_to_append = toy_df.iloc[-4:]
    df = pd.concat([toy_df2, scores_to_append])
    df = df.transpose()
    
    ## start random forest here
    #y: PHA-L score array
    y = df['PHA-L'].values 

    #X: glycogene transcript data array
    x = df.iloc[:, :-4].values

    # Split training, validation and test set
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        x, y, test_size=0.1, random_state=42, stratify=y)

    x_train, x_val, y_train, y_val = train_test_split(
        x_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)
    
    #encode all values as numbers not anything else 
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.fit_transform(y_val)
    
    ''''Use RandomSearchCV to optimize hyperparameters'''
    
    #Generate model!!
    model = RandomForestClassifier()

    model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, 
                                      n_iter = 20, cv = 5, verbose=5, random_state=42, n_jobs = -1)

    model_random.fit(x_train, y_train)
    
    # Return the best estimator
    TILmodel_sub_robust = model_random.best_estimator_
    
    #train model
    model_evaluation(TILmodel_sub_robust, x_train, y_train)
    
    #test model
    high, low, total = model_evaluation(TILmodel_sub_robust, x_val, y_val)
    
    #extract top 5 important genes and append to dataframe 
    feature_importances = pd.DataFrame({'feature': sub_df.columns[:-3], 
                                        'importance': TILmodel_sub_robust.feature_importances_})

    # Sort the dataframe by importance score in descending order
    feature_importances = feature_importances.sort_values('importance', ascending=False)
    feature_importances['combo'] = feature_importances['feature'] + ': '+ feature_importances['importance'].round(4).astype(str)

    # make new column that combines feature name with its importance value, get top 5 as string to add to dictionary
    top5_df = feature_importances.head()
    top5 = str(list(top5_df['combo']))
    
    #save all the info into dataframe
    #get dimension 
    dim = sub_df.shape
    standard_accuracy.loc[len(subtype_accuracy.index)] = [name, high, low, total, dim, top5] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toy_df2[cols_to_standardize] = scaler.fit_transform(toy_df2[cols_to_standardize])


NameError: name 'LabelEncoder' is not defined

### MinMax scaler

In [None]:
# make dataframe to store subtype's overall accuracy scores 
cols = {"Cell type": [], "PHA-L high accuracy": [], "PHA-L low accuracy": [], 
        "Overall accuracy": [], 'Dimensions':[], 'Top 5 features':[]}
minmax_accuracy = pd.DataFrame(columns = cols)

# for loop to add values for each t-cell subtype into dataframe
subtype_names = list(combotcell_dfs.keys())

for name in subtype_names:
    '''
    Generate training, validation, and test set from original full df using scikit learn 
    TIL ver.
    '''
    ## generate dataframe from dictionary key
    sub_df = combotcell_dfs[name]
    
    ###NORMALIZE DATA using robust scaler
    
    ### NORMALIZE COLUMNS ###
    toy_df = sub_df.transpose()
    toy_df2 = toy_df.iloc[:-4] #takes out the non-gene expression columns
    cols_to_standardize = toy_df2.columns    
    scaler = MinMaxScaler()
    toy_df2[cols_to_standardize] = scaler.fit_transform(toy_df2[cols_to_standardize])

    ### ADD BACK Cell type, Biotin values, and L-PHA scores ###
    scores_to_append = toy_df.iloc[-4:]
    df = pd.concat([toy_df2, scores_to_append])
    df = df.transpose()
    
    ## start random forest here
    #y: PHA-L score array
    y = df['PHA-L'].values 

    #X: glycogene transcript data array
    x = df.iloc[:, :-4].values

    # Split training, validation and test set
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        x, y, test_size=0.1, random_state=42, stratify=y)

    x_train, x_val, y_train, y_val = train_test_split(
        x_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)
    
    #encode all values as numbers not anything else 
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.fit_transform(y_val)
    
    ''''Use RandomSearchCV to optimize hyperparameters'''
    
    #Generate model!!
    model = RandomForestClassifier()

    model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, 
                                      n_iter = 20, cv = 5, verbose=5, random_state=42, n_jobs = -1)

    model_random.fit(x_train, y_train)
    
    # Return the best estimator
    TILmodel_sub_robust = model_random.best_estimator_
    
    #train model
    model_evaluation(TILmodel_sub_robust, x_train, y_train)
    
    #test model
    high, low, total = model_evaluation(TILmodel_sub_robust, x_val, y_val)
    
    #extract top 5 important genes and append to dataframe 
    feature_importances = pd.DataFrame({'feature': sub_df.columns[:-3], 
                                        'importance': TILmodel_sub_robust.feature_importances_})

    # Sort the dataframe by importance score in descending order
    feature_importances = feature_importances.sort_values('importance', ascending=False)
    feature_importances['combo'] = feature_importances['feature'] + ': '+ feature_importances['importance'].round(4).astype(str)

    # make new column that combines feature name with its importance value, get top 5 as string to add to dictionary
    top5_df = feature_importances.head()
    top5 = str(list(top5_df['combo']))
    
    #save all the info into dataframe
    #get dimension 
    dim = sub_df.shape
    minmax_accuracy.loc[len(subtype_accuracy.index)] = [name, high, low, total, dim, top5] 
    
    
    

In [None]:
# make dataframe to store subtype's overall accuracy scores 
cols = {"Cell type": [], "PHA-L high accuracy": [], "PHA-L low accuracy": [], 
        "Overall accuracy": [], 'Dimensions':[], 'Top 5 features':[]}
robust_accuracy = pd.DataFrame(columns = cols)

# for loop to add values for each t-cell subtype into dataframe
subtype_names = list(combotcell_dfs.keys())

for name in subtype_names:
    '''
    Generate training, validation, and test set from original full df using scikit learn 
    TIL ver.
    '''
    ## generate dataframe from dictionary key
    sub_df = combotcell_dfs[name]
    
    ###NORMALIZE DATA using robust scaler
    
    ### NORMALIZE COLUMNS ###
    toy_df = sub_df.transpose()
    toy_df2 = toy_df.iloc[:-4] #takes out the non-gene expression columns
    cols_to_standardize = toy_df2.columns    
    scaler = RobustScaler()
    toy_df2[cols_to_standardize] = scaler.fit_transform(toy_df2[cols_to_standardize])

    ### ADD BACK Cell type, Biotin values, and L-PHA scores ###
    scores_to_append = toy_df.iloc[-4:]
    df = pd.concat([toy_df2, scores_to_append])
    df = df.transpose()
    
    ## start random forest here
    #y: PHA-L score array
    y = df['PHA-L'].values 

    #X: glycogene transcript data array
    x = df.iloc[:, :-4].values

    # Split training, validation and test set
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        x, y, test_size=0.1, random_state=42, stratify=y)

    x_train, x_val, y_train, y_val = train_test_split(
        x_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)
    
    #encode all values as numbers not anything else 
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.fit_transform(y_val)
    
    ''''Use RandomSearchCV to optimize hyperparameters'''
    
    #Generate model!!
    model = RandomForestClassifier()

    model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, 
                                      n_iter = 20, cv = 5, verbose=5, random_state=42, n_jobs = -1)

    model_random.fit(x_train, y_train)
    
    # Return the best estimator
    TILmodel_sub_robust = model_random.best_estimator_
    
    #train model
    model_evaluation(TILmodel_sub_robust, x_train, y_train)
    
    #test model
    high, low, total = model_evaluation(TILmodel_sub_robust, x_val, y_val)
    
    #extract top 5 important genes and append to dataframe 
    feature_importances = pd.DataFrame({'feature': sub_df.columns[:-3], 
                                        'importance': TILmodel_sub_robust.feature_importances_})

    # Sort the dataframe by importance score in descending order
    feature_importances = feature_importances.sort_values('importance', ascending=False)
    feature_importances['combo'] = feature_importances['feature'] + ': '+ feature_importances['importance'].round(4).astype(str)

    # make new column that combines feature name with its importance value, get top 5 as string to add to dictionary
    top5_df = feature_importances.head()
    top5 = str(list(top5_df['combo']))
    
    #save all the info into dataframe
    #get dimension 
    dim = sub_df.shape
    robust_accuracy.loc[len(subtype_accuracy.index)] = [name, high, low, total, dim, top5] 
    
    
    