# Importing Packages and Datasets

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, cross_val_predict

In [4]:
dabigboy = pd.read_csv("base.xy.df.tsv", delimiter='\t')
dabigboy #the big dataset

KeyboardInterrupt: 

In [None]:
ensg = pd.read_csv("ensembl.tsv", delimiter='\t')
ensg #the holy grail - has all ensembl id's

In [None]:
panimmune = pd.read_csv('panimmune.tsv', delimiter='\t')
panimmune

In [None]:
duplicates_mask = panimmune['SetName'].duplicated(keep=False)

df_duplicates = panimmune[duplicates_mask]

gene_sets = df_duplicates['SetName'].unique()

print("Gene Sets:")
print(gene_sets)

# Training SVM Classifier Models With Various Hyper Parameters

In [6]:
from sklearn import svm

# Picking out a single set column and assigning it a variable name
gene_set_name = "HER2_Immune_PCA_18006808"
testing_ds = panimmune[panimmune['SetName'] == gene_set_name]

# Making ensembl filter
testing_df = pd.merge(ensg, testing_ds, left_on='symbol', right_on='Gene', how='inner')
testing_filter = testing_df.iloc[:, 1:2]

# Filtering the big df with the ensembl filter
ensemblIDs = set(testing_filter['id']) 
cols = list(dabigboy.columns)
newCols = [a for a in cols if a in ensemblIDs]
finalCols = cols[0:3]
finalCols.extend(newCols)      
testing_df = dabigboy[finalCols]

In [7]:
# Making the train/test split
df_train, df_test = train_test_split(testing_df, test_size=0.25)
x_train = df_train.drop(columns=["Immune Subtype", "TCGA Participant Barcode"]).values
y_train = df_train["Immune Subtype"].values
xt, xv, yt, yv = train_test_split(x_train, y_train, test_size=0.25)

# Making the cross fold validation with SVM model
svmModel = svm.SVC()
cross_val_score(svmModel, x_train, y_train, cv=5, scoring='accuracy')
accuracies = cross_val_score(svmModel, x_train, y_train, cv=5, scoring='accuracy')
print(accuracies)


[0.67209474 0.67703704 0.67407407 0.68888889 0.68740741]


In [22]:
# Define the parameter settings
svm_params = {
    'C': [0.2, 0.5, 1, 1.5, 2],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Calculate and store the mean accuracy
mean_accuracy = np.mean(accuracies)
print(f'{gene_set_name} accuracy =', mean_accuracy)

for C in svm_params['C']:
    for kernel in svm_params['kernel']:
        # Create and train the SVM model
        svmModel = svm.SVC(C=C, kernel=kernel)
        accuracies = cross_val_score(svmModel, x_train, y_train, cv=5, scoring='accuracy')
        
        # Print the results
        print(f"{gene_set_name} - C={C}, kernel={kernel} - accuracy =", np.mean(accuracies))

HER2_Immune_PCA_18006808 accuracy = 0.7865508676700388
HER2_Immune_PCA_18006808 - C=0.2, kernel=linear - accuracy = 0.7712925953340461
HER2_Immune_PCA_18006808 - C=0.2, kernel=poly - accuracy = 0.4950403816103298
HER2_Immune_PCA_18006808 - C=0.2, kernel=rbf - accuracy = 0.6010961427748993
HER2_Immune_PCA_18006808 - C=0.2, kernel=sigmoid - accuracy = 0.36839016366477506
HER2_Immune_PCA_18006808 - C=0.5, kernel=linear - accuracy = 0.7712925953340461
HER2_Immune_PCA_18006808 - C=0.5, kernel=poly - accuracy = 0.5332582175069221
HER2_Immune_PCA_18006808 - C=0.5, kernel=rbf - accuracy = 0.6514599336568249
HER2_Immune_PCA_18006808 - C=0.5, kernel=sigmoid - accuracy = 0.38275362557227843
HER2_Immune_PCA_18006808 - C=1, kernel=linear - accuracy = 0.7712925953340461
HER2_Immune_PCA_18006808 - C=1, kernel=poly - accuracy = 0.5665858486169367
HER2_Immune_PCA_18006808 - C=1, kernel=rbf - accuracy = 0.6776785371604024
HER2_Immune_PCA_18006808 - C=1, kernel=sigmoid - accuracy = 0.37964174685418206
HE

# Training RF Models With Various Hyper Parameters

In [9]:
#test with rf

#picking out a single set column and assigning it a varible name
testing_ds = panimmune[panimmune['SetName'] == "HER2_Immune_PCA_18006808"]
    
    #making ensembl filter
testing_df = pd.merge(ensg, testing_ds, left_on = 'symbol', right_on = 'Gene', how = 'inner')
testing_filter = testing_df.iloc[:, 1:2]

    #filtering the big df with the filter ensmbl filter
ensemblIDs = set(testing_filter['id']) 
cols = list(dabigboy.columns)
newCols = [a for a in cols if a in ensemblIDs]
finalCols = cols[0:3]
finalCols.extend(newCols)      
testing_df = dabigboy[finalCols]
    
    #making the train/test split
df_train, df_test = train_test_split(testing_df, test_size=0.25)
x_train = df_train.drop(columns=["Immune Subtype", "TCGA Participant Barcode"]).values #we drop the columns we dont want to use to train, and we turn the graph back into arrays
y_train = df_train["Immune Subtype"].values
xt, xv, yt, yv = train_test_split(x_train, y_train, test_size=0.25)
    
    #making the cross fold validation with Random Forest model
rfModel = RandomForestClassifier()
cross_val_score(rfModel, x_train, y_train, cv=5, scoring='accuracy')
accuracies = cross_val_score(rfModel, x_train, y_train, cv=5, scoring='accuracy')
print('HER2_Immune_PCA_18006808 =') 
print(np.mean(accuracies)) #kaplin myer, diffrential survival antlsis, upset plot
print(accuracies)

HER2_Immune_PCA_18006808 =
0.7865508676700388
[0.78164323 0.8        0.75185185 0.81111111 0.78814815]


### Testing Several Sets of Parameters at a Time

In [24]:
# Define a range of hyperparameter values to test
n_estimators_values = [50, 100, 150]
max_depth_values = [None, 10, 20]

# Iterate over hyperparameter values
for n_estimators in n_estimators_values:
    for max_depth in max_depth_values:
        # Create and train the RandomForestClassifier model
        rfModel = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        accuracies = cross_val_score(rfModel, x_train, y_train, cv=5, scoring='accuracy')
        
        # Print the results
        print(f'HER2_Immune_PCA_18006808 - n_estimators={n_estimators}, max_depth={max_depth} - accuracy =', np.mean(accuracies))


HER2_Immune_PCA_18006808 - n_estimators=50, max_depth=None - accuracy = 0.7812190695506758
HER2_Immune_PCA_18006808 - n_estimators=50, max_depth=10 - accuracy = 0.7844778901773721
HER2_Immune_PCA_18006808 - n_estimators=50, max_depth=20 - accuracy = 0.7880332264166461
HER2_Immune_PCA_18006808 - n_estimators=100, max_depth=None - accuracy = 0.7889202511171424
HER2_Immune_PCA_18006808 - n_estimators=100, max_depth=10 - accuracy = 0.7832921567014832
HER2_Immune_PCA_18006808 - n_estimators=100, max_depth=20 - accuracy = 0.7920314718863942
HER2_Immune_PCA_18006808 - n_estimators=150, max_depth=None - accuracy = 0.7936604435671794
HER2_Immune_PCA_18006808 - n_estimators=150, max_depth=10 - accuracy = 0.7825517449351647
HER2_Immune_PCA_18006808 - n_estimators=150, max_depth=20 - accuracy = 0.7924767935959646


### Testing One Set of Parameters at a Time

In [28]:
rfModel = RandomForestClassifier(n_estimators=250, max_depth=50)
accuracies = cross_val_score(rfModel, x_train, y_train, cv=5, scoring='accuracy')
        
        # Print the results
print(f'HER2_Immune_PCA_18006808 - n_estimators={n_estimators}, max_depth={max_depth} - accuracy =', np.mean(accuracies))


HER2_Immune_PCA_18006808 - n_estimators=150, max_depth=20 - accuracy = 0.7948454094360831
