In [None]:
# Keep the star import first so the explicit imports below win any name clash.
from New_PF import*
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix, roc_auc_score, classification_report
# Print wide DataFrames on a single line instead of wrapping columns across
# several blocks.
pd.options.display.expand_frame_repr = False

In [None]:
# Load the sampled training epitopes and split them into the feature matrix
# `x` (every column except 'ID') and the target `y` (the 'ID' column).
filepath = 'Data/Model/Sampled Training Epitopes.csv'
df = pd.read_csv(filepath)
input_columns = [col for col in df.columns if col != 'ID']
# 'Unnamed: 0' is presumably a saved index column from an earlier to_csv call;
# errors='ignore' makes the drop harmless when the column is absent.
x = df[input_columns].drop(columns=['Unnamed: 0'], errors='ignore')
# `y` is a single column (a Series). Series.drop ignores the `columns`
# argument, so the drop that used to follow this line did nothing.
y = df['ID']

In [None]:
# Estimators compared by mean ROC AUC in the regression evaluation cell.
regression_models = [
    LinearRegression(),
    LogisticRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]
# Estimators compared by precision / accuracy / recall, and reused for the
# clinical-epitope validation.
classification_models = [
    RandomForestClassifier(),
    SVC(),
    GaussianNB(),
    MLPClassifier(max_iter=1000),
]

In [None]:
# Candidate feature subsets to evaluate. The first entry is the full feature
# set; the remaining entries are built from the name lists below.
immunogenicity_input = ['Immunogenicity','Alternative Immunogenicity','Antigenicity']
rank_input = ['MHCFlurry Rank','NetMHC Rank']
# Every generated subset ends with these two columns.
shared_suffix = ['MHCFlurry WT:MUT Rank','NetMHC Stability']

input_features_list = [
    ['Immunogenicity','Alternative Immunogenicity','Antigenicity','Hydropathicity',
     'MHCFlurry Rank','MHCFlurry WT:MUT Rank','NetMHC Rank','NetMHC Stability'],
]
# One subset per (immunogenicity score, rank) pair:
# score + hydropathicity + rank + shared suffix.
for score_name in immunogenicity_input:
    for rank_name in rank_input:
        input_features_list.append([score_name, 'Hydropathicity', rank_name, *shared_suffix])
# One subset per rank on its own: rank + shared suffix.
for rank_name in rank_input:
    input_features_list.append([rank_name, *shared_suffix])

In [None]:
# Cross-validate every model in `regression_models` on every candidate feature
# set and rank the combinations by mean ROC AUC over five folds.
# The splitter is seeded, so a single instance yields the same folds for every
# model / feature-set pair; there is no need to rebuild it per iteration.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
df_rows = []
for model in regression_models:
    for input_features in input_features_list:
        scores = cross_val_score(model, x[input_features], y, cv=kf, scoring='roc_auc')
        # Label rows with the class name. Slicing "()" off the repr gives the
        # same text for default-constructed estimators but truncates the label
        # as soon as an estimator is built with non-default parameters.
        df_rows.append([str(input_features), type(model).__name__, scores.mean()])
df = pd.DataFrame(df_rows, columns=['Input Features', 'Model', 'AUC of ROC'])
# Best-scoring combination first, with a clean 0..n-1 index.
df = df.sort_values('AUC of ROC', ascending=False).reset_index(drop=True)
print(df)
df.to_csv('Data/Model/Regression Model Performance.csv')

In [None]:
# Metric definitions:
#   Precision/PPV: tp/(tp+fp), portion of the identified positives that are actual positives
#   Accuracy:      t/(t+f), portion of all predictions that are correct
#   Recall/TPR:    tp/(tp+fn), portion of the actual positives that are identified
#
# Cross-validate every model in `classification_models` on every candidate
# feature set. All three metrics are taken from the SAME fitted folds via
# cross_validate: scoring them with three separate cross_val_score calls would
# refit the (unseeded) estimators three times, so the metrics in one row would
# describe different models and the cell would take three times as long.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
df_rows = []
for model in classification_models:
    for input_features in input_features_list:
        cv_results = cross_validate(
            model,
            x[input_features],
            y,
            cv=kf,
            scoring=['precision', 'accuracy', 'recall'],
        )
        df_rows.append([
            str(input_features),
            str(model),
            cv_results['test_precision'].mean(),
            cv_results['test_accuracy'].mean(),
            cv_results['test_recall'].mean(),
        ])
df = pd.DataFrame(df_rows, columns=['Input Features', 'Model', 'PPV', 'Accuracy', 'TPR'])
# Most accurate combination first, with a clean 0..n-1 index.
df = df.sort_values('Accuracy', ascending=False).reset_index(drop=True)
print(df)
df.to_csv('Data/Model/Classification Model Performance.csv')

In [None]:
#Random Forest Classifier
def RFC(x, y, show_features=False):
    """Fit a default RandomForestClassifier on (x, y) and pickle it.

    The fitted model is written to
    'Data/Model/Random Forest Classifier Model.pkl'. Nothing is returned.

    Parameters
    ----------
    x : feature matrix passed to ``model.fit``.
    y : target values passed to ``model.fit``.
    show_features : when truthy, print the feature importances sorted from
        most to least important and show them as a bar chart.
    """
    print('Random Forest Classifier')
    model = RandomForestClassifier()
    model.fit(x, y)
    with open('Data/Model/Random Forest Classifier Model.pkl', 'wb') as file:
        pickle.dump(model, file)
    if show_features:
        # feature_names_in_ is only set when x carried string column names
        # (e.g. a DataFrame); fall back to positional labels otherwise instead
        # of failing with AttributeError.
        feature_names = getattr(model, 'feature_names_in_', None)
        if feature_names is None:
            feature_names = [f'x{i}' for i in range(model.n_features_in_)]
        importances = model.feature_importances_
        feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
        print(feature_importances)
        plt.figure(figsize=(8, 6))
        plt.bar(feature_importances['Feature'], feature_importances['Importance'])
        plt.xlabel('Feature')
        plt.ylabel('Importance')
        plt.title('Feature Importances')
        plt.xticks(rotation=-45)
        plt.show()

In [None]:
#Linear Regression
def LinReg(x, y):
    """Fit a default LinearRegression on (x, y) and pickle it.

    The fitted estimator is written to
    'Data/Model/Linear Regression Model.pkl'. Nothing is returned.
    """
    print('Linear Regression')
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted = LinearRegression().fit(x, y)
    output_path = 'Data/Model/Linear Regression Model.pkl'
    with open(output_path, 'wb') as handle:
        pickle.dump(fitted, handle)

In [None]:
#Logistic Regression
def LogReg(x, y):
    """Fit a default LogisticRegression on (x, y) and pickle it.

    The fitted estimator is written to
    'Data/Model/Logistic Regression Model.pkl'. Nothing is returned.
    """
    print('Logistic Regression')
    classifier = LogisticRegression()
    classifier.fit(x, y)
    destination = 'Data/Model/Logistic Regression Model.pkl'
    with open(destination, 'wb') as out:
        pickle.dump(classifier, out)

In [None]:
# Fit and pickle the final models. The two linear models share one feature
# subset; the random forest uses a wider one and also reports importances.
binding_features = ['MHCFlurry Rank', 'MHCFlurry WT:MUT Rank', 'NetMHC Stability']
forest_features = ['Antigenicity', 'Hydropathicity', 'MHCFlurry Rank', 'MHCFlurry WT:MUT Rank', 'NetMHC Stability']
LinReg(x[binding_features], y)
LogReg(x[binding_features], y)
RFC(x[forest_features], y, show_features=True)

In [None]:
# Validate every classifier / feature-set combination against a reference list
# of clinical epitope sequences: fit the classifier, run the external ranking
# pipeline with it, and count how many reference sequences end up among the
# peptides flagged with ID == 1.
csv_filename = 'Data/Clinical/Clinical Epitopes.csv'
P8_csv_filename = 'Data/Clinical/Clinical Cancer Peptides.csv'
regression_model_file = 'Data/Model/Linear Regression Model.pkl'

# Reload the training data so this cell does not depend on `df`, `x` or `y`
# having survived the earlier cells (the evaluation cells rebind `df`).
df = pd.read_csv('Data/Model/Sampled Training Epitopes.csv')
input_columns = [col for col in df.columns if col != 'ID']
x = df[input_columns]
x = x.drop(columns=['Unnamed: 0'], errors='ignore')
# `y` is a Series; Series.drop ignores `columns`, so no drop is needed here.
y = df['ID']

# Reference sequences. Defined once, outside the loops, because the list is
# the same for every combination. Some sequences appear more than once and
# every occurrence is counted in the totals below.
clinical_epitopes = ['FVGEFFTDV', 'KTVNELQNL', 'EYYELFVNI', 'IYNEYIYDL', 'SYRNEIAYL', 'RYCNLEGPPI', 'CYTWNQMNL', 'YMDGTMSQV', 'IMDQVPFSV', 'LAGIGILTV', 'IISAVVGIL', 'KIFGSLAFL', 'IISAVVGIL', 'SAPDNRPAL', 'KIFGSLAFL', 'YLSGADLNL', 'YMFPNAPYL', 'FLGENISNFL', 'ALADGVQKV', 'ALFDGDPHL', 'SVFAGVVGV', 'LLYPTEITV', 'STAPPVHNV', 'SVASTITGV', 'YVDPVITSI', 'LAALPHSCL', 'LLGATCMFV', 'VLNLYLLGV', 'CYTWNQMNL', 'ALWAWPSEL', 'SLWAGVVVL', 'TMLARLASA', 'LTFGDVVAV', 'KIQEILTQV', 'NLDTLMTYV', 'GLWHHQTEV', 'AIIDGVESV', 'KVFAGIPTV', 'AMTQLLAGV', 'SLLMVVITQV', 'NSQPVWLCL', 'KLRQEVKQNL', 'KTVNELQNL', 'SYGVLLWEI', 'RFVPDGNRI', 'EYYELFVNI', 'IYNEYIYDL', 'RYCNLEGPPI', 'KTVNELQNL', 'YMMPVNSEV', 'KLATAQFKI', 'NYGIYKQDL', 'EYVYEFRDKL', 'RLMNDMTAV', 'KLMSSNSTDL', 'IYTWIEDHF', 'RYCNLEGPPI', 'EYYELFVNI', 'KVYLRVRPLL', 'SYGVLLWEIF', 'YGAAVQAAI', 'MVNELFDSL', 'KVYLRVRPLL', 'SYGVLLWEI', 'RFVPDGNRI', 'RMFPNAPYL', 'CYTWNQMNL', 'ALLEIASCL', 'EYYELFVNI', 'MVNELFDSL', 'LFDSLFPVI', 'SLQVTRIFL', 'LLQAEAPRL', 'KLKHYGPGWV', 'KLVERLGAA', 'DVWSFGILL', 'DLLSHAFFA', 'ASLDSDPWV', 'RLQEWCSVI', 'NVLHFFNAPL', 'DYSARWNEI', 'VYDYNCHVDL', 'HYTNASDGL', 'DYLRSVLEDF', 'RYLTQETNKV', 'LYCESVHNF', 'HYRKWIKDTI', 'DYVREHKDNI', 'WLEYYNLER', 'QIRPIFSNR', 'ILEQSGWWK', 'VIQNLERGYR', 'GIHKQKEKSR', 'GAAPLILSR', 'APAGRPSASR', 'KIREEYPDR', 'VYGIRLEHF', 'YLVPIQFPV', 'SLVLQPSVKV', 'GLMDLSTTPL', 'RFVPDGNRI', 'EIWTHSYKV', 'EIWTFSTKV']

df_rows = []
for input_features in input_features_list:
    for model in classification_models:
        model.fit(x[input_features], y)
        # The classifier is handed to the ranking pipeline through a temporary
        # pickle named after the model's repr.
        classification_model_file = 'Data/Model/'+str(model)+'.pkl'
        with open(classification_model_file, 'wb') as file:
            pickle.dump(model, file)
        try:
            # rank_peptides / to_peptide_list are defined outside this notebook.
            # NOTE(review): rank_peptides is assumed to update csv_filename in
            # place with an 'ID' prediction column - confirm against its source.
            rank_peptides(csv_filename,regression_model_file,classification_model_file)
            df = pd.read_csv(csv_filename)
            # Keep only the rows flagged with ID == 1.
            df = df[df['ID']==1]
            df = df.reset_index(drop=True)
            df = df.drop(columns=['Unnamed: 0'], errors='ignore')
            df.to_csv(P8_csv_filename)
            cancer_peptides = to_peptide_list(P8_csv_filename)
        finally:
            # Remove the temporary pickle even when scoring fails, so an
            # aborted run does not leave stray model files behind.
            os.remove(classification_model_file)
        # Distinct sequences among the flagged peptides. A list with equality
        # checks is kept on purpose: the type of peptide.seq is not visible
        # here, so hash-based containers are not assumed to be safe.
        unique_seq = []
        for peptide in cancer_peptides:
            if peptide.seq not in unique_seq:
                unique_seq.append(peptide.seq)
        # Reference sequences that were NOT flagged (false negatives).
        fn_list = [seq for seq in clinical_epitopes if seq not in unique_seq]
        df_rows.append([str(model),input_features,1-len(fn_list)/len(clinical_epitopes),len(clinical_epitopes)-len(fn_list),len(clinical_epitopes)])
df = pd.DataFrame(df_rows, columns = ['Model','Input Features', 'TPR', 'TP', 'Total Epitopes'])
# Combination that recovers the most reference sequences first.
df = df.sort_values('TP',ascending=False)
df = df.reset_index(drop=True)
df.to_csv('Data/Model/Model Validation with Clinical Epitopes.csv')