<a href="https://colab.research.google.com/github/dlee940/Parkinson-s-Disease-Progression-Prediction/blob/main/peptideProteinModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports**

In [None]:
import pandas as pd #python data analysis library
import numpy as np #best for working with arrays!
import matplotlib.pyplot as plt #Plotting/viz library
import seaborn as sns #data viz library based on matplotlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

from sklearn.metrics import confusion_matrix,precision_score,recall_score,classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
import warnings

# **Reading Data from Source**

In [None]:
# Mount Google Drive at /content/drive; the CSV paths defined below all live
# under /content/drive/MyDrive. This import only resolves inside a runtime that
# provides the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory holding the competition CSVs. Edit dataDir when running on Kaggle
# instead of Google Drive.
dataDir = "/content/drive/MyDrive/Parkinsons/Subteam 3 Spring 2023/Kaggle Competition Spring 23/amp-parkinsons-disease-progression-prediction"
patientPath = dataDir + "/train_clinical_data.csv"
peptidePath = dataDir + "/train_peptides.csv"
proteinPath = dataDir + "/train_proteins.csv"

In [None]:
# Load the three tables in the same order as before: proteins, peptides,
# clinical.
proteins, peptides, clinical = (
    pd.read_csv(csv_path)
    for csv_path in (proteinPath, peptidePath, patientPath)
)

In [None]:
# Report (rows, columns) for each loaded table, one line per table.
shape_report = (
    ('proteins shape:             ', proteins),
    ('peptides shape:             ', peptides),
    ('clinical shape:             ', clinical),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

proteins shape:              (232741, 5)
peptides shape:              (981834, 6)
clinical shape:              (2615, 8)


# **Data Preprocessing**

In [None]:
# Clinical rows used as the reference visit for all lookup tables below.
# NOTE(review): the variable names say "df_0_*" but the filter selects
# visit_month == 3 — confirm which month is intended.
month3_visits = clinical[clinical.visit_month == 3]

# One (visit_id, updrs_k) frame per score column, k = 1..4.
df_lst = [month3_visits[['visit_id', 'updrs_{0}'.format(k)]] for k in range(1, 5)]
df_0_1, df_0_2, df_0_3, df_0_4 = df_lst

# For every protein (UniProt) and every peptide, the mean of each updrs_k over
# the reference visits it appears in. The aggregate column is always named
# 'updrs_1_sum' — although it holds a mean, and for every k — because
# features() reads it under that name.
df_proteins_fts = []
df_peptides_fts = []
for k, target in enumerate(df_lst, start=1):
    score_col = 'updrs_{0}'.format(k)

    df_proteins = pd.merge(proteins, target, on='visit_id', how='inner').reset_index()
    df_proteins_fts.append(
        df_proteins.groupby('UniProt').agg(updrs_1_sum=(score_col, 'mean')).reset_index()
    )

    df_peptides_ab = pd.merge(peptides, target, on='visit_id', how='inner').reset_index()
    df_peptides_fts.append(
        df_peptides_ab.groupby('Peptide').agg(updrs_1_sum=(score_col, 'mean')).reset_index()
    )

# Keep the individually named tables available, as before.
proteins_updrs1, proteins_updrs2, proteins_updrs3, proteins_updrs4 = df_proteins_fts
peptides_updrs1, peptides_updrs2, peptides_updrs3, peptides_updrs4 = df_peptides_fts

In [None]:
def features(df, proteins, peptides, classes):
    """Attach per-visit summary features to ``df``.

    Four groups of min / max / mean / std statistics are computed per
    ``visit_id`` and left-merged onto ``df`` (so every row of ``df`` is kept):

    * ``NPX_*``               - over ``proteins['NPX']``
    * ``Abe_*``               - over ``peptides['PeptideAbundance']``
    * ``proteins_updrs_1_*``  - over the per-protein ``updrs_1_sum`` value
      looked up in the module-level ``df_proteins_fts[classes]``
    * ``peptides_updrs_1_*``  - over the per-peptide ``updrs_1_sum`` value
      looked up in the module-level ``df_peptides_fts[classes]``

    The ``*_updrs_1_*`` column names are the same for every ``classes`` value.

    Parameters
    ----------
    df : DataFrame with a ``visit_id`` column (extra columns are passed through).
    proteins : DataFrame with ``visit_id``, ``UniProt`` and ``NPX`` columns.
    peptides : DataFrame with ``visit_id``, ``Peptide`` and
        ``PeptideAbundance`` columns.
    classes : int index selecting which lookup table in ``df_proteins_fts`` /
        ``df_peptides_fts`` to use.

    Returns
    -------
    DataFrame: ``df`` plus the 16 feature columns, with missing numeric values
    replaced by the mean of their column.
    """
    def visit_stats(frame, column, prefix):
        # min / max / mean / std of `column` per visit, named '<prefix>_<stat>'.
        named = {
            '{0}_{1}'.format(prefix, stat): (column, stat)
            for stat in ('min', 'max', 'mean', 'std')
        }
        return frame.groupby('visit_id').agg(**named).reset_index()

    proteins_npx_ft = visit_stats(proteins, 'NPX', 'NPX')
    peptides_ft_ab = visit_stats(peptides, 'PeptideAbundance', 'Abe')

    # Map each measured protein / peptide to its precomputed mean score, then
    # summarise those means per visit.
    df_proteins = pd.merge(proteins, df_proteins_fts[classes], on='UniProt', how='left')
    proteins_ft = visit_stats(df_proteins, 'updrs_1_sum', 'proteins_updrs_1')
    df_peptides = pd.merge(peptides, df_peptides_fts[classes], on='Peptide', how='left')
    peptides_ft = visit_stats(df_peptides, 'updrs_1_sum', 'peptides_updrs_1')

    for block in (proteins_npx_ft, peptides_ft_ab, proteins_ft, peptides_ft):
        df = pd.merge(df, block, on='visit_id', how='left')

    # numeric_only=True: 'visit_id' is a string column, and DataFrame.mean()
    # raises TypeError on non-numeric columns in pandas >= 2.0.
    df = df.fillna(df.mean(numeric_only=True))
    return df

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0 instead of dividing by
    zero. The result is ``100 * mean`` of those contributions.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers (list, ndarray, pandas Series).
        Values are compared by position; any pandas index is ignored.

    Returns
    -------
    float
    """
    # Coerce up front so lists work and Series are never index-aligned.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only divide where at least one side is non-zero; the rest stay at 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap = np.divide(num, dem, out=np.zeros_like(num), where=pos_ind)

    return 100 * np.mean(smap)

# **Training**

In [None]:
# Fitted searches, keyed by 0-based target index (0 -> updrs_1, ...). The
# inference loop reads model[0..2] and reuses the fitted scaler `mms`.
model = {}
mms = MinMaxScaler()

# --- Random-forest search space -------------------------------------------
n_estimators = list(range(5,200)) # number of trees in the random forest
# number of features in consideration at every split; 1.0 (all features) is
# what 'auto' meant for regressors before that option was removed
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
# minimum sample number to split a node; starts at 2 because an integer value
# of 1 is rejected by scikit-learn and made those candidates fail to fit
min_samples_split = list(range(2, 12))
min_samples_leaf = list(range(1, 12)) # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

# --- SVR search space -------------------------------------------------------
kernels = ['poly','rbf','sigmoid','linear']
gamma = ['scale', 'auto']

# NOTE: silences every warning for the rest of the session, including fit
# failures reported by the parameter search.
warnings.filterwarnings('ignore')

# SMAPE is an error, so lower is better. Without greater_is_better=False the
# search would maximise it and select the worst candidate.
smape_scorer = make_scorer(smape, greater_is_better=False)

scale_col = ['NPX_min','NPX_max','NPX_mean','NPX_std', 'Abe_min', 'Abe_max', 'Abe_mean', 'Abe_std']

forest_params = [{'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}]
svr_params = [{'kernel': kernels, 'gamma': gamma}]


def _run_search(label, estimator, param_space, X, y):
    """Run a 10-fold randomized search, print its summary and return it."""
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    search = RandomizedSearchCV(estimator, param_space, cv = cv, scoring=smape_scorer, verbose = 0)
    search.fit(X, y)

    print("\n" + label)
    print(search.best_params_)
    # The scorer negates SMAPE; flip the sign back so the printed value is the
    # cross-validated SMAPE itself.
    print(-search.best_score_)
    return search


# Only updrs_1..updrs_3 are modelled; map_test() returns 0 for updrs_4.
for i in range(3):
    print('--------------------------------------------------------')
    print('Model {0}'.format(i + 1))
    target = 'updrs_{0}'.format(i + 1)

    train_0 = features(df_lst[i], proteins, peptides, i)
    train_0[scale_col] = mms.fit_transform(train_0[scale_col])

    X = train_0.drop(columns = ['visit_id', target])
    y = train_0[target].astype(np.float32)

    # Random Forest (reported for comparison only)
    rfc = RandomForestRegressor()
    clf = _run_search("RANDOM FOREST", rfc, forest_params, X, y)
    print('Train smape:', smape(train_0[target], clf.predict(X)))

    # SVR (this is the search kept for inference)
    svr = SVR()
    clf = _run_search("SVR", svr, svr_params, X, y)
    print('Train smape:', smape(train_0[target], clf.predict(X)))
    model[i] = clf

--------------------------------------------------------
Model 1

RANDOM FOREST
{'n_estimators': 190, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
71.51256438519329
Train smape: 70.54270917364842

SVR
{'kernel': 'poly', 'gamma': 0.721}
71.75552241319085
Train smape: 70.49687673978075
--------------------------------------------------------
Model 2

RANDOM FOREST
{'n_estimators': 154, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
65.41027331731007
Train smape: 64.66225563775066

SVR
{'kernel': 'sigmoid', 'gamma': 0.4738}
63.459527547477286
Train smape: 62.980791845029884
--------------------------------------------------------
Model 3

RANDOM FOREST
{'n_estimators': 107, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 110, 'bootstrap': True}
41.332810491238796
Train smape: 40.96884011487353

SVR
{'kernel': 'poly', 'gamma': 0.4421

In [None]:
# Set up the test-time API: make_env() builds the environment object and
# iter_test() returns the iterator that the prediction loop below consumes.
# NOTE(review): amp_pd_peptide is presumably supplied by the Kaggle competition
# runtime rather than installable from PyPI — confirm before running elsewhere.
import amp_pd_peptide
env = amp_pd_peptide.make_env()
iter_test = env.iter_test()

In [None]:
def map_test(x):
    """Return the rating for one ``prediction_id`` string.

    ``x`` is split on ``'_'``: the first two fields form the ``visit_id`` and
    the next two form the target name (``'updrs_1'`` ... ``'updrs_4'``). Any
    remaining fields are ignored.

    * ``updrs_4`` always maps to 0.
    * ``updrs_1`` reads column ``pred0`` and ``updrs_3`` reads ``pred2`` of the
      module-level ``df`` for that visit; any other target reads ``pred1``.

    Raises ``IndexError`` if ``x`` has fewer than four ``'_'``-separated fields
    or if ``df`` has no row for the visit.
    """
    parts = x.split('_')
    visit_id = parts[0] + '_' + parts[1]
    updrs = parts[2] + '_' + parts[3]

    # set all predictions 0 where updrs equals 'updrs_4'
    if updrs == 'updrs_4':
        return 0

    pred_col = {'updrs_1': 'pred0', 'updrs_3': 'pred2'}.get(updrs, 'pred1')
    return df.loc[df.visit_id == visit_id, pred_col].values[0]

counter = 0
scale_col = ['NPX_min','NPX_max','NPX_mean','NPX_std', 'Abe_min', 'Abe_max', 'Abe_mean', 'Abe_std']

# The API will deliver four dataframes in this specific order:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    # One row per visit; map_test() looks predictions up in this frame by
    # visit_id, under the columns pred0 / pred1 / pred2.
    df = test[['visit_id']].drop_duplicates('visit_id')

    for k in range(3):
        batch_ft = features(df[['visit_id']], test_proteins, test_peptides, k)
        # Apply the scaler exactly as fitted on the training features. Re-fitting
        # it on each test batch (as fit_transform did) rescales every batch to
        # its own min/max, so the model would see inputs on a different scale
        # from the one it was trained on.
        batch_ft[scale_col] = mms.transform(batch_ft[scale_col])
        batch_pred = model[k].predict(batch_ft.drop(columns = ['visit_id']))
        # Predictions are rounded up to whole scores.
        df['pred{0}'.format(k)] = np.ceil(batch_pred)

    sample_submission['rating'] = sample_submission['prediction_id'].apply(map_test)
    env.predict(sample_submission)

    # Show the first batch only, as a sanity check.
    if counter == 0:
        display(test)
        display(sample_submission)

    counter += 1
