<a href="https://www.kaggle.com/code/dagartallison/parkinsons-submission-v1?scriptVersionId=127526190" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import sys
sys.path.append('/kaggle/input/amp-pd')

In [2]:
import pandas as pd
import numpy as np
import pickle
import amp_pd_peptide
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
def preprocess_train_df(train_clin_df, train_prot_df, train_pep_df, random_state=42):
    '''
        Build one modelling dataframe per UPDRS target from the raw training tables.

        The protein table (UniProt -> NPX) and the peptide table (Peptide_UniProt -> PeptideAbundance)
        are pivoted to one row per visit_id, joined, and inner-merged with the clinical data.
        Missing abundances are filled with 0. The merged rows are shuffled, then for every target
        (updrs_1, updrs_2, updrs_3, updrs_4) the other targets are dropped, rows with missing values
        are removed, and a `kfold` column (0-4) is assigned with StratifiedKFold on the binned target.

        train_clin_df: train_clinical_data.csv as a pandas dataframe
        train_prot_df: train_proteins.csv as a pandas dataframe
        train_pep_df: train_peptides.csv as a pandas dataframe (left unmodified)
        random_state: seed for the row shuffle so repeated runs produce the same row order;
                      pass None for an unseeded shuffle

        Returns a dictionary mapping each updrs target name to its dataframe
    '''

    # drop the medication column
    train_clin_df = train_clin_df.drop(columns=['upd23b_clinical_state_on_medication'])

    # create a column with the UniProt and Peptide name combined
    # (assign returns a new frame, so the caller's peptide table is not changed)
    train_pep_df = train_pep_df.assign(
        peptide_uniprot=train_pep_df['Peptide'] + '_' + train_pep_df['UniProt']
    )

    # create a table with the visit_id as the index and the proteins or peptides as the feature and the abundance as the values
    train_prot_pivot = train_prot_df.pivot(index='visit_id', values='NPX', columns='UniProt')
    train_pep_pivot = train_pep_df.pivot(index='visit_id', values='PeptideAbundance', columns='peptide_uniprot')

    # combine the two tables on the visit_id and fill nan with 0 for this first round
    full_prot_train_df = train_prot_pivot.join(train_pep_pivot).fillna(0)

    full_train_df = train_clin_df.merge(full_prot_train_df, how='inner', left_on='visit_id', right_on='visit_id')

    # seeded shuffle: the row order feeds the fold assignment and the model fits downstream
    full_train_df = full_train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    updrs = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

    final_dfs = dict()

    for target in updrs:

        other_targets = [updr for updr in updrs if updr != target]

        temp_train_df = full_train_df.drop(columns=other_targets).dropna()

        # calculate the number of bins by Sturges' rule, using the rows that actually have this target
        num_bins = int(np.floor(1 + np.log2(len(temp_train_df))))
        temp_train_df['bins'] = pd.cut(temp_train_df[target], bins=num_bins, labels=False)

        temp_train_df = temp_train_df.dropna().reset_index(drop=True)

        # initiate the kfold class from sklearn
        kf = StratifiedKFold(n_splits=5)

        # create a kfold column and fill it with the index of the fold each row validates in
        temp_train_df['kfold'] = -1
        for f, (t_, v_) in enumerate(kf.split(X=temp_train_df, y=temp_train_df['bins'].values)):
            temp_train_df.loc[v_, 'kfold'] = f

        # the bins were only needed for stratification
        final_dfs[target] = temp_train_df.drop(columns='bins')

    return final_dfs

In [4]:
def smape(y_true, y_pred):
    '''
        Symmetric mean absolute percentage error, in percent, rounded to 2 decimals.

        Each term is |y_pred - y_true| / ((|y_true| + |y_pred|) / 2).
        A term where both the true and predicted value are 0 counts as zero error
        instead of producing 0/0 = NaN, which would turn the whole mean into NaN.

        y_true, y_pred: array-likes of the same shape (lists, numpy arrays or pandas Series)
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # only divide where the denominator is non-zero; the remaining terms stay at 0
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    return round(np.mean(ratio) * 100, 2)



def train_rf_model(df_dict):
    '''
        Takes in the preprocessed training dictionary of dataframes (one per updrs target)
        Then trains one random forest regressor per target on all columns except
        visit_id, patient_id, kfold and the target itself

        For each target a line with SMAPE, R2 and MAPE is printed. These are computed on the
        same rows the model was fitted on, so they are in-sample scores, not validation scores.

        Returns a tuple (model_dict, visit0_col_dict):
            model_dict maps each updrs target to its fitted model
            visit0_col_dict maps each updrs target to the feature columns the model was fitted on
    '''
    model_dict = {}
    visit0_col_dict = {}

    for updr in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
        features = df_dict[updr].drop(columns=['visit_id', 'patient_id', 'kfold'])

        y_train = features[updr].values
        features = features.drop(columns=[updr])
        x_train = features.values

        # a fresh forest per target; the fixed seed keeps the fit repeatable for a given row order
        model = RandomForestRegressor(random_state=42)
        model.fit(x_train, y_train)

        preds = model.predict(x_train)
        r2 = metrics.r2_score(y_train, preds)
        mape = metrics.mean_absolute_percentage_error(y_train, preds)
        s_mape = smape(y_train, preds)
        print(f'SMAPE = {s_mape}, R2 = {r2}, MAPE = {mape}')

        model_dict[updr] = model
        visit0_col_dict[updr] = features.columns

    return model_dict, visit0_col_dict


## Create a Model for Forecasting Future Months

In [5]:
def preprocess_forecast_train_df(preprocessed_train_df, target):
    '''
        Takes in the preprocessed training dataframe for a single updrs target
        Keeps the month-0 rows and appends, per patient, one `<target>_<month>` column
        for every visit month found in the data
        The patient_id, the original target column and (when present) the kfold column are dropped
        Returns the resulting forecasting dataframe
    '''

    # long table of the target per patient and visit, ordered by patient and month
    per_visit = (
        preprocessed_train_df[['visit_id', 'patient_id', target, 'visit_month']]
        .sort_values(by=['patient_id', 'visit_month'])
        .reset_index(drop=True)
    )

    # wide table: one row per patient, one column per visit month
    wide = per_visit.pivot(index='patient_id', columns='visit_month', values=target).reset_index()
    wide.columns = ['patient_id'] + [f'{target}_{month}' for month in wide.columns[1:]]

    # the month-0 rows carry the features the forecasts are made from
    is_baseline = preprocessed_train_df['visit_month'] == 0
    merged = preprocessed_train_df[is_baseline].merge(wide, on=['patient_id'], how='left')

    unwanted = ['patient_id', target]
    if 'kfold' in merged.columns:
        unwanted.append('kfold')

    return merged.drop(columns=unwanted)

### Create the forecasting dataframes

In [6]:
def train_forecast(model, processed_forecast_dict, target, month_diff):
    '''
        Fits `model` to predict the target value `month_diff` months after the month-0 visit

        model: an unfitted regressor with fit/predict
        processed_forecast_dict: dictionary of forecasting dataframes keyed by updrs target
        target: the updrs target name
        month_diff: the visit month to forecast

        The features are every column of the forecasting dataframe except visit_id, visit_month
        and the target columns of other months; `<target>_0` is kept as a feature and renamed to
        `<target>`. Rows with no value at the forecast month are dropped.
        Prints the SMAPE on the rows the model was fitted on (an in-sample score).

        Returns a tuple (fitted model, feature columns in the order used for fitting)
    '''

    df = processed_forecast_dict[target]
    target_mo = f'{target}_{month_diff}'

    # of all the per-month target columns keep only month 0 (a feature) and the forecast month (the label)
    updrs_cols = [col for col in df.columns if 'updrs' in col]
    drop_cols = [col for col in updrs_cols if col not in [f'{target}_0', target_mo]]

    df = df.drop(columns=drop_cols)
    df = df.drop(columns=['visit_id', 'visit_month'])
    df = df.rename(columns={f'{target}_0': target})

    # drop nan rows for target column
    df = df.dropna(subset=[target_mo])

    feature_df = df.drop(columns=target_mo)
    y = df[target_mo]

    model.fit(feature_df.values, y)
    preds = model.predict(feature_df.values)

    print(target, month_diff, 'SMAPE:', smape(y, preds))

    return model, feature_df.columns

In [7]:
def preprocess_test_df(test_clin_df, test_prot_df, test_pep_df):
    '''
        Takes in the test data from the csv files in the form of pandas dataframes
        Combines the peptide and protein data (one row per visit_id, missing abundances filled with 0)
        and merges it onto the test rows

        Test rows whose visit has no protein/peptide data are filled in from rows of the same
        patient_id and updrs_test that do have data; every row keeps its own visit_month.
        If no such row exists the protein/peptide columns of that row stay NaN.

        The input dataframes are not modified and the row order of the output is deterministic.

        Outputs the dataframe for inference
    '''

    # drop the medication column and the group key when they are present
    test_clin_df = test_clin_df.drop(
        columns=['upd23b_clinical_state_on_medication', 'group_key'], errors='ignore'
    )

    # create a column with the UniProt and Peptide name combined
    # (assign returns a new frame, so the caller's peptide table is not changed)
    test_pep_df = test_pep_df.assign(
        peptide_uniprot=test_pep_df['Peptide'] + '_' + test_pep_df['UniProt']
    )

    # create a table with the visit_id as the index and the proteins or peptides as the feature and the abundance as the values
    test_prot_pivot = test_prot_df.pivot(index='visit_id', values='NPX', columns='UniProt')
    test_pep_pivot = test_pep_df.pivot(index='visit_id', values='PeptideAbundance', columns='peptide_uniprot')

    # combine the two tables on the visit_id and fill nan with 0
    full_prot_test_df = test_prot_pivot.join(test_pep_pivot).fillna(0)

    # rows whose visit has protein/peptide data
    full_test_df = test_clin_df.merge(full_prot_test_df, how='inner', left_on='visit_id', right_on='visit_id')
    full_test_df = full_test_df.reset_index(drop=True)

    # rows that were lost in the inner merge, i.e. visits without protein/peptide data
    has_data = test_clin_df['row_id'].isin(full_test_df['row_id'])
    filtered_df = test_clin_df[~has_data]

    # borrow the features from rows of the same patient and updrs test
    imputed_df = filtered_df.drop(columns=['visit_month']).merge(
        full_test_df.drop(columns=['row_id', 'visit_id']),
        how='left',
        left_on=['patient_id', 'updrs_test'],
        right_on=['patient_id', 'updrs_test'],
    )
    full_test_df = pd.concat([full_test_df, imputed_df]).reset_index(drop=True)

    # remove the imputed visit month and replace from the test df
    full_test_df = full_test_df.drop(columns='visit_month').merge(
        test_clin_df[['row_id', 'visit_month']], how='left', left_on='row_id', right_on='row_id'
    )

    return full_test_df

In [8]:

def prepare_model_df(model_df, target, train_cols, visit_month=0):
    '''
        model_df is the preprocessed test dataframe which has all of the protein data
        target is the updrs number
        train_cols are the list of columns necessary for the model to do the inference
        visit_month is kept for backward compatibility; the visit_month column is always
        set to 0 because the first prediction is made as if every row were a month-0 visit

        model_df itself is not modified.

        Returns a tuple (features, row_ids):
            features has the rows of model_df for `target`, exactly the columns in train_cols
            (in that order), with missing columns and missing values filled with 0
            row_ids is a one-column dataframe with the row_id of those rows, on the same index
    '''

    # work on a copy of the rows for this target so the caller's dataframe keeps its visit_month
    target_rows = model_df[model_df['updrs_test'] == target].copy()

    # start with all the visit_months as 0 for the first prediction
    target_rows['visit_month'] = 0

    # add the columns the model needs that are not in the test data (created as NaN, filled below)
    not_in_pred_cols = [col for col in train_cols if col not in target_rows.columns]
    target_rows = target_rows.reindex(columns=[*target_rows.columns, *not_in_pred_cols])

    # fill the nan values with 0
    target_rows = target_rows.fillna(0)

    # keep track of the row_id order for later; a copy so callers can add columns to it safely
    row_id_df = target_rows[['row_id']].copy()

    # only the training columns, in the training order
    return target_rows[train_cols], row_id_df

## Train the Visit 0 and Forecasting Models


In [9]:
# read the raw training tables
train_df = pd.read_csv('/kaggle/input/amp-pd/train_clinical_data.csv')
train_prot_df = pd.read_csv('/kaggle/input/amp-pd/train_proteins.csv')
train_pep_df = pd.read_csv('/kaggle/input/amp-pd/train_peptides.csv')

# one preprocessed dataframe per updrs target (the folds are assigned inside)
train_df_dict = preprocess_train_df(train_df, train_prot_df, train_pep_df)

# the visit-0 models and the feature columns each of them was trained on
trained_models_dict, visit0_col_dict = train_rf_model(train_df_dict)

updrs_targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
forecast_months = [6, 12, 24]

# the forecasting dataframe for each updrs target
processed_forecast_dict = {
    updr: preprocess_forecast_train_df(train_df_dict[updr], updr) for updr in updrs_targets
}

# feature columns per target and month; None marks a model that has not been trained
forecast_col_dict = {updr: {month: None for month in forecast_months} for updr in updrs_targets}

# store the instantiated models, one per target and month
forecast_dict = {
    updr: {month: RandomForestRegressor(random_state=42) for month in forecast_months}
    for updr in updrs_targets
}

for updr in updrs_targets:
    for month_diff in forecast_months:

        try:
            forecast_model, forecast_cols = train_forecast(forecast_dict[updr][month_diff], processed_forecast_dict, updr, month_diff)
        except Exception as exc:
            # a single target/month failing must not stop the others, but report why it failed
            print(f'{updr} {month_diff} forecasting model failed!!!!! ({type(exc).__name__}: {exc})')
        else:
            forecast_dict[updr][month_diff] = forecast_model
            forecast_col_dict[updr][month_diff] = forecast_cols
            
            


SMAPE = 36.99, R2 = 0.882756818063978, MAPE = 577624603892519.5
SMAPE = 67.52, R2 = 0.8889851656422887, MAPE = 1979939263145383.2
SMAPE = 58.76, R2 = 0.8979083351951844, MAPE = 4100148613493203.5
SMAPE = 129.79, R2 = 0.8688203953747919, MAPE = 1931798877069098.2
updrs_1 6 SMAPE: 37.93
updrs_1 12 SMAPE: 35.55
updrs_1 24 SMAPE: 35.17
updrs_2 6 SMAPE: 22.58
updrs_2 12 SMAPE: 67.7
updrs_2 24 SMAPE: 70.08
updrs_3 6 SMAPE: 21.06
updrs_3 12 SMAPE: 47.96
updrs_3 24 SMAPE: 53.01
updrs_4 6 forecasting model failed!!!!!
updrs_4 12 SMAPE: 127.5
updrs_4 24 SMAPE: 113.38


In [10]:
def fill_test_cols(test_df, pred_cols):
    '''
        Takes in the prediction columns and the test dataframe
        Returns a new dataframe with exactly the columns in pred_cols, in that order
        Columns that are missing from test_df are added and filled with 0,
        and missing values in the existing columns are filled with 0 as well
        test_df itself is not modified
    '''
    # reindex creates the missing columns as NaN (a numeric dtype, not object) and drops
    # the columns that are not needed, so only the prediction columns get filled
    return test_df.reindex(columns=pred_cols).fillna(0)

## Create the Prediction Function

In [11]:
def create_submission(test_df, test_prot_df, test_pep_df, visit0_col_dict, trained_models_dict, forecast_col_dict, forecast_dict):

    '''
    Predicts every test row at its visit and 6, 12 and 24 months later.

    Need to input the following variables:

    test_df = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test.csv')
    test_prot_df = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test_proteins.csv')
    test_pep_df = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test_peptides.csv')
    visit0_col_dict      feature columns of the visit-0 model, per updrs target
    trained_models_dict  fitted visit-0 model, per updrs target
    forecast_col_dict    feature columns of the forecasting model, per updrs target and month
    forecast_dict        fitted forecasting model, per updrs target and month

    For each updrs target the visit-0 model predicts the current value, then each forecasting
    model predicts the later month from the same features plus that visit-0 prediction.
    There is no usable 6-month model for updrs_4, so that value is the average of the visit-0
    prediction and the 12-month forecast.

    Returns a dataframe with the columns prediction_id (`<row_id>_plus_<N>_months`) and rating
    '''
    updrs_targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

    test_preprocessed_df = preprocess_test_df(test_df, test_prot_df, test_pep_df)

    visit0_df = pd.DataFrame()
    final_df = pd.DataFrame()

    for updr in updrs_targets:

        # predict the 0 visit first
        md_df, row_id = prepare_model_df(test_preprocessed_df, updr, visit0_col_dict[updr], visit_month=0)
        md_df = fill_test_cols(md_df, visit0_col_dict[updr])

        # copy before adding the prediction so the assignment never lands on a view
        row_id = row_id.copy()
        row_id[f'{updr}'] = trained_models_dict[updr].predict(md_df.values)

        temp_df = pd.concat([row_id, md_df], axis=1)
        visit0_df = pd.concat([visit0_df, temp_df])

        # predict the 6, 12, and 24 later visits
        for month in [6, 12, 24]:
            # updrs_4 at 6 months borrows the 12-month model (see the averaging below)
            source_month = 12 if (updr == 'updrs_4' and month == 6) else month

            # only the rows of the current target have a visit-0 prediction in this column
            forecast_df = visit0_df.dropna(subset=[updr])
            forecast_df = fill_test_cols(forecast_df, forecast_col_dict[updr][source_month])

            preds = forecast_dict[updr][source_month].predict(forecast_df.values)
            if source_month != month:
                # split the difference between 0 and 12
                preds = (preds + forecast_df[updr]) / 2

            forecast_df[f'{updr}_{month}'] = preds
            # join on the index; rows of the other targets get NaN in this column
            visit0_df = visit0_df.join(forecast_df[f'{updr}_{month}'])

        # visit0_df is cumulative, so rows already collected in earlier iterations are dropped again
        final_df = pd.concat([final_df, visit0_df])
        final_df = final_df.drop_duplicates()

    pred_df = final_df[[col for col in final_df.columns if col == 'row_id' or col.startswith('updrs')]]

    # long format: one row per (row_id, prediction column); NaN marks a column of another target
    value_cols = [col for col in pred_df.columns if col != 'row_id']
    melted_df = pd.melt(pred_df, id_vars=['row_id'], value_vars=value_cols)
    melted_df = melted_df.dropna()

    # the prediction column decides the horizon suffix; anything not listed is a 24-month column
    horizon_suffix = {updr: '_plus_0_months' for updr in updrs_targets}
    for month in (6, 12):
        horizon_suffix.update({f'{updr}_{month}': f'_plus_{month}_months' for updr in updrs_targets})
    suffixes = melted_df['variable'].map(horizon_suffix).fillna('_plus_24_months')
    melted_df['row_id'] = melted_df['row_id'] + suffixes

    melted_df = melted_df.rename(columns={'value':'rating'}).drop(columns=['variable'])
    result = melted_df.reset_index(drop=True)
    result = result.rename(columns={'row_id':'prediction_id'})

    return result

In [12]:
# Load the example submission file to see the expected output layout.
sample = pd.read_csv('/kaggle/input/amp-pd/example_test_files/sample_submission.csv')
# Bare expression so the notebook displays the dataframe.
sample

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,0,6
60,50423_6_updrs_4_plus_0_months,0,6
61,50423_6_updrs_4_plus_6_months,0,6
62,50423_6_updrs_4_plus_12_months,0,6


In [13]:
# Dry run on the example test files before going through the competition API.
test = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test.csv')
test_proteins = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test_proteins.csv')
test_peptides = pd.read_csv('/kaggle/input/amp-pd/example_test_files/test_peptides.csv')
# Uses the models and column lists produced by the training cell above.
result = create_submission(test, test_proteins, test_peptides, visit0_col_dict, trained_models_dict, forecast_col_dict, forecast_dict)


In [14]:
# Lift the row limit so the whole result is displayed.
# NOTE: set_option changes the pandas display setting for every later cell as well.
pd.set_option('display.max_rows', None)
# Sorted by prediction_id (a string sort, so "_plus_12_" comes before "_plus_6_").
result.sort_values(by='prediction_id')

Unnamed: 0,prediction_id,rating
2,3342_0_updrs_1_plus_0_months,6.42
10,3342_0_updrs_1_plus_12_months,6.02
14,3342_0_updrs_1_plus_24_months,6.25
6,3342_0_updrs_1_plus_6_months,6.93
18,3342_0_updrs_2_plus_0_months,4.91
26,3342_0_updrs_2_plus_12_months,6.61
30,3342_0_updrs_2_plus_24_months,6.65
22,3342_0_updrs_2_plus_6_months,6.18
34,3342_0_updrs_3_plus_0_months,18.02
42,3342_0_updrs_3_plus_12_months,20.79


## Submission Through the API

In [15]:
# Set up the competition API (amp_pd_peptide is imported from /kaggle/input/amp-pd)
# and get the iterator the submission loop consumes.
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()  

In [16]:
# Each item of iter_test is unpacked as (test, test_peptides, test_proteins, sample_submission).
# create_submission takes the proteins before the peptides, hence the swapped argument order below.
# NOTE: this rebinds the test / test_peptides / test_proteins / result names used in the dry run above.
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
        
    result = create_submission(test, test_proteins, test_peptides, visit0_col_dict, trained_models_dict, forecast_col_dict, forecast_dict)

    env.predict(result)   # register your predictions



This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
