The problem is predicting updrs_1, updrs_2, updrs_3 and updrs_4 values at a given visit_month and potential 6, 12 and 24 month visits after the current visit_month.

Build models using linear regression, svm, logistic regression, random forests

for lr, group 4 percentiles

In [1]:
import pandas as pd
import numpy as np

train_clinical = '~/School/EE445/kaggle/train_clinical_data.csv'
train_peptides = '~/School/EE445/kaggle/train_peptides.csv'
train_supp_clinical = '~/School/EE445/kaggle/supplemental_clinical_data.csv'
train_proteins = '~/School/EE445/kaggle/train_proteins.csv'

In [2]:
# load data
clinical_data = pd.read_csv(train_clinical)
protein_data = pd.read_csv(train_proteins)
peptide_data = pd.read_csv(train_peptides)
supp_data = pd.read_csv(train_supp_clinical)

### Initialize DF:

In [3]:
def Get_ALL_Protein_Names(df_protein):
    return df_protein['UniProt'].unique().tolist()

def Get_Protein_Names(df_protein, visit_id):
    return protein_data.loc[protein_data['visit_id'] == visit_id]['UniProt'].tolist()

def Get_Patient_Info(df_clinical):
    return df_clinical.columns[0:3].tolist() + [df_clinical.columns[-1]]

def Get_Targets(df_clinical):
    return df_clinical.columns[3:7].tolist()

def Get_Protein_Visit_ID(df_protein):
    return df_protein['visit_id'].unique().tolist()

def Get_Clinical_Visit_ID(df_clinical):
    return df_clinical['visit_id'].tolist()

def Get_NPX(df_protein, visit_id):
    return df_protein.loc[df_protein['visit_id'] == visit_id]['NPX'].tolist()

def Create_Patient_Data_Dict(visit_id):
    pat_id, vis_mon = visit_id.split('_')
    return {'patient_id': int(pat_id), 'visit_month': int(vis_mon), 'visit_id': visit_id }

def Create_Protein_Data_Dict(df_protein, NPX_data, visit_id):
    return dict(zip(Get_Protein_Names(df_protein, visit_id), NPX_data)) | Create_Patient_Data_Dict(visit_id)
    
def Find_Visit_ID_Only_Protein_Measured(df_clinical, df_protein):
    protein_list = Get_Protein_Visit_ID(df_protein)
    clinical_list = Get_Clinical_Visit_ID(df_clinical)
    only_protein = [visit_id for visit_id in protein_list if visit_id not in clinical_list]
    return only_protein

def Find_Visit_ID_Only_UPDRS_Measured(df_clinical, df_protein):
    protein_list = Get_Protein_Visit_ID(df_protein)
    clinical_list = Get_Clinical_Visit_ID(df_clinical)
    only_UPDRS = [visit_id for visit_id in clinical_list if visit_id not in protein_list]
    return only_UPDRS

def Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein):
    return [visit_id for visit_id in Get_Clinical_Visit_ID(df_clinical) if visit_id in Get_Protein_Visit_ID(df_protein)]

def Add_Rows_Only_Protein_Measured(df, only_prot_visit_id, df_protein):
    for visit_id in only_prot_visit_id:
        NPX_data = Get_NPX(df_protein, visit_id)
        protein_info = pd.DataFrame(Create_Protein_Data_Dict(df_protein, NPX_data, visit_id), index=[0])
        df = pd.concat([df, protein_info], axis = 0)
    return df

def Add_Protein_Data(df, df_clinical, df_protein):
    prot_visit_id = Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein)
    for visit_id in prot_visit_id:
        NPX_data = Get_NPX(df_protein, visit_id)
        #protein_info = pd.DataFrame(Create_Protein_Data_Dict(df_protein, NPX_data, visit_id), index=[0])
        df.loc[df.index[df['visit_id']==visit_id], Get_Protein_Names(df_protein, visit_id)] = NPX_data
    return df

def Sort_and_Index(df):
    df = df.sort_values(by=['patient_id', 'visit_month'])
    df = df.reset_index()
    return df.drop(['index'], axis = 1)

def Create_Combined_DataFrame(df_clinical, df_protein):
    protein_cols = Get_ALL_Protein_Names(df_protein)
    patient_cols = Get_Patient_Info(df_clinical)
    target_cols = Get_Targets(df_clinical)
    df_with_UniProt_cols = pd.DataFrame(columns = protein_cols, index=df_clinical.index)
    clinical_left = df_clinical[patient_cols]
    clinical_right = df_clinical[target_cols]
    df = pd.concat([clinical_left, df_with_UniProt_cols, clinical_right], axis = 1)
    only_p = Find_Visit_ID_Only_Protein_Measured(clinical_data, protein_data)
    df = Add_Rows_Only_Protein_Measured(df, only_p, protein_data)
    df = Sort_and_Index(df)
    return Add_Protein_Data(df, df_clinical, df_protein)

In [4]:
df = Create_Combined_DataFrame(clinical_data, protein_data)

In [5]:
df

Unnamed: 0,visit_id,patient_id,visit_month,upd23b_clinical_state_on_medication,O00391,O00533,O00584,O14498,O14773,O14791,...,P32754,P60174,Q13449,Q99683,Q99829,Q9UKV8,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,...,,,,,,,10.0,6.0,15.0,
1,55_3,55,3,,,,,,,,...,,,,,,,10.0,7.0,25.0,
2,55_6,55,6,,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,...,,,49927.5,121417.0,,,8.0,10.0,34.0,
3,55_9,55,9,On,,,,,,,...,,,,,,,8.0,9.0,30.0,0.0
4,55_12,55,12,On,15257.6,815083.0,41650.9,39763.3,30703.6,4343.60,...,,20088.2,45519.2,121322.0,25589.4,65762.6,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,65043_48,65043,48,Off,10589.6,902434.0,44890.8,38771.5,20188.3,3137.07,...,57323.9,,,96827.3,14019.1,39046.7,7.0,6.0,13.0,0.0
2656,65043_54,65043,54,Off,,,,,,,...,,,,,,,4.0,8.0,11.0,1.0
2657,65043_60,65043,60,Off,,,,,,,...,,,,,,,6.0,6.0,16.0,1.0
2658,65043_72,65043,72,Off,,,,,,,...,,,,,,,3.0,9.0,14.0,1.0


### Validating The DF:

In [7]:
n = 100
results = []
for i in range(n):
    sample = protein_data.sample()
    p_id = sample['patient_id'].values[0]
    vm = sample['visit_month'].values[0]
    up = sample['UniProt'].values[0]
    npx = sample['NPX'].values[0]
    print('Check #:',i)
    print('Patient ID:', p_id)
    print('Visit Month:', vm)
    print('UniProt:', up)
    print('NPX from protein_data:', npx)
    result = float(df[up].loc[(df['patient_id'] == p_id)&(df['visit_month'] == vm)].values)
    print('NPX from df:', result)
    print('Pass?', result == npx)
    print('\n')
    if result == npx:
        results.append(True)
print('Percentage Correct:', sum(results)/n)
    

Check #: 0
Patient ID: 5645
Visit Month: 6
UniProt: P02765
NPX from protein_data: 2749940.0
NPX from df: 2749940.0
Pass? True


Check #: 1
Patient ID: 45161
Visit Month: 48
UniProt: P04075
NPX from protein_data: 27936.8
NPX from df: 27936.8
Pass? True


Check #: 2
Patient ID: 13618
Visit Month: 0
UniProt: P00736
NPX from protein_data: 28667.9
NPX from df: 28667.9
Pass? True


Check #: 3
Patient ID: 942
Visit Month: 48
UniProt: Q14508
NPX from protein_data: 4887.67
NPX from df: 4887.67
Pass? True


Check #: 4
Patient ID: 20664
Visit Month: 0
UniProt: P01033
NPX from protein_data: 64115.4
NPX from df: 64115.4
Pass? True


Check #: 5
Patient ID: 20707
Visit Month: 36
UniProt: P19823
NPX from protein_data: 91017.5
NPX from df: 91017.5
Pass? True


Check #: 6
Patient ID: 47171
Visit Month: 6
UniProt: P01031
NPX from protein_data: 9801.28
NPX from df: 9801.28
Pass? True


Check #: 7
Patient ID: 62792
Visit Month: 0
UniProt: P01011
NPX from protein_data: 1627220.0
NPX from df: 1627220.0
Pass?

Check #: 91
Patient ID: 23391
Visit Month: 60
UniProt: P08697
NPX from protein_data: 256764.0
NPX from df: 256764.0
Pass? True


Check #: 92
Patient ID: 5742
Visit Month: 0
UniProt: P16870
NPX from protein_data: 84518.1
NPX from df: 84518.1
Pass? True


Check #: 93
Patient ID: 27468
Visit Month: 60
UniProt: P06310
NPX from protein_data: 10181.2
NPX from df: 10181.2
Pass? True


Check #: 94
Patient ID: 10138
Visit Month: 36
UniProt: Q7Z3B1
NPX from protein_data: 78972.2
NPX from df: 78972.2
Pass? True


Check #: 95
Patient ID: 20460
Visit Month: 36
UniProt: P98160
NPX from protein_data: 1801.35
NPX from df: 1801.35
Pass? True


Check #: 96
Patient ID: 25827
Visit Month: 12
UniProt: Q06481
NPX from protein_data: 27707.7
NPX from df: 27707.7
Pass? True


Check #: 97
Patient ID: 23175
Visit Month: 3
UniProt: P02452
NPX from protein_data: 33077.5
NPX from df: 33077.5
Pass? True


Check #: 98
Patient ID: 49239
Visit Month: 0
UniProt: Q14118
NPX from protein_data: 207055.0
NPX from df: 207055

### Create Model:

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [21]:
target_cols = Get_Targets(clinical_data)
protein_cols = GeneratorExit(protein_data)
X = df.drop(target_cols, axis=1).loc[df['visit_month'] == 0]
y = df[target_cols].loc[df['visit_month'] == 0]
y = y.drop('updrs_4', axis=1)

In [22]:
X

Unnamed: 0,visit_id,patient_id,visit_month,upd23b_clinical_state_on_medication,O00391,O00533,O00584,O14498,O14773,O14791,...,Q9Y646,Q9Y6R7,P01594,P02792,P32754,P60174,Q13449,Q99683,Q99829,Q9UKV8
0,55_0,55,0,,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,...,23833.7,18953.5,,,,,,,,
13,942_0,942,0,,,,,,,,...,,,,,,,,,,
28,1517_0,1517,0,On,11648.9,419015.0,35053.8,,17466.3,2784.40,...,19771.6,14699.5,18356.40,25826.9,124675.0,18494.30,32892.8,90539.1,11373.4,55379.8
38,1923_0,1923,0,,21361.8,866985.0,32035.1,,13373.1,,...,26346.2,23597.1,,27842.6,75856.4,12385.70,22141.7,93929.3,12141.1,64302.3
45,2660_0,2660,0,,,579829.0,28259.8,29883.2,19134.1,1884.61,...,25630.0,17990.9,10813.80,,103346.0,14354.80,29919.2,86771.8,,55047.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,63875_0,63875,0,,10216.7,562169.0,31586.1,22355.7,24650.4,2123.90,...,18234.8,22909.1,12991.90,12190.2,108133.0,15677.80,28445.1,,14416.9,85051.3
2604,63889_0,63889,0,Off,,409011.0,25629.1,,14566.3,,...,13170.0,,,8943.7,,6414.41,26380.4,82380.5,,82577.7
2614,64669_0,64669,0,,10806.5,491365.0,26262.8,26712.7,15372.5,2995.08,...,,25094.2,11587.50,14343.7,,,30778.7,109909.0,15110.2,63783.2
2629,64674_0,64674,0,,,351914.0,21625.1,17712.4,20849.8,,...,22257.6,10510.5,6551.39,13407.9,,,,98204.8,,64857.7


In [23]:
y

Unnamed: 0,updrs_1,updrs_2,updrs_3
0,10.0,6.0,15.0
13,3.0,2.0,20.0
28,11.0,6.0,25.0
38,2.0,0.0,0.0
45,2.0,0.0,0.0
...,...,...,...
2595,3.0,2.0,4.0
2604,7.0,3.0,30.0
2614,12.0,14.0,27.0
2629,5.0,1.0,13.0


In [24]:
y.isna().sum()

updrs_1    0
updrs_2    0
updrs_3    0
dtype: int64

In [25]:
# Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
# Define transforms on numeric types
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Define transforms on categorical types
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Construct ColumnTransformer object for X
preprocessor_X = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, protein_cols + ['visit_month']),
        ('cat', categorical_transformer, [patient_info_cols[2]])
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

# Construct ColumnTransformer object for y
preprocessor_y = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, [target_cols[-1]]),
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

TypeError: unsupported operand type(s) for +: 'GeneratorExit' and 'list'

In [None]:
# Drop rows containing NaN
y_train_sparse = y_train.dropna()
X_train_sparse = X_train.loc[y_train_sparse.index]

y_test_sparse = y_test.dropna()
X_test_sparse = X_test.loc[y_test_sparse.index]

# Train the model
model = LinearRegression(fit_intercept=False).fit(preprocessor_X.fit_transform(X_train_sparse),
                                                  preprocessor_y.fit_transform(y_train_sparse))

In [None]:
y_pred = model.predict(preprocessor_X.fit_transform(X_test_sparse))

In [None]:
model.score(preprocessor_X.fit_transform(X_test_sparse), preprocessor_y.fit_transform(y_test_sparse))