In [61]:
import pandas as pd
import numpy as np

train_clinical = './train_clinical_data.csv'
train_peptides = './train_peptides.csv'
train_supp_clinical = './supplemental_clinical_data.csv'
train_proteins = './train_proteins.csv'

In [62]:
clinical_data = pd.read_csv(train_clinical)
protein_data = pd.read_csv(train_proteins)
peptide_data = pd.read_csv(train_peptides)
supp_data = pd.read_csv(train_supp_clinical)

clinical_data['upd23b_clinical_state_on_medication'] = clinical_data['upd23b_clinical_state_on_medication'].replace({'On': 1})
clinical_data['upd23b_clinical_state_on_medication'] = clinical_data['upd23b_clinical_state_on_medication'].replace({'Off': -1})
clinical_data['upd23b_clinical_state_on_medication'] = clinical_data['upd23b_clinical_state_on_medication'].fillna(0)

#updrs4mean = clinical_data["updrs_4"].mean()
#clinical_data['updrs_4'] = clinical_data['updrs_4'].fillna(updrs4mean)

#updrs3mean = clinical_data["updrs_3"].mean()
#clinical_data['updrs_3'] = clinical_data['updrs_3'].fillna(updrs3mean)

#updrs2mean = clinical_data["updrs_2"].mean()
#clinical_data['updrs_2'] = clinical_data['updrs_2'].fillna(updrs2mean)

#updrs1mean = clinical_data["updrs_1"].mean()
#clinical_data['updrs_1'] = clinical_data['updrs_1'].fillna(updrs1mean)

#clinical_data['visit_month'] = clinical_data['visit_month'].fillna(0)
#clinical_data['visit_month'] = clinical_data['visit_month'].fillna(0)

#supp_data['upd23b_clinical_state_on_medication'] = supp_data['upd23b_clinical_state_on_medication'].replace({'On': 1})
#supp_data['upd23b_clinical_state_on_medication'] = supp_data['upd23b_clinical_state_on_medication'].replace({'Off': -1})
#supp_data['upd23b_clinical_state_on_medication'] = supp_data['upd23b_clinical_state_on_medication'].fillna(0)

#updrs4mean = supp_data["updrs_4"].mean()
#supp_data['updrs_4'] = supp_data['updrs_4'].fillna(updrs4mean)

#updrs3mean = supp_data["updrs_3"].mean()
#supp_data['updrs_3'] = supp_data['updrs_3'].fillna(updrs3mean)

#updrs2mean = supp_data["updrs_2"].mean()
#supp_data['updrs_2'] = supp_data['updrs_2'].fillna(updrs2mean)

#updrs1mean = supp_data["updrs_1"].mean()
#supp_data['updrs_1'] = supp_data['updrs_1'].fillna(updrs1mean)


In [89]:
def Get_ALL_Protein_Names(df_protein):
    return df_protein['UniProt'].unique().tolist()

def Get_Protein_Names(df_protein, visit_id):
    return protein_data.loc[protein_data['visit_id'] == visit_id]['UniProt'].tolist()

def Get_Patient_Info(df_clinical):
    return df_clinical.columns[0:3].tolist() + [df_clinical.columns[-1]]

def Get_Targets(df_clinical):
    return df_clinical.columns[3:7].tolist()

def Get_Protein_Visit_ID(df_protein):
    return df_protein['visit_id'].unique().tolist()

def Get_Clinical_Visit_ID(df_clinical):
    return df_clinical['visit_id'].tolist()

def Get_NPX(df_protein, visit_id):
    return df_protein.loc[df_protein['visit_id'] == visit_id]['NPX'].tolist()

def Create_Patient_Data_Dict(visit_id):
    pat_id, vis_mon = visit_id.split('_')
    return {'patient_id': int(pat_id), 'visit_month': int(vis_mon), 'visit_id': visit_id }

def Create_Protein_Data_Dict(df_protein, NPX_data, visit_id):
    return dict(zip(Get_Protein_Names(df_protein, visit_id), NPX_data)) | Create_Patient_Data_Dict(visit_id)
    
def Find_Visit_ID_Only_Protein_Measured(df_clinical, df_protein):
    protein_list = Get_Protein_Visit_ID(df_protein)
    clinical_list = Get_Clinical_Visit_ID(df_clinical)
    only_protein = [visit_id for visit_id in protein_list if visit_id not in clinical_list]
    return only_protein

def Find_Visit_ID_Only_UPDRS_Measured(df_clinical, df_protein):
    protein_list = Get_Protein_Visit_ID(df_protein)
    clinical_list = Get_Clinical_Visit_ID(df_clinical)
    only_UPDRS = [visit_id for visit_id in clinical_list if visit_id not in protein_list]
    return only_UPDRS

def Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein):
    return [visit_id for visit_id in Get_Clinical_Visit_ID(df_clinical) if visit_id in Get_Protein_Visit_ID(df_protein)]

def Add_Rows_Only_Protein_Measured(df, only_prot_visit_id, df_protein):
    for visit_id in only_prot_visit_id:
        NPX_data = Get_NPX(df_protein, visit_id)
        protein_info = pd.DataFrame(Create_Protein_Data_Dict(df_protein, NPX_data, visit_id), index=[0])
        df = pd.concat([df, protein_info], axis = 0)
    return df

def Add_Protein_Data(df, df_clinical, df_protein):
    prot_visit_id = Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein)
    for visit_id in prot_visit_id:
        NPX_data = Get_NPX(df_protein, visit_id)
        #protein_info = pd.DataFrame(Create_Protein_Data_Dict(df_protein, NPX_data, visit_id), index=[0])
        df.loc[df.index[df['visit_id']==visit_id], Get_Protein_Names(df_protein, visit_id)] = NPX_data
    return df

def Sort_and_Index(df):
    df = df.sort_values(by=['patient_id', 'visit_month'])
    df = df.reset_index()
    return df.drop(['index'], axis = 1)

def Create_Combined_DataFrame(df_clinical, df_protein):
    protein_cols = Get_ALL_Protein_Names(df_protein)
    patient_cols = Get_Patient_Info(df_clinical)
    target_cols = Get_Targets(df_clinical)
    df_with_UniProt_cols = pd.DataFrame(columns = protein_cols, index=df_clinical.index)
    clinical_left = df_clinical[patient_cols]
    clinical_right = df_clinical[target_cols]
    df = pd.concat([clinical_left, df_with_UniProt_cols, clinical_right], axis = 1)
    only_p = Find_Visit_ID_Only_Protein_Measured(clinical_data, protein_data)
    df = Add_Rows_Only_Protein_Measured(df, only_p, protein_data)
    df = Sort_and_Index(df)
    return Add_Protein_Data(df, df_clinical, df_protein)

df = Create_Combined_DataFrame(clinical_data, protein_data)
df = df.fillna(method = 'pad')
df = df.fillna(0)

In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

In [90]:
target_cols = Get_Targets(clinical_data)
protein_cols = GeneratorExit(protein_data)
x = df.drop(target_cols, axis=1).loc[df['visit_month'] == 0]
y = df[target_cols].loc[df['visit_month'] == 0]
y = y.drop('updrs_4', axis=1)

     visit_id  patient_id  visit_month  upd23b_clinical_state_on_medication  \
0        55_0          55            0                                  0.0   
13      942_0         942            0                                  0.0   
28     1517_0        1517            0                                  1.0   
38     1923_0        1923            0                                  0.0   
45     2660_0        2660            0                                  0.0   
...       ...         ...          ...                                  ...   
2595  63875_0       63875            0                                  0.0   
2604  63889_0       63889            0                                 -1.0   
2614  64669_0       64669            0                                  0.0   
2629  64674_0       64674            0                                  0.0   
2645  65043_0       65043            0                                  0.0   

       O00391    O00533   O00584   O14498   O14773 

In [91]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [92]:
# Drop rows containing NaN
y_train_sparse = y_train.dropna()
x_train_sparse = x_train.loc[y_train_sparse.index]

#x_train.to_csv('./x_train.csv', index=False)

y_test_sparse = y_test.dropna()
x_test_sparse = x_test.loc[y_test_sparse.index]

In [93]:
# Train the model
lasso = Lasso(normalize = True,alpha = .055).fit(x_train,y_train)
print(lasso.sparse_coef_)

  (0, 3)	1.9403856837722755
  (0, 20)	-1.0844201943450745e-05
  (0, 76)	-4.493554525704333e-06
  (0, 141)	-7.130289196399911e-06
  (0, 150)	4.3261196508772184e-07
  (0, 211)	1.3086046091575458e-05
  (1, 3)	1.6421896665989928
  (1, 15)	-5.5298961315451394e-05
  (1, 27)	6.590055789146742e-08
  (1, 29)	-1.463759572588744e-07
  (1, 49)	1.3186403874897435e-06
  (1, 84)	2.745879367623066e-07
  (1, 161)	-6.321068585444323e-07
  (2, 3)	2.6570816417669954
  (2, 5)	-7.257894417487691e-06
  (2, 6)	0.00013243557908285966
  (2, 15)	-0.00012461818402306236
  (2, 19)	2.3434543635772083e-06
  (2, 23)	-1.792023935878274e-06
  (2, 27)	1.8168582708525904e-07
  (2, 28)	4.1527133819703984e-07
  (2, 45)	6.818174636004226e-08
  (2, 49)	1.911201094172491e-05
  (2, 54)	-7.504308390007411e-05
  (2, 55)	-1.386300912947046e-05
  (2, 58)	1.7034905795989213e-05
  (2, 67)	5.246162977884142e-07
  (2, 71)	-4.605902170434472e-08
  (2, 83)	0.00012850035287582774
  (2, 84)	7.06070774084114e-06
  (2, 98)	2.922987276790451

In [94]:
y_pred = lasso.predict(x_test)

In [95]:
y_pred

array([[ 4.9427405 ,  4.6928123 , 14.07402938],
       [ 8.12539414,  5.4600881 , 10.37719013],
       [ 5.6519083 ,  3.89059832,  9.75157976],
       [ 7.6682729 ,  5.99606973, 10.00406182],
       [ 6.60218324,  4.63748615, 15.70060533],
       [ 5.9019943 ,  4.54449303, 13.21173954],
       [ 6.0476602 ,  3.80612191, 15.13344425],
       [ 6.15133121,  4.58400161, 13.71853377],
       [ 7.17238522,  5.90556143, 15.57723362],
       [ 6.01932093,  4.47596653, 17.78144582],
       [ 5.48019546,  3.61668624, 14.02576459],
       [ 5.35353454,  4.19009786, 10.5474609 ],
       [ 5.34037482,  4.07246274, 13.39928237],
       [ 5.30615674,  4.29011925, 17.40873685],
       [ 5.51875591,  4.22165619, 21.83815181],
       [ 5.52716971,  4.08297024, 14.83631527],
       [ 4.81769793,  4.41029041, 19.30075009],
       [ 5.06492733,  4.12870114, 15.72412584],
       [ 5.88677345,  4.60776669, 17.18101865],
       [ 5.75168779,  3.99169707, 15.35988368],
       [ 5.05728183,  4.14455802,  7.004

In [96]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

56.724038618649935

In [97]:
np.sqrt(mean_squared_error(y_test, y_pred))

7.531536272145938