In [1]:
import pandas as pd
import numpy as np

train_clinical = '~/School/EE445/kaggle/train_clinical_data.csv'
train_peptides = '~/School/EE445/kaggle/train_peptides.csv'
train_supp_clinical = '~/School/EE445/kaggle/supplemental_clinical_data.csv'
train_proteins = '~/School/EE445/kaggle/train_proteins.csv'

In [2]:
# load data
clinical_data = pd.read_csv(train_clinical)
protein_data = pd.read_csv(train_proteins)
peptide_data = pd.read_csv(train_peptides)
supp_data = pd.read_csv(train_supp_clinical)

### Initialize DF:

In [3]:
# retrieve column names
protein_cols = protein_data['UniProt'].unique().tolist()
patient_info_cols = clinical_data.columns[1:3].tolist()
target_cols = clinical_data.columns[-5:].to_list()
cols = patient_info_cols + protein_cols + target_cols
df = pd.DataFrame(columns=cols)

In [4]:
# create list of tuples that will be used to retrieve the protein_data corresponding to patient and visit month
tuples = [(clinical_data['patient_id'].loc[i], clinical_data['visit_month'].loc[i]) for i in range(clinical_data.shape[0])]

for i in tuples:
    # fill target and patient info data
    partial_row1= clinical_data[patient_info_cols + target_cols].loc[tuples.index(i)]
    # fill protein/NPX data
    protein_measurements = protein_data[['UniProt', 'NPX']].loc[(protein_data['patient_id'] == i[0]) & (protein_data['visit_month'] == i[1])]
    partial_row2 = pd.Series(dict(zip(protein_measurements['UniProt'].to_list(), protein_measurements['NPX'].tolist())), dtype='float64')
    # add row to df
    row = pd.concat([partial_row1, partial_row2])
    df.loc[tuples.index(i)] = row

In [5]:
df

Unnamed: 0,patient_id,visit_month,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,...,P60174,Q13449,Q99683,Q99829,Q9UKV8,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55,0,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,177775.0,62898.2,...,,,,,,10.0,6.0,15.0,,
1,55,3,,,,,,,,,...,,,,,,10.0,7.0,25.0,,
2,55,6,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,165638.0,62567.5,...,,49927.5,121417.0,,,8.0,10.0,34.0,,
3,55,9,,,,,,,,,...,,,,,,8.0,9.0,30.0,0.0,On
4,55,12,15257.6,815083.0,41650.9,39763.3,30703.6,4343.6,151073.0,66963.1,...,20088.2,45519.2,121322.0,25589.4,65762.6,10.0,10.0,41.0,0.0,On
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043,48,10589.6,902434.0,44890.8,38771.5,20188.3,3137.07,203487.0,85782.0,...,,,96827.3,14019.1,39046.7,7.0,6.0,13.0,0.0,Off
2611,65043,54,,,,,,,,,...,,,,,,4.0,8.0,11.0,1.0,Off
2612,65043,60,,,,,,,,,...,,,,,,6.0,6.0,16.0,1.0,Off
2613,65043,72,,,,,,,,,...,,,,,,3.0,9.0,14.0,1.0,Off


### Create Model:

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
X = df.drop(target_cols+['patient_id'], axis=1)
y = df[target_cols]

In [8]:
# Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
# Define transforms on numeric types
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Define transforms on categorical types
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Construct ColumnTransformer object for X
preprocessor_X = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, protein_cols + ['visit_month']),
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

# Construct ColumnTransformer object for y
preprocessor_y = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, [target_cols[-1]]),
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

In [10]:
# Drop rows containing NaN
y_train_sparse = y_train.dropna()
X_train_sparse = X_train.loc[y_train_sparse.index]

y_test_sparse = y_test.dropna()
X_test_sparse = X_test.loc[y_test_sparse.index]

# Train the model
model = LinearRegression(fit_intercept=False).fit(preprocessor_X.fit_transform(X_train_sparse),
                                                  preprocessor_y.fit_transform(y_train_sparse))

[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s


In [11]:
y_pred = model.predict(preprocessor_X.fit_transform(X_test_sparse))

[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s


In [12]:
model.score(preprocessor_X.fit_transform(X_test_sparse), preprocessor_y.fit_transform(y_test_sparse))

[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s


-2.5599249832220354