The task is to predict the updrs_1, updrs_2, updrs_3 and updrs_4 scores at a given visit_month, as well as at potential visits 6, 12 and 24 months after that visit_month.

Build models using linear regression, SVM, logistic regression, and random forests.

For logistic regression (lr), group each target into 4 percentile (quartile) classes.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# load data
# All four CSVs live in one directory. It defaults to the original local path
# and can be redirected by setting the KAGGLE_DATA_DIR environment variable,
# so the notebook is runnable on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('KAGGLE_DATA_DIR', r'C:\Users\cjdri\Documents\EE445\kaggle'))

clinical_data = pd.read_csv(DATA_DIR / 'train_clinical_data.csv')
protein_data = pd.read_csv(DATA_DIR / 'train_proteins.csv')
peptide_data = pd.read_csv(DATA_DIR / 'train_peptides.csv')
supp_data = pd.read_csv(DATA_DIR / 'supplemental_clinical_data.csv')

### Initialize DF:

In [3]:
def Get_ALL_Protein_Names(df_protein):
    """Return each distinct value of the 'UniProt' column once, in order of first appearance."""
    uniprot_ids = df_protein['UniProt']
    return uniprot_ids.drop_duplicates().tolist()

def Get_Protein_Names(df_protein, visit_id):
    """Return the 'UniProt' values of the rows of df_protein recorded for visit_id.

    Rows are returned in their order in df_protein, so the result lines up
    element-for-element with Get_NPX(df_protein, visit_id).

    Parameters:
        df_protein: DataFrame with 'visit_id' and 'UniProt' columns.
        visit_id: visit identifier to select.

    Returns:
        list of UniProt IDs (empty when the visit has no rows).
    """
    # Read from the frame that was passed in, not from the notebook-level
    # protein_data global, so the helper works on any protein table.
    return df_protein.loc[df_protein['visit_id'] == visit_id, 'UniProt'].tolist()

def Get_Patient_Info(df_clinical):
    """Return the names of the first three columns of df_clinical followed by its last column."""
    column_names = df_clinical.columns.tolist()
    leading = column_names[:3]
    trailing = column_names[-1]
    return leading + [trailing]

def Get_Targets(df_clinical):
    """Return the names of the columns at positions 3 through 6 of df_clinical."""
    column_names = df_clinical.columns.tolist()
    return column_names[3:7]

def Get_Protein_Visit_ID(df_protein):
    """Return each distinct 'visit_id' of df_protein once, in order of first appearance."""
    visit_ids = df_protein['visit_id']
    return visit_ids.drop_duplicates().tolist()

def Get_Clinical_Visit_ID(df_clinical):
    """Return the 'visit_id' column of df_clinical as a list (duplicates kept, row order kept)."""
    visit_ids = df_clinical.loc[:, 'visit_id']
    return visit_ids.to_list()

def Get_NPX(df_protein, visit_id):
    """Return the 'NPX' values of the rows of df_protein whose 'visit_id' equals visit_id."""
    is_visit = df_protein['visit_id'] == visit_id
    return df_protein.loc[is_visit, 'NPX'].tolist()

def Create_Patient_Data_Dict(visit_id):
    """Split a '<patient>_<month>' visit id into its integer parts.

    Returns a dict with keys 'patient_id', 'visit_month' and 'visit_id'
    (the original string). A visit_id that does not split into exactly two
    '_'-separated fields raises ValueError.
    """
    patient_part, month_part = visit_id.split('_')
    record = dict(patient_id=int(patient_part), visit_month=int(month_part))
    record['visit_id'] = visit_id
    return record

def Create_Protein_Data_Dict(df_protein, NPX_data, visit_id):
    """Build one row for visit_id: {UniProt name: NPX value, ...} plus the patient fields.

    Protein names come from Get_Protein_Names and are paired positionally with
    NPX_data; the patient fields from Create_Patient_Data_Dict are merged in
    last, so they win on a key collision.
    """
    npx_by_protein = dict(zip(Get_Protein_Names(df_protein, visit_id), NPX_data))
    patient_fields = Create_Patient_Data_Dict(visit_id)
    return {**npx_by_protein, **patient_fields}
    
def Find_Visit_ID_Only_Protein_Measured(df_clinical, df_protein):
    """Return visit ids that appear in df_protein but not in df_clinical.

    The result keeps the order in which the ids first appear in df_protein.
    """
    # A set makes each membership test O(1); scanning the clinical list for
    # every protein visit was quadratic.
    clinical_ids = set(Get_Clinical_Visit_ID(df_clinical))
    return [visit_id for visit_id in Get_Protein_Visit_ID(df_protein) if visit_id not in clinical_ids]

def Find_Visit_ID_Only_UPDRS_Measured(df_clinical, df_protein):
    """Return visit ids that appear in df_clinical but not in df_protein.

    The result keeps the row order of df_clinical (duplicates, if any, are kept).
    """
    # A set makes each membership test O(1); scanning the protein list for
    # every clinical visit was quadratic.
    protein_ids = set(Get_Protein_Visit_ID(df_protein))
    return [visit_id for visit_id in Get_Clinical_Visit_ID(df_clinical) if visit_id not in protein_ids]

def Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein):
    """Return visit ids that appear in both df_clinical and df_protein.

    The result keeps the row order of df_clinical.
    """
    # Compute the protein visit ids once. Calling Get_Protein_Visit_ID inside
    # the comprehension re-ran unique() over the whole protein table for every
    # clinical row.
    protein_ids = set(Get_Protein_Visit_ID(df_protein))
    return [visit_id for visit_id in Get_Clinical_Visit_ID(df_clinical) if visit_id in protein_ids]

def Add_Rows_Only_Protein_Measured(df, only_prot_visit_id, df_protein):
    """Append one row per visit id in only_prot_visit_id to df.

    Each new row carries the visit's NPX values (one column per UniProt name)
    and the patient fields parsed from the visit id; every other column is
    left as NaN by the concatenation.

    Parameters:
        df: frame to extend (not modified).
        only_prot_visit_id: iterable of visit ids to add.
        df_protein: protein table the NPX values are read from.

    Returns:
        A new DataFrame, or df itself when there is nothing to add.
    """
    # Build every single-row frame first and concatenate once; concatenating
    # inside the loop copies the growing frame on every iteration.
    new_rows = [
        pd.DataFrame(
            Create_Protein_Data_Dict(df_protein, Get_NPX(df_protein, visit_id), visit_id),
            index=[0],
        )
        for visit_id in only_prot_visit_id
    ]
    if not new_rows:
        return df
    return pd.concat([df] + new_rows, axis=0)

def Add_Protein_Data(df, df_clinical, df_protein):
    """Write NPX values into df for every visit present in both df_clinical and df_protein.

    For each such visit, the row(s) of df whose 'visit_id' matches are assigned
    the visit's NPX values in the columns named by that visit's UniProt IDs.
    df is modified in place and also returned.
    """
    prot_visit_id = Find_Visit_ID_Protein_and_UPDRS_Measured(df_clinical, df_protein)
    for visit_id in prot_visit_id:
        NPX_data = Get_NPX(df_protein, visit_id)
        # The column list and NPX_data are paired positionally, so this assumes
        # Get_Protein_Names and Get_NPX return the visit's rows in the same order.
        df.loc[df.index[df['visit_id']==visit_id], Get_Protein_Names(df_protein, visit_id)] = NPX_data
    return df

def Sort_and_Index(df):
    """Return df sorted by patient_id then visit_month, with a fresh 0..n-1 index.

    The old index is discarded. Using reset_index(drop=True) instead of
    reset_index() followed by drop('index') also works when the index has a
    name, and never removes a genuine column that happens to be called 'index'.
    """
    return df.sort_values(by=['patient_id', 'visit_month']).reset_index(drop=True)

def Create_Combined_DataFrame(df_clinical, df_protein):
    """Build one wide table with a row per visit and a column per protein.

    Column layout: patient-info columns, one (initially empty) column per
    UniProt ID, then the target columns. Visits that only have protein
    measurements are appended as extra rows, the frame is sorted by
    patient_id / visit_month and re-indexed, and finally the NPX values are
    filled in for visits that have both clinical and protein data.

    Parameters:
        df_clinical: clinical table (patient info + targets).
        df_protein: long-format protein table.

    Returns:
        The combined DataFrame.
    """
    protein_cols = Get_ALL_Protein_Names(df_protein)
    patient_cols = Get_Patient_Info(df_clinical)
    target_cols = Get_Targets(df_clinical)

    # Empty protein columns aligned on the clinical index so the axis=1 concat lines up.
    df_with_UniProt_cols = pd.DataFrame(columns=protein_cols, index=df_clinical.index)
    clinical_left = df_clinical[patient_cols]
    clinical_right = df_clinical[target_cols]
    df = pd.concat([clinical_left, df_with_UniProt_cols, clinical_right], axis=1)

    # Use the frames passed in rather than the notebook-level clinical_data /
    # protein_data globals, so the function works on any pair of tables.
    only_p = Find_Visit_ID_Only_Protein_Measured(df_clinical, df_protein)
    df = Add_Rows_Only_Protein_Measured(df, only_p, df_protein)
    df = Sort_and_Index(df)
    return Add_Protein_Data(df, df_clinical, df_protein)

In [4]:
# Build the combined visit-level table (patient info, one column per protein, targets).
df = Create_Combined_DataFrame(clinical_data, protein_data)

In [5]:
# Show the combined table.
df

Unnamed: 0,visit_id,patient_id,visit_month,upd23b_clinical_state_on_medication,O00391,O00533,O00584,O14498,O14773,O14791,...,P32754,P60174,Q13449,Q99683,Q99829,Q9UKV8,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,...,,,,,,,10.0,6.0,15.0,
1,55_3,55,3,,,,,,,,...,,,,,,,10.0,7.0,25.0,
2,55_6,55,6,,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,...,,,49927.5,121417.0,,,8.0,10.0,34.0,
3,55_9,55,9,On,,,,,,,...,,,,,,,8.0,9.0,30.0,0.0
4,55_12,55,12,On,15257.6,815083.0,41650.9,39763.3,30703.6,4343.60,...,,20088.2,45519.2,121322.0,25589.4,65762.6,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,65043_48,65043,48,Off,10589.6,902434.0,44890.8,38771.5,20188.3,3137.07,...,57323.9,,,96827.3,14019.1,39046.7,7.0,6.0,13.0,0.0
2656,65043_54,65043,54,Off,,,,,,,...,,,,,,,4.0,8.0,11.0,1.0
2657,65043_60,65043,60,Off,,,,,,,...,,,,,,,6.0,6.0,16.0,1.0
2658,65043_72,65043,72,Off,,,,,,,...,,,,,,,3.0,9.0,14.0,1.0


In [6]:
# Summary statistics; the 'count' row shows how many visits have a value for each column.
df.describe()

Unnamed: 0,patient_id,visit_month,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,...,P32754,P60174,Q13449,Q99683,Q99829,Q9UKV8,updrs_1,updrs_2,updrs_3,updrs_4
count,2660.0,2660.0,764.0,1112.0,1100.0,1032.0,1047.0,942.0,1113.0,1050.0,...,778.0,729.0,909.0,933.0,489.0,907.0,2614.0,2613.0,2590.0,1577.0
mean,32625.830827,30.839098,11641.264435,511164.9,26505.529157,27305.934884,17688.295406,3004.990691,126151.780054,50773.474638,...,73994.153085,15290.254403,28216.340372,102498.454662,15393.15728,69924.939658,7.110559,6.74359,19.421236,1.861763
std,18538.098639,25.22637,2817.00353,235735.7,10705.15254,8446.187506,7166.325369,1142.159575,72748.393517,21382.028764,...,31142.479712,5378.119619,13279.201144,21270.573763,5805.729783,25943.573085,5.525955,6.32323,15.000289,3.022112
min,55.0,0.0,873.778,59718.2,591.103,8945.34,2811.12,336.517,10717.4,5806.84,...,12718.2,2710.32,223.218,47187.9,3364.64,3439.81,0.0,0.0,0.0,0.0
25%,16572.0,9.0,9736.8575,349059.0,19941.075,21123.65,12920.05,2189.0875,70560.6,37008.975,...,53235.8,11700.9,18613.5,88157.0,11489.9,54132.6,3.0,1.0,6.0,0.0
50%,29313.0,24.0,11546.4,483442.5,26529.7,26624.0,17399.6,2865.46,116900.0,50375.8,...,68369.5,14906.2,25872.1,100530.0,14420.1,67571.7,6.0,5.0,19.0,0.0
75%,50149.0,48.0,13383.025,648557.2,33222.8,32459.275,22077.05,3593.1475,164947.0,63446.7,...,88072.825,18229.0,35692.2,114190.0,18626.9,83879.3,10.0,10.0,29.0,3.0
max,65043.0,108.0,21361.8,1806980.0,66252.4,65347.9,49695.6,9352.64,538862.0,137369.0,...,202714.0,33801.8,94180.5,196434.0,37061.7,226139.0,33.0,40.0,86.0,20.0


### Validating The DF:

In [7]:
# Testing n random UniProt and NPX from protein_data match DF
n = 1000
results = []
# Draw all n rows up front with a fixed seed so the check is reproducible.
# replace=True keeps the draws independent, as with repeated single-row sampling.
sampled_rows = protein_data.sample(n, replace=True, random_state=0)
for p_id, vm, up, npx in zip(sampled_rows['patient_id'], sampled_rows['visit_month'],
                             sampled_rows['UniProt'], sampled_rows['NPX']):
    row_mask = (df['patient_id'] == p_id) & (df['visit_month'] == vm)
    # .item() requires exactly one matching row and raises otherwise; it
    # replaces float() on a one-element array, which NumPy has deprecated.
    result = float(df.loc[row_mask, up].item())
    if result == npx:
        results.append(True)
print('Percentage Correct:', sum(results)/n)
    

Percentage Correct: 1.0


In [8]:
protein_cols = Get_ALL_Protein_Names(protein_data)
target_cols = Get_Targets(clinical_data)

# Testing n random samples from DF match protein_data
n = 1000
results = []
# Draw all n rows up front with a fixed seed so the check is reproducible.
sampled_rows = df.sample(n, replace=True, random_state=0)
for i in range(n):
    sample = sampled_rows.iloc[[i]]  # one-row DataFrame
    # Proteins actually measured at this visit (non-NaN columns of the row).
    valid_protein_columns = sample[protein_cols].dropna(axis=1)
    temp_df = pd.DataFrame({'UniProt': valid_protein_columns.columns.tolist(),
                            'NPX': valid_protein_columns.iloc[0].tolist()})
    prot_df = protein_data.loc[protein_data['visit_id'] == sample['visit_id'].values[0], ['UniProt', 'NPX']]
    # Sort both sides once by UniProt and renumber so they compare positionally.
    prot_df = prot_df.sort_values('UniProt').reset_index(drop=True)
    temp_df = temp_df.sort_values('UniProt').reset_index(drop=True)
    # A row-count mismatch counts as a failed check; comparing frames of
    # different length with == would raise instead.
    if len(temp_df) == len(prot_df) and (temp_df == prot_df).all().all():
        results.append(True)
print('percentage correct:', sum(results)/n)

percentage correct: 1.0


In [9]:
# Testing the rows where only protein measurements were taken
test_visit_ID = Find_Visit_ID_Only_Protein_Measured(clinical_data, protein_data)

# Loop over visit_id where only protein measurements were taken
for vis_id in test_visit_ID:
    # Retrieve Protein and NPX values from df, drop all NAN columns
    row_df = df[protein_cols].loc[df['visit_id']==vis_id]
    row_df = row_df[sorted(protein_cols)].dropna(axis=1)
    # Retrieve Protein and NPX
    prodat_df = protein_data.loc[protein_data['visit_id']==vis_id][['UniProt', 'NPX']]
    # Check that protein ids match
    print('Visit ID:',vis_id)
    print('Protein_IDs match:', row_df.columns.tolist() == prodat_df['UniProt'].tolist())
    # Check that NPX values match
    print('NPX match:', row_df.values.tolist()[0] == prodat_df['NPX'].tolist())
    # Check that the UPDRs are all NaN
    print('UPDRS are NaN:', df[target_cols].loc[df['visit_id']==vis_id].isna().sum().sum()==4) 
    print('\n')

Visit ID: 23175_3
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 57468_3
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 2660_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 5027_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 5036_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 5178_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 7117_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 7151_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 11686_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 12636_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 13968_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 14035_6
Protein_IDs match: True
NPX match: True
UPDRS are NaN: True


Visit ID: 14450_6
Protein_IDs match: True
NPX 

### Create training data (all visit_months == 0 OR 3)

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, mean_squared_error

In [91]:
def Relabel_By_Percentile(updrs, percentile_25, percentile_50, percentile_75):
    """Map a score to '1q'..'4q' according to which percentile band it falls in.

    Bands are closed on the right: (-inf, p25] -> '1q', (p25, p50] -> '2q',
    (p50, p75] -> '3q', everything else (including NaN) -> '4q'.
    """
    if updrs <= percentile_25:
        return '1q'
    if percentile_25 < updrs <= percentile_50:
        return '2q'
    if percentile_50 < updrs <= percentile_75:
        return '3q'
    return '4q'
    
def Reset_Data(df):
    """Build the month-0 / month-3 training set with quartile-labelled targets.

    Rows of df with visit_month 0 or 3 are selected. Features are every column
    except the targets, 'visit_id' and 'patient_id'. Targets are updrs_1..3
    (updrs_4 is dropped); rows with any of those three missing are removed from
    both X and y. Each remaining target is then replaced by its quartile label
    ('1q'..'4q'), with quartiles computed on the selected rows.

    Note: the target column names are read from the notebook-level
    clinical_data frame.

    Parameters:
        df: combined visit-level DataFrame.

    Returns:
        (X_train, y_train): the feature frame, and a frame with one
        '<target>_qlabel' column per target, sharing the same index.
    """
    target_cols = Get_Targets(clinical_data)

    # train the model using measurements taken from months 0 and 3
    early_visit = df['visit_month'].isin([0, 3])
    X_train = df.loc[early_visit].drop(columns=target_cols + ['visit_id', 'patient_id'])
    y_numeric = df.loc[early_visit, target_cols].drop(columns='updrs_4')

    # Keep only rows where every remaining target is present. axis is passed
    # by keyword: the positional form .any(1) is rejected by pandas >= 2.0.
    complete = y_numeric.notna().all(axis=1)
    X_train = X_train.loc[complete]
    y_numeric = y_numeric.loc[complete]

    # Replace each numeric target with its quartile label. The label columns
    # are built in a separate frame, so no positional slice is needed to
    # discard the numeric ones.
    y_train = pd.DataFrame(index=y_numeric.index)
    for col in y_numeric.columns:
        percentile_25 = y_numeric[col].quantile(0.25)
        percentile_50 = y_numeric[col].quantile(0.50)
        percentile_75 = y_numeric[col].quantile(0.75)
        y_train[col + '_qlabel'] = y_numeric[col].apply(
            lambda x: Relabel_By_Percentile(x, percentile_25, percentile_50, percentile_75))

    return X_train, y_train

In [92]:
# Month-0/3 features and quartile-labelled targets (see Reset_Data).
X_train, y_train = Reset_Data(df)

In [93]:
# Sanity check: count missing values per label column.
y_train.isna().sum()

updrs_1_qlabel    0
updrs_2_qlabel    0
updrs_3_qlabel    0
dtype: int64

### Create preprocessing pipelines

In [82]:
# Ordered quartile labels, as produced by Relabel_By_Percentile; the position
# in this list becomes the ordinal code (0..3).
quartile_order = ['1q', '2q', '3q', '4q']

# Label columns returned by Reset_Data. Defined here explicitly: the name
# y_train_q_cols previously existed only as a local variable inside
# Reset_Data, so this cell failed with NameError on a fresh kernel.
y_train_q_cols = ['updrs_1_qlabel', 'updrs_2_qlabel', 'updrs_3_qlabel']

# Define transforms on numeric types
numeric_transformer_X = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

numeric_transformer_y = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Define transforms on categorical types
categorical_transformer_X = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One category list per label column, all sharing the same ordering.
categorical_transformer_y = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[list(quartile_order) for _ in y_train_q_cols]))])

# Construct ColumnTransformer object for X
preprocessor_X = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_X, protein_cols + ['visit_month']),
        # last patient-info column of clinical_data is the one-hot encoded categorical
        ('cat', categorical_transformer_X, [Get_Patient_Info(clinical_data)[-1]])
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

# Construct ColumnTransformer object for y
preprocessor_y = ColumnTransformer(
    transformers=[
        ('oe', categorical_transformer_y, y_train_q_cols)
    ],
    remainder='passthrough',
    verbose=True,
    verbose_feature_names_out=True)

## Percentile Training

In [84]:
# Show the current targets.
# NOTE(review): the stored output shows ordinal codes under integer column names,
# i.e. y_train after preprocessor_y was applied — so this cell appears to have been
# run after a later transform cell. On a top-to-bottom run it would show the
# '*_qlabel' string columns instead.
y_train

Unnamed: 0,0,1,2
0,3.0,2.0,1.0
1,3.0,2.0,3.0
2,1.0,1.0,2.0
3,2.0,1.0,2.0
4,3.0,2.0,3.0
...,...,...,...
357,3.0,3.0,3.0
358,2.0,0.0,1.0
359,2.0,1.0,1.0
360,0.0,2.0,1.0


In [35]:
# Show the current features.
# NOTE(review): the stored output shows scaled values under integer column names,
# i.e. X_train after preprocessor_X was applied — so this cell appears to have been
# run out of order relative to its position in the notebook.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
0,-0.323951,1.244350e+00,1.779477,2.062533e+00,2.450512,1.242922,7.955933e-01,0.702289,2.399700e+00,0.337233,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,-0.677995,0.0,0.0,1.0
1,0.000000,-6.992733e-16,0.000000,-5.484802e-16,0.000000,0.000000,-5.392192e-16,0.000000,-5.193612e-16,0.000000,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,1.474937,0.0,0.0,1.0
2,0.000000,-6.992733e-16,0.000000,-5.484802e-16,0.000000,0.000000,-5.392192e-16,0.000000,-5.193612e-16,0.000000,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,-0.677995,0.0,0.0,1.0
3,0.000000,-6.992733e-16,0.000000,-5.484802e-16,0.000000,0.000000,-5.392192e-16,0.000000,-5.193612e-16,0.000000,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,1.474937,0.0,0.0,1.0
4,-0.105606,-6.382437e-01,1.181766,-5.484802e-16,-0.127996,-0.281774,-2.979511e-01,-0.764778,-5.813975e-01,-1.222455,...,2.781323e+00,0.999345,0.298794,-0.898524,-1.408331,-8.822437e-01,-0.677995,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,0.000000,-6.992733e-16,0.000000,-5.484802e-16,0.000000,0.000000,-5.392192e-16,0.000000,-5.193612e-16,0.000000,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,1.474937,0.0,0.0,1.0
358,0.000000,-1.041300e+00,-0.589301,-1.527863e+00,0.505504,0.000000,-1.207420e+00,-1.354969,-2.181794e+00,-0.700617,...,7.810684e-16,0.000000,0.000000,-0.372785,0.000000,-3.444664e-01,-0.677995,0.0,0.0,1.0
359,0.000000,-6.992733e-16,0.000000,-5.484802e-16,0.000000,0.000000,-5.392192e-16,0.000000,-5.193612e-16,0.000000,...,7.810684e-16,0.000000,0.000000,0.000000,0.000000,8.256776e-16,1.474937,0.0,0.0,1.0
360,0.903397,2.418806e+00,2.185121,2.384612e+00,0.359744,0.249796,2.436457e+00,4.267988,3.279286e+00,2.030216,...,7.810684e-16,0.000000,2.986288,-0.187460,0.000000,-1.891946e+00,-0.677995,0.0,0.0,1.0


### UPDRS_1

In [110]:
# Reset data
# Rebuild the month-0/3 training set, then impute/scale/encode X and ordinal-encode y.
# NOTE(review): both preprocessors are fit on all selected rows before any
# cross-validation or hold-out split, so imputation/scaling statistics include
# rows later used for validation.
X_train, y_train = Reset_Data(df)
X_train = pd.DataFrame(preprocessor_X.fit_transform(X_train))
y_train = pd.DataFrame(preprocessor_y.fit_transform(y_train))

# Create model to predict the updrs_1 quartile class (4-class target, not a boolean)
clf1 = LogisticRegression(fit_intercept=False)

# GridSearch CV Method
# Stratified 10-fold CV with a fixed seed for reproducible folds
folds = 10
skf = StratifiedKFold(folds, shuffle=True, random_state=1)

# Hyper-parameter grid: solver x L2 penalty x inverse regularisation strength C
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers, penalty=penalty, C=c_values)
# error_score=0 scores a parameter combination whose fit fails as 0 accuracy instead of raising
grid_search = GridSearchCV(estimator=clf1, param_grid=grid, n_jobs=-1, cv=skf, scoring='accuracy', error_score=0)
# y_train[0] is the first encoded label column — presumably updrs_1_qlabel; confirm against y_train_q_cols
grid_result = grid_search.fit(X_train, y_train[0])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

[ColumnTransformer] ........... (1 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ............ (1 of 1) Processing oe, total=   0.0s
Best: 0.309159 using {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [111]:
# Train-Test-Split Method
# Hold out 33% of the rows. The fitting portion goes to new names (X_fit / y_fit)
# so X_train / y_train are left untouched; overwriting them made this cell
# non-idempotent (re-running it split the already-split training set again).
X_fit, X_test, y_fit, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

# Refit with the hyper-parameters selected by the grid search (keys: C, penalty, solver).
# NOTE(review): the grid search saw all rows, including the ones held out here,
# so the test metrics are optimistic with respect to hyper-parameter choice.
model1 = LogisticRegression(fit_intercept=False, **grid_result.best_params_)
model1 = model1.fit(X_fit, y_fit[0])

# 5-fold CV accuracy on the fitting portion only (cross_val_score clones the model)
scores = cross_val_score(model1, X_fit, y_fit[0], cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.30 accuracy with a standard deviation of 0.02


In [112]:
# Predict the class codes for the held-out rows.
y_pred1 = model1.predict(X_test)

In [113]:
# Rows are true classes, columns are predicted classes (label column 0).
confusion_matrix(y_test[0], y_pred1)

array([[16,  7,  3,  6],
       [17,  2,  1,  3],
       [20,  5,  8,  6],
       [13,  5,  4,  4]], dtype=int64)

In [114]:
# MSE computed on the ordinal class codes, i.e. it treats the classes as evenly spaced integers.
mean_squared_error(y_test[0], y_pred1)

2.7916666666666665

### UPDRS_2

In [115]:
# Reset data
# Rebuild the month-0/3 training set, then impute/scale/encode X and ordinal-encode y.
# NOTE(review): both preprocessors are fit on all selected rows before any
# cross-validation or hold-out split, so imputation/scaling statistics include
# rows later used for validation.
X_train, y_train = Reset_Data(df)
X_train = pd.DataFrame(preprocessor_X.fit_transform(X_train))
y_train = pd.DataFrame(preprocessor_y.fit_transform(y_train))

# Train the model to predict the updrs_2 quartile class (4-class target, not a boolean)
clf2 = LogisticRegression(fit_intercept=False)

# GridSearch CV Method
# Stratified 10-fold CV with a fixed seed for reproducible folds
folds = 10
skf = StratifiedKFold(folds, shuffle=True, random_state=1)

# Hyper-parameter grid: solver x L2 penalty x inverse regularisation strength C
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
# error_score=0 scores a parameter combination whose fit fails as 0 accuracy instead of raising
grid_search = GridSearchCV(estimator=clf2, param_grid=grid, n_jobs=-1, cv=skf, scoring='accuracy', error_score=0)
# y_train[1] is the second encoded label column — presumably updrs_2_qlabel; confirm against y_train_q_cols
grid_result = grid_search.fit(X_train, y_train[1])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

[ColumnTransformer] ........... (1 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ............ (1 of 1) Processing oe, total=   0.0s
Best: 0.287462 using {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [116]:
# Train-Test-Split Method
# Hold out 33% of the rows. The fitting portion goes to new names (X_fit / y_fit)
# so X_train / y_train are left untouched; overwriting them made this cell
# non-idempotent (re-running it split the already-split training set again).
X_fit, X_test, y_fit, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

# Refit with the hyper-parameters selected by the grid search (keys: C, penalty, solver).
# NOTE(review): the grid search saw all rows, including the ones held out here,
# so the test metrics are optimistic with respect to hyper-parameter choice.
model2 = LogisticRegression(fit_intercept=False, **grid_result.best_params_)
model2 = model2.fit(X_fit, y_fit[1])

# 5-fold CV accuracy on the fitting portion only (cross_val_score clones the model)
scores = cross_val_score(model2, X_fit, y_fit[1], cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.29 accuracy with a standard deviation of 0.02


In [117]:
# Predict the class codes for the held-out rows.
y_pred2 = model2.predict(X_test)

In [118]:
# Rows are true classes, columns are predicted classes (label column 1).
confusion_matrix(y_test[1], y_pred2)

array([[14, 12,  3,  5],
       [ 3,  7,  1, 20],
       [ 2,  9,  2, 10],
       [ 2, 16,  2, 12]], dtype=int64)

In [119]:
# MSE computed on the ordinal class codes, i.e. it treats the classes as evenly spaced integers.
mean_squared_error(y_test[1], y_pred2)

2.2

### UPDRS_3

In [120]:
# Reset data
# Rebuild the month-0/3 training set, then impute/scale/encode X and ordinal-encode y.
# NOTE(review): both preprocessors are fit on all selected rows before any
# cross-validation or hold-out split, so imputation/scaling statistics include
# rows later used for validation.
X_train, y_train = Reset_Data(df)
X_train = pd.DataFrame(preprocessor_X.fit_transform(X_train))
y_train = pd.DataFrame(preprocessor_y.fit_transform(y_train))

# Train the model to predict the updrs_3 quartile class (4-class target, not a boolean)
clf3 = LogisticRegression(fit_intercept=False)

# GridSearch CV Method
# Stratified 10-fold CV with a fixed seed for reproducible folds
folds = 10
skf = StratifiedKFold(folds, shuffle=True, random_state=1)

# Hyper-parameter grid: solver x L2 penalty x inverse regularisation strength C
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
# error_score=0 scores a parameter combination whose fit fails as 0 accuracy instead of raising
grid_search = GridSearchCV(estimator=clf3, param_grid=grid, n_jobs=-1, cv=skf, scoring='accuracy', error_score=0)
# y_train[2] is the third encoded label column — presumably updrs_3_qlabel; confirm against y_train_q_cols
grid_result = grid_search.fit(X_train, y_train[2])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

[ColumnTransformer] ........... (1 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ............ (1 of 1) Processing oe, total=   0.0s
Best: 0.317492 using {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}


In [121]:
# Train-Test-Split Method
# Hold out 33% of the rows. The fitting portion goes to new names (X_fit / y_fit)
# so X_train / y_train are left untouched; overwriting them made this cell
# non-idempotent (re-running it split the already-split training set again).
X_fit, X_test, y_fit, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

# Refit with the hyper-parameters selected by the grid search (keys: C, penalty, solver).
# NOTE(review): the grid search saw all rows, including the ones held out here,
# so the test metrics are optimistic with respect to hyper-parameter choice.
model3 = LogisticRegression(fit_intercept=False, **grid_result.best_params_)
model3 = model3.fit(X_fit, y_fit[2])

# 5-fold CV accuracy on the fitting portion only (cross_val_score clones the model)
scores = cross_val_score(model3, X_fit, y_fit[2], cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Predict the class codes for the held-out rows.
y_pred3 = model3.predict(X_test)

0.29 accuracy with a standard deviation of 0.04


In [122]:
# Rows are true classes, columns are predicted classes (label column 2).
confusion_matrix(y_test[2], y_pred3)

array([[12,  8, 13,  5],
       [ 3,  5,  2, 11],
       [ 2,  6,  3, 17],
       [ 2, 12,  4, 15]], dtype=int64)

In [123]:
# MSE computed on the ordinal class codes, i.e. it treats the classes as evenly spaced integers.
mean_squared_error(y_test[2], y_pred3)

2.125