In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# classifiers we will use
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import f1_score, accuracy_score


import scipy

In [2]:
# Load the competition datasets into Pandas DataFrame
proteins = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
peptides = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
clinical = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
supplemental = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv")

In [3]:
# drop the "medication status" column (due to over 50% NaN values), keep a copy of the original
# for later access. 
clinical_copy = clinical.copy()

clinical.drop('upd23b_clinical_state_on_medication', axis=1, inplace=True)

In [4]:
# and use interpolate() on the rest of the dataset.

clinical['updrs_1'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_2'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_3'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_4'].interpolate(method='linear', limit_direction='both', inplace=True)

In [5]:
clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,0.0
1,55_3,55,3,10.0,7.0,25.0,0.0
2,55_6,55,6,8.0,10.0,34.0,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0


In [6]:
proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [7]:
peptides.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [8]:
protein_peptides = pd.merge(proteins, peptides, on=['visit_id', 'UniProt'], how='left')

In [9]:
protein_peptides.drop('visit_month_y', inplace=True, axis=1)
protein_peptides.drop('patient_id_y', inplace=True, axis=1)
protein_peptides

Unnamed: 0,visit_id,visit_month_x,patient_id_x,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,369437.0,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,105830.0,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,21257.6,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,17953.1,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


In [10]:
protein_peptides.rename(columns={
    'visit_month_x':'visit_month',
    'patient_id_x':'patient_id'
}, inplace=True)

In [11]:
protein_peptides

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,369437.0,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,105830.0,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,21257.6,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,17953.1,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


In [12]:
data = pd.merge(clinical, protein_peptides, on=['visit_id', 'visit_month', 'patient_id'], how='left')

In [13]:
data

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,55,0,10.0,6.0,15.0,0.0,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,55,0,10.0,6.0,15.0,0.0,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,55,0,10.0,6.0,15.0,0.0,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,55,0,10.0,6.0,15.0,0.0,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,55,0,10.0,6.0,15.0,0.0,O00533,732430.0,SMEQNGPGLEYR,30838.7
...,...,...,...,...,...,...,...,...,...,...,...
943286,65043_48,65043,48,7.0,6.0,13.0,0.0,Q9Y6R7,39535.0,GATTSPGVYELSSR,28410.9
943287,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,
943288,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,
943289,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,


In [14]:
data.dropna(subset=['UniProt'], inplace=True, axis=0)

In [15]:
data.isna().sum()

visit_id            0
patient_id          0
visit_month         0
updrs_1             0
updrs_2             0
updrs_3             0
updrs_4             0
UniProt             0
NPX                 0
Peptide             0
PeptideAbundance    0
dtype: int64

In [22]:
data.drop(['visit_id', "patient_id", 'visit_month'], inplace=True, axis=1)

In [23]:
data

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,UniProt,NPX,Peptide,PeptideAbundance
0,10.0,6.0,15.0,0.0,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,10.0,6.0,15.0,0.0,O00533,732430.0,GNPEPTFSWTK,102060.0
2,10.0,6.0,15.0,0.0,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,10.0,6.0,15.0,0.0,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,10.0,6.0,15.0,0.0,O00533,732430.0,SMEQNGPGLEYR,30838.7
...,...,...,...,...,...,...,...,...
943282,7.0,6.0,13.0,0.0,Q9UHG2,320821.0,ILAGSADSEGVAAPR,185566.0
943283,7.0,6.0,13.0,0.0,Q9UKV8,39046.7,SGNIPAGTTVDTK,39046.7
943284,7.0,6.0,13.0,0.0,Q9Y646,20198.8,LALLVDTVGPR,20198.8
943285,7.0,6.0,13.0,0.0,Q9Y6R7,39535.0,AGC(UniMod_4)VAESTAVC(UniMod_4)R,11124.1


In [24]:
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

In [29]:
X = data.drop(targets, axis=1)
y = data['updrs_1']

In [30]:
rfr = RandomForestRegressor()

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8) 
cv = StratifiedKFold(n_splits=5)

In [32]:
rfc_score = cross_val_score(estimator=rfr, X=X_train, y=y_train, cv=cv, scoring='accuracy')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\ensemble\_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'P02768'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\ensemble\_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\13392\miniconda3\envs\cn_ml_course\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Q92823'
