## AMP® - Parkinson's Disease Progression Prediction - Baselines

## 1. Setup

In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Root directory of the competition dataset; also holds the competition API module
dataset_directory = Path('/kaggle/input/amp-parkinsons-disease-progression-prediction')
# sys.path entries must be strings: the import system ignores any other type,
# so appending the Path object itself would not make the directory importable
sys.path.append(str(dataset_directory))
import amp_pd_peptide

In [2]:
# Load the clinical training table and report its shape and in-memory size in MB
df_train_clinical_data = pd.read_csv(dataset_directory / 'train_clinical_data.csv')
data_shape = df_train_clinical_data.shape
memory_usage_mb = df_train_clinical_data.memory_usage().sum() / 1024 ** 2
print(f'Train Clinical Data Shape: {data_shape} - Memory Usage: {memory_usage_mb:.2f} MB')

# Distinct patients, and the mean number of non-null visit_month rows per patient
visit_counts = df_train_clinical_data.groupby('patient_id')['visit_month'].count()
patient_count = df_train_clinical_data['patient_id'].nunique()
visit_count_mean = visit_counts.mean()
print(f'Patient Count: {patient_count} - Mean Visit Count: {visit_count_mean:.2f}')

Train Clinical Data Shape: (2615, 8) - Memory Usage: 0.16 MB
Patient Count: 248 - Mean Visit Count: 10.54


In [3]:
# Load the supplemental clinical table and report its shape and in-memory size in MB
df_supplemental_clinical_data = pd.read_csv(dataset_directory / 'supplemental_clinical_data.csv')
data_shape = df_supplemental_clinical_data.shape
memory_usage_mb = df_supplemental_clinical_data.memory_usage().sum() / 1024 ** 2
print(f'Supplemental Clinical Data Shape: {data_shape} - Memory Usage: {memory_usage_mb:.2f} MB')

# Distinct patients, and the mean number of non-null visit_month rows per patient
visit_counts = df_supplemental_clinical_data.groupby('patient_id')['visit_month'].count()
patient_count = df_supplemental_clinical_data['patient_id'].nunique()
visit_count_mean = visit_counts.mean()
print(f'Patient Count: {patient_count} - Mean Visit Count: {visit_count_mean:.2f}')

Supplemental Clinical Data Shape: (2223, 8) - Memory Usage: 0.14 MB
Patient Count: 771 - Mean Visit Count: 2.88


## 2. Evaluation

The problem is predicting `updrs_1`, `updrs_2`, `updrs_3` and `updrs_4` values at a given `visit_month` and potential **6**, **12** and **24** month visits after the current `visit_month`.

Predictions are evaluated with SMAPE + 1: the symmetric mean absolute percentage error (SMAPE) computed after adding 1 to both the ground-truth and the predicted values.

In [4]:
def symmetric_mean_absolute_percentage_error(y_true, y_pred):

    """
    Calculate symmetric mean absolute percentage error from given ground-truth and predictions

    Each sample contributes 2 * |y_pred - y_true| / (|y_true| + |y_pred|).
    A sample whose ground-truth and prediction are both 0 contributes 0
    instead of the undefined 0 / 0.

    Parameters
    ----------
    y_true: array-like of shape (n_samples)
        Array of ground-truth values

    y_pred: array-like of shape (n_samples)
        Array of prediction values

    Returns
    -------
    smape: float
        Symmetric mean absolute percentage error

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty
    """

    # Accept any array-like (lists, tuples, Series) as the docstring promises
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining terms keep
    # the 0 they were initialised with, so a 0 / 0 pair cannot turn the sum into NaN
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0
    )

    smape = 100 / len(y_true) * np.sum(ratios)

    return smape


def score(df, target_columns, prediction_columns):

    """
    Calculate SMAPE + 1 over all given target/prediction column pairs at once

    For every pair, only rows whose target is not missing are kept. Kept
    targets and predictions are shifted by +1, stacked into two flat arrays
    and scored with a single SMAPE call.

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe that holds the target and prediction columns

    target_columns: list of str
        Target column names

    prediction_columns: list of str
        Prediction column names, paired with target_columns by position

    Returns
    -------
    score: float
        Symmetric mean absolute percentage error of the shifted values
    """

    # Rows with an observed target, one boolean mask per target column
    observed_masks = [df[target_column].notna() for target_column in target_columns]

    shifted_targets = [
        df.loc[observed, target_column].values + 1
        for observed, target_column in zip(observed_masks, target_columns)
    ]
    shifted_predictions = [
        df.loc[observed, prediction_column].values + 1
        for observed, prediction_column in zip(observed_masks, prediction_columns)
    ]

    return symmetric_mean_absolute_percentage_error(
        y_true=np.concatenate(shifted_targets),
        y_pred=np.concatenate(shifted_predictions)
    )


## 3. Baselines

Median values at each visit month are a strong baseline, since this kind of forecasting problem is hard. `visit_month` median values of `updrs_1` are calculated on clinical data, and median values of `updrs_2`, `updrs_3` and `updrs_4` are calculated on clinical and supplemental data.

When clinical and supplemental data are concatenated, an extra `visit_month` value (**5**) that comes only from the supplemental data appears in the baselines table, so it is dropped. Unseen `visit_month` values also exist in the hidden test set; those are handled at submission time (see Section 5).

Default median values of visit months have a flaw: median updrs values might decrease as `visit_month` increases, since the dataset is small and noisy. A baseline like that might overfit to the training set and generalize poorly. Replacing median values with expanding window max values improves (lowers) both validation and leaderboard scores.

In [5]:
target_columns_clinical_data = ['updrs_1']
target_columns_clinical_and_supplemental_data = ['updrs_2', 'updrs_3', 'updrs_4']

# updrs_1 medians per visit_month are computed on the clinical data only
target_visit_month_medians_clinical_data = (
    df_train_clinical_data
    .groupby('visit_month')[target_columns_clinical_data]
    .median()
)

# updrs_2, updrs_3 and updrs_4 medians per visit_month are computed on
# clinical and supplemental rows stacked together
df_clinical_and_supplemental_data = pd.concat(
    [df_train_clinical_data, df_supplemental_clinical_data],
    axis=0
)
target_visit_month_medians_clinical_and_supplemental_data = (
    df_clinical_and_supplemental_data
    .groupby('visit_month')[target_columns_clinical_and_supplemental_data]
    .median()
)

# Drop 5th month visit that is coming from the supplemental clinical data
target_visit_month_medians_clinical_and_supplemental_data = (
    target_visit_month_medians_clinical_and_supplemental_data.drop(index=5)
)

# Put the medians of all four targets side by side, aligned on visit_month
target_visit_month_medians = pd.concat(
    [
        target_visit_month_medians_clinical_data,
        target_visit_month_medians_clinical_and_supplemental_data
    ],
    axis=1
)

# Replace each updrs median with its expanding window max, i.e. the largest
# median seen in the rows up to and including the current one
target_visit_month_medians = target_visit_month_medians.expanding(min_periods=1).max()
target_visit_month_medians

Unnamed: 0_level_0,updrs_1,updrs_2,updrs_3,updrs_4
visit_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.5,4.0,18.0,0.0
3,4.5,5.0,19.0,0.0
6,6.0,6.0,21.0,0.0
9,6.0,6.0,21.0,0.0
12,6.0,6.0,21.0,0.0
18,6.0,6.0,21.0,0.0
24,6.0,6.0,21.0,0.0
30,7.0,6.0,22.0,0.0
36,7.0,6.0,22.0,0.0
42,7.0,7.0,23.0,0.0


## 4. Validation

* Medians of `visit_month` groups calculated on train clinical data
  * `updrs_1` Validation: **55.63**
  * `updrs_2` Validation: **70.51**
  * `updrs_3` Validation: **70.94**
  * `updrs_4` Validation: **48.00**
  * Global Validation: **62.71**
  * Public LB Score: **57.8** (unseen visit months filled with global target medians)
  
* Medians of `visit_month` groups calculated on train + supplemental clinical data
  * `updrs_1` Validation: **56.16**
  * `updrs_2` Validation: **70.53**
  * `updrs_3` Validation: **70.16**
  * `updrs_4` Validation: **48.00**
  * Global Validation: **62.65**
  * Public LB Score: **57.2** (unseen visit months filled with global target medians)
  
* Medians of `visit_month` groups calculated on train + supplemental clinical data replaced with expanding window max
  * `updrs_1` Validation: **56.29**
  * `updrs_2` Validation: **70.79**
  * `updrs_3` Validation: **69.19**
  * `updrs_4` Validation: **48.01**
  * Global Validation: **62.49**
  * Public LB Score: **56.4** (unseen visit months filled with global target medians)
  
* Medians of `visit_month` groups calculated on train for `updrs_1` and train + supplemental clinical data for `updrs_2`, `updrs_3`, `updrs_4`, replaced with expanding window max
  * `updrs_1` Validation: **55.74**
  * `updrs_2` Validation: **70.79**
  * `updrs_3` Validation: **69.19**
  * `updrs_4` Validation: **48.01**
  * Global Validation: **62.34**
  * Public LB Score: **56.3** (unseen visit months filled with closest visit month median)

In [6]:
target_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
prediction_columns = [f'{target_column}_predictions' for target_column in target_columns]

# Score every target on its own, using only rows where that target is observed
for target_column, prediction_column in zip(target_columns, prediction_columns):

    target_idx = df_train_clinical_data[target_column].notna()
    df_train = df_train_clinical_data.loc[target_idx]
    print(f'Target: {target_column} Dataset Shape: {df_train.shape}')

    # Baseline prediction is the table value of the row's visit_month
    visit_month_baselines = target_visit_month_medians[target_column]
    df_train_clinical_data.loc[target_idx, prediction_column] = df_train['visit_month'].map(visit_month_baselines)

    val_score = score(
        df=df_train_clinical_data.loc[target_idx],
        target_columns=[target_column],
        prediction_columns=[prediction_column]
    )
    print(f'Validation SMAPE: {val_score:.4f}\n')

# Score all targets together on the full dataframe
global_val_score = score(
    df=df_train_clinical_data,
    target_columns=target_columns,
    prediction_columns=prediction_columns
)
print(f'Global Validation SMAPE: {global_val_score:.4f}')

Target: updrs_1 Dataset Shape: (2614, 8)
Validation SMAPE: 55.7414

Target: updrs_2 Dataset Shape: (2613, 9)
Validation SMAPE: 70.7972

Target: updrs_3 Dataset Shape: (2590, 10)
Validation SMAPE: 69.1957

Target: updrs_4 Dataset Shape: (1577, 11)
Validation SMAPE: 48.0103

Global Validation SMAPE: 62.3409


## 5. Submission

Since hidden test has unseen `visit_month` values, there will be some missing predictions after mapping the baselines. In order to deal with those cases, closest `visit_month` baseline values are used for filling missing predictions after the map operation.

In [7]:
env = amp_pd_peptide.make_env()
test_iterator = env.iter_test()

# Visit months covered by the baselines table; hoisted because the table does not change between batches
baseline_visit_months = target_visit_month_medians.index.to_numpy()

for (df_test, df_test_peptides, df_test_proteins, df_submission) in test_iterator:

    # Split prediction_id once on '_' and reuse the pieces: positions 0, 1, 3 and 5
    # are read as patient id, current visit month, updrs number and month offset
    # NOTE(review): presumably ids look like {patient}_{month}_updrs_{n}_plus_{offset}_months - confirm against the API
    prediction_id_parts = df_submission['prediction_id'].str.split('_', expand=True)
    df_submission['patient_id'] = prediction_id_parts[0].astype(int)
    df_submission['current_visit_month'] = prediction_id_parts[1].astype(int)
    df_submission['visit_month_offset'] = prediction_id_parts[5].astype(int)
    df_submission['prediction_visit_month'] = df_submission['current_visit_month'] + df_submission['visit_month_offset']
    df_submission['updrs'] = prediction_id_parts[3].astype(int)

    for updrs in range(1, 5):
        # Select the baseline by column name so the lookup does not depend on column order
        updrs_baselines = target_visit_month_medians[f'updrs_{updrs}']

        updrs_idx = df_submission['updrs'] == updrs
        df_submission.loc[updrs_idx, 'rating'] = df_submission.loc[updrs_idx, 'prediction_visit_month'].map(updrs_baselines)

        # Visit months that are absent from the baselines table are left missing by the map
        missing_idx = df_submission['rating'].isnull()
        missing_visit_months = df_submission.loc[updrs_idx & missing_idx, 'prediction_visit_month']
        for idx, prediction_visit_month in missing_visit_months.items():
            # Find the closest visit_month value from the baselines table
            closest_position = np.argmin(np.abs(baseline_visit_months - prediction_visit_month))
            # Write the closest visit_month value to the unseen visit_month
            df_submission.loc[idx, 'rating'] = updrs_baselines.iloc[closest_position]

    # Only the id and the rating are passed on; the helper columns are dropped
    df_submission = df_submission.loc[:, ['prediction_id', 'rating']]
    env.predict(df_submission)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
