# Pre-processing

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Read TADPOLE D1 & D2
original_df = pd.read_csv("../tadpole_challenge/TADPOLE_D1_D2.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Remove left-truncated patients

In [3]:
# Compute percentage of left-truncated patients
left_truncated_events = original_df[original_df['DX_bl'] == 'AD']

# Compute percentage of left-truncated patients
print(f"Percentage of left-truncated patients: {len(pd.unique(left_truncated_events['PTID'])) / len(pd.unique(original_df['PTID'])) * 100:.2f}%")

Percentage of left-truncated patients: 19.69%


In [4]:
# Remove left_truncated patients
non_truncated_events = original_df[original_df['DX_bl'] != 'AD']

print(f"Percentage of non-truncated patients: {len(pd.unique(non_truncated_events['PTID'])) / len(pd.unique(original_df['PTID'])) * 100:.2f}%")

Percentage of non-truncated patients: 80.31%


## Select desired data columns

In [5]:
# TODO check Intracranial
desired_columns = ['PTID', 'DX', 'AGE', 'APOE4', 'PTEDUCAT', 'PTETHCAT', 'PTGENDER', 'PTMARRY', 
                   'PTRACCAT', 'Entorhinal', 'Fusiform', 'Hippocampus', 'ICV', 'MidTemp', 'Ventricles', 
                   'WholeBrain', 'ADAS11', 'ADAS13', 'CDRSB', 'MMSE', 'RAVLT_forgetting', 
                   'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_perc_forgetting', 'Month']

study_df = non_truncated_events[desired_columns]
study_df = study_df.copy()
study_df.columns = desired_columns

study_df.head()

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,PTETHCAT,PTGENDER,PTMARRY,PTRACCAT,Entorhinal,...,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
0,011_S_0002,NL,74.3,0.0,16,Not Hisp/Latino,Male,Married,White,4177.0,...,1229740.0,10.67,18.67,0.0,28.0,6.0,44.0,4.0,54.5455,0
5,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3983.0,...,1154980.0,14.33,21.33,1.0,27.0,4.0,37.0,7.0,36.3636,0
6,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3711.0,...,1116280.0,17.33,25.33,0.5,28.0,1.0,33.0,7.0,11.1111,6
7,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3519.0,...,1117390.0,15.0,22.0,1.0,26.0,1.0,37.0,4.0,12.5,12
8,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3764.0,...,1095210.0,20.33,28.33,1.0,27.0,2.0,44.0,8.0,16.6667,18


In [6]:
# Make label binary, mark all Dementia instances as positive
study_df['DX'] = study_df['DX'].replace('Dementia', 1)
study_df['DX'] = study_df['DX'].replace(['MCI', 'NL', 'MCI to Dementia', 'NL to MCI', 'MCI to NL', 'Dementia to MCI', 'NL to Dementia'], 0)

In [7]:
# Compute some statistics for verification with data from the paper
ad_patients = study_df[study_df['DX'] == 1]
nr_ad_patients = len(ad_patients['PTID'].unique())
tot_patients = len(study_df['PTID'].unique())

print(f'Percentage of patients with a stable AD diagnosis: {nr_ad_patients / tot_patients * 100:.2f}%')
print(f'Effective percentage of measurements with positive event label: {nr_ad_patients / len(study_df) * 100:.2f}%')

# Employ label forwarding: mark all measures after the stable diagnosis of AD as positive
for pt_id in ad_patients['PTID'].unique():
    
    # Get events for this patient
    events = study_df[study_df['PTID'] == pt_id]
    
    # Get index of first stable diagnosis of AD
    ad_index = events.index[events['DX'] == 1][0]
    
    # Get indexes of measurements after first stable diagnosis
    forwarding_indexes = events.index[events.index > ad_index]
    
    # Employ label-forwarding
    study_df.loc[forwarding_indexes, 'DX'] = study_df.loc[forwarding_indexes, 'DX'].fillna(1)  

Percentage of patients with a stable AD diagnosis: 17.13%
Effective percentage of measurements with positive event label: 2.14%


In [8]:
study_df['DX'].fillna(0, inplace=True)
study_df['DX'].value_counts()

0.0    10096
1.0     1077
Name: DX, dtype: int64

In [9]:
# Sort dataframe by PTID, Month
study_df.reset_index(drop=True, inplace=True)
study_df = study_df.sort_values(['PTID', 'Month'])

In [10]:
study_df.describe()

Unnamed: 0,DX,AGE,APOE4,PTEDUCAT,Entorhinal,Fusiform,Hippocampus,ICV,MidTemp,Ventricles,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
count,11173.0,11173.0,11165.0,11173.0,5656.0,5656.0,5900.0,6839.0,5656.0,6425.0,6624.0,7809.0,7754.0,7853.0,7825.0,7735.0,7754.0,7754.0,7698.0,11173.0
mean,0.096393,73.634046,0.487595,16.096483,3543.82302,17434.179632,6844.94322,1533900.0,19542.532001,40509.616031,1019044.0,9.844216,15.397016,1.693429,27.296102,4.235294,36.413464,4.357235,55.31198,28.071333
std,0.295143,6.858434,0.632026,2.793559,775.613086,2716.95516,1180.228456,163719.9,2969.753463,22496.709386,109846.3,7.298982,10.198531,2.45282,3.343853,2.604619,13.097253,2.776476,38.236892,27.020656
min,0.0,54.4,0.0,4.0,1041.0,9019.0,2219.0,291.884,8361.0,5650.0,649091.0,0.0,0.0,0.0,0.0,-12.0,0.0,-5.0,-500.0,0.0
25%,0.0,69.6,0.0,14.0,3036.75,15636.75,6042.75,1421775.0,17662.5,24046.0,943727.8,5.0,8.0,0.0,26.0,3.0,27.0,2.0,26.6667,6.0
50%,0.0,73.6,0.0,16.0,3578.0,17414.5,6935.0,1524440.0,19485.0,35967.0,1018190.0,8.0,13.0,1.0,28.0,4.0,35.0,4.0,54.5455,18.0
75%,0.0,78.4,1.0,18.0,4080.0,19178.0,7671.25,1639385.0,21602.25,51126.0,1092052.0,12.67,20.33,2.5,30.0,6.0,45.0,6.0,100.0,42.0
max,1.0,91.4,2.0,20.0,6711.0,29950.0,11207.0,2110290.0,32189.0,162729.0,1486040.0,70.0,85.0,18.0,30.0,15.0,75.0,14.0,100.0,126.0


## Data imputation

In [11]:
# Compute missingness stats per column
def count_missing(count_df):
    for column in count_df.columns:
        missing = count_df[column].isna().sum()
        print(f'Column name: {column} - missing values: {missing} - {missing / len(study_df) * 100:.1f}%')

In [12]:
# Check missing values per column
test_df = study_df.copy()

count_missing(test_df)

Column name: PTID - missing values: 0 - 0.0%
Column name: DX - missing values: 0 - 0.0%
Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 5517 - 49.4%
Column name: Fusiform - missing values: 5517 - 49.4%
Column name: Hippocampus - missing values: 5273 - 47.2%
Column name: ICV - missing values: 4334 - 38.8%
Column name: MidTemp - missing values: 5517 - 49.4%
Column name: Ventricles - missing values: 4748 - 42.5%
Column name: WholeBrain - missing values: 4549 - 40.7%
Column name: ADAS11 - missing values: 3364 - 30.1%
Column name: ADAS13 - missing values: 3419 - 30.6%
Column name: CDRSB - missing values: 3320 - 29.7%
Column name: MMSE - missing values: 3348 - 30.0%
Col

As can be seen, 8 records are missing APOE4 information. Upon manual inspection, it turned out that for these patients only a single measurement was available.

In [23]:
test_df[test_df['PTID'] == '941_S_4376'].fillna(method='ffill')

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,PTETHCAT,PTGENDER,PTMARRY,PTRACCAT,Entorhinal,...,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
8718,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,955752.0,7.0,10.0,0.0,29.0,5.0,53.0,10.0,38.4615,0
8719,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,965388.0,7.0,10.0,0.0,29.0,5.0,53.0,10.0,38.4615,3
8720,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,962661.0,5.0,10.0,0.0,27.0,5.0,39.0,6.0,41.6667,6
8721,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,960885.0,4.0,6.0,0.0,27.0,5.0,41.0,6.0,41.6667,12
8722,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,960885.0,4.0,6.0,0.0,27.0,5.0,41.0,6.0,41.6667,18
9653,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,960885.0,4.0,7.0,0.0,27.0,12.0,41.0,6.0,100.0,24
11025,941_S_4376,0.0,76.5,0.0,16,Not Hisp/Latino,Female,Married,White,,...,960885.0,7.0,9.0,0.0,28.0,2.0,46.0,2.0,20.0,54


In [13]:
# Use zero-order interpolation on the data (execute per patient)
for pt in tqdm(test_df['PTID'].unique()):
    events = test_df.loc[test_df['PTID'] == pt]
    test_df.loc[events.index, events.columns[2:-1]] = events[events.columns[2:-1]].fillna(method='ffill')

100%|██████████| 1395/1395 [00:13<00:00, 104.51it/s]


In [14]:
count_missing(test_df)

Column name: PTID - missing values: 0 - 0.0%
Column name: DX - missing values: 0 - 0.0%
Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 581 - 5.2%
Column name: Fusiform - missing values: 581 - 5.2%
Column name: Hippocampus - missing values: 486 - 4.3%
Column name: ICV - missing values: 17 - 0.2%
Column name: MidTemp - missing values: 581 - 5.2%
Column name: Ventricles - missing values: 123 - 1.1%
Column name: WholeBrain - missing values: 54 - 0.5%
Column name: ADAS11 - missing values: 4 - 0.0%
Column name: ADAS13 - missing values: 7 - 0.1%
Column name: CDRSB - missing values: 0 - 0.0%
Column name: MMSE - missing values: 0 - 0.0%
Column name: RAVLT_forgetting - mis

In [28]:
test_df['APOE4'].value_counts()

0.0    6556
1.0    3774
2.0     835
Name: APOE4, dtype: int64

In [30]:
test = pd.Series([0, 1, 3, np.nan])
test.interpolate()

0    0.0
1    1.0
2    3.0
3    3.0
dtype: float64

In [19]:
# Fill remaining numerical column nan values with mean of all measurements
for column in test_df.columns[np.r_[2, 4:5, 9:24]]:
    test_df[column].fillna(test_df[column].mean(), inplace=True)

In [20]:
count_missing(test_df)

Column name: PTID - missing values: 0 - 0.0%
Column name: DX - missing values: 0 - 0.0%
Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 0 - 0.0%
Column name: Fusiform - missing values: 0 - 0.0%
Column name: Hippocampus - missing values: 0 - 0.0%
Column name: ICV - missing values: 0 - 0.0%
Column name: MidTemp - missing values: 0 - 0.0%
Column name: Ventricles - missing values: 0 - 0.0%
Column name: WholeBrain - missing values: 0 - 0.0%
Column name: ADAS11 - missing values: 0 - 0.0%
Column name: ADAS13 - missing values: 0 - 0.0%
Column name: CDRSB - missing values: 0 - 0.0%
Column name: MMSE - missing values: 0 - 0.0%
Column name: RAVLT_forgetting - missing values:

## Normalise numerical features and encode categorical features

In [16]:
# Normalize numerical features
for column in test_df.columns[np.r_[2:5, 9:24]]:
    test_df[column] = (test_df[column] - test_df[column].mean()) / test_df[column].std()

In [18]:
# One-hot encode categorical features
for column in test_df.columns[5:9]:
    test_df = pd.concat([test_df, pd.get_dummies(test_df[column], prefix=column)], axis=1)

# Drop categorical columns
test_df = test_df.drop(test_df.columns[5:9], axis=1)

In [19]:
test_df

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,Entorhinal,Fusiform,Hippocampus,ICV,MidTemp,...,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,PTRACCAT_Am Indian/Alaskan,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White
629,002_S_0295,0.0,1.628062,0.810734,0.681395,0.091423,-0.342252,0.186539,0.622589,0.090772,...,0,0,0,0,0,0,0,0,0,1
630,002_S_0295,0.0,1.628062,0.810734,0.681395,0.146811,0.202485,0.040799,0.692865,0.089423,...,0,0,0,0,0,0,0,0,0,1
631,002_S_0295,0.0,1.628062,0.810734,0.681395,0.030882,-0.038598,0.039105,0.730952,0.120441,...,0,0,0,0,0,0,0,0,0,1
5723,002_S_0295,0.0,1.628062,0.810734,0.681395,,,,,,...,0,0,0,0,0,0,0,0,0,1
632,002_S_0295,0.0,1.628062,0.810734,0.681395,0.395414,0.245917,-0.007498,0.724932,-0.098030,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10355,941_S_5124,0.0,0.447034,-0.771480,1.397328,,,,,,...,0,0,0,0,0,0,0,0,0,1
4298,941_S_5193,0.0,-0.165351,-0.771480,-0.034538,0.792148,-0.475492,0.322958,-0.414906,-0.626001,...,0,0,0,0,0,0,0,0,0,1
4524,941_S_5193,0.0,-0.165351,-0.771480,-0.034538,0.391550,-0.760007,-0.049017,-0.413739,-0.789179,...,0,0,0,0,0,0,0,0,0,1
9318,941_S_5193,0.0,-0.165351,-0.771480,-0.034538,,,,,,...,0,0,0,0,0,0,0,0,0,1
