# Pre-processing

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Settings

apply_label_forwarding = True
apply_oversampling = False

In [3]:
# Read TADPOLE D1 & D2
original_df = pd.read_csv("../tadpole_challenge/TADPOLE_D1_D2.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Remove left-truncated patients

In [4]:
# Compute percentage of left-truncated patients
left_truncated_events = original_df[original_df['DX_bl'] == 'AD']

# Compute percentage of left-truncated patients
print(f"Percentage of left-truncated patients: {len(pd.unique(left_truncated_events['PTID'])) / len(pd.unique(original_df['PTID'])) * 100:.2f}%")

Percentage of left-truncated patients: 19.69%


In [5]:
# Remove left_truncated patients
non_truncated_events = original_df[original_df['DX_bl'] != 'AD']

print(f"Percentage of non-truncated patients: {len(pd.unique(non_truncated_events['PTID'])) / len(pd.unique(original_df['PTID'])) * 100:.2f}%")

Percentage of non-truncated patients: 80.31%


## Select desired data columns

In [6]:
desired_columns = ['PTID', 'DX', 'AGE', 'APOE4', 'PTEDUCAT', 'PTETHCAT', 'PTGENDER', 'PTMARRY', 
                   'PTRACCAT', 'Entorhinal', 'Fusiform', 'Hippocampus', 'ICV', 'MidTemp', 'Ventricles', 
                   'WholeBrain', 'ADAS11', 'ADAS13', 'CDRSB', 'MMSE', 'RAVLT_forgetting', 
                   'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_perc_forgetting', 'Month']

study_df = non_truncated_events[desired_columns]
study_df = study_df.copy()
study_df.columns = desired_columns

study_df.head()

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,PTETHCAT,PTGENDER,PTMARRY,PTRACCAT,Entorhinal,...,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
0,011_S_0002,NL,74.3,0.0,16,Not Hisp/Latino,Male,Married,White,4177.0,...,1229740.0,10.67,18.67,0.0,28.0,6.0,44.0,4.0,54.5455,0
5,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3983.0,...,1154980.0,14.33,21.33,1.0,27.0,4.0,37.0,7.0,36.3636,0
6,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3711.0,...,1116280.0,17.33,25.33,0.5,28.0,1.0,33.0,7.0,11.1111,6
7,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3519.0,...,1117390.0,15.0,22.0,1.0,26.0,1.0,37.0,4.0,12.5,12
8,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3764.0,...,1095210.0,20.33,28.33,1.0,27.0,2.0,44.0,8.0,16.6667,18


In [7]:
# Make label binary, mark all Dementia instances as positive
study_df['DX'] = study_df['DX'].replace('Dementia', 1)
study_df['DX'] = study_df['DX'].replace(['MCI', 'NL', 'MCI to Dementia', 'NL to MCI', 'MCI to NL', 'Dementia to MCI', 'NL to Dementia'], 0)

In [8]:
# Compute some statistics for verification with data from the paper
ad_patients = study_df[study_df['DX'] == 1]
nr_ad_patients = len(ad_patients['PTID'].unique())
tot_patients = len(study_df['PTID'].unique())

print(f'Percentage of patients with a stable AD diagnosis: {nr_ad_patients / tot_patients * 100:.2f}%')
print(f'Effective percentage of measurements with positive event label: {nr_ad_patients / len(study_df) * 100:.2f}%')

Percentage of patients with a stable AD diagnosis: 17.13%
Effective percentage of measurements with positive event label: 2.14%


### Label forwarding

In [9]:
# Employ label forwarding: mark all measures after the stable diagnosis of AD as positive

if apply_label_forwarding:
    for pt_id in ad_patients['PTID'].unique():

        # Get events for this patient
        events = study_df[study_df['PTID'] == pt_id]

        # Get index of first stable diagnosis of AD
        ad_index = events.index[events['DX'] == 1][0]

        # Get indexes of measurements after first stable diagnosis
        forwarding_indexes = events.index[events.index > ad_index]

        # Employ label-forwarding
        study_df.loc[forwarding_indexes, 'DX'] = study_df.loc[forwarding_indexes, 'DX'].fillna(1)  

In [10]:
study_df['DX'].fillna(0, inplace=True)
study_df['DX'].value_counts()

0.0    10096
1.0     1077
Name: DX, dtype: int64

In [11]:
# Sort dataframe by PTID, Month
study_df.reset_index(drop=True, inplace=True)
study_df = study_df.sort_values(['PTID', 'Month'])

In [12]:
study_df.describe()

Unnamed: 0,DX,AGE,APOE4,PTEDUCAT,Entorhinal,Fusiform,Hippocampus,ICV,MidTemp,Ventricles,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
count,11173.0,11173.0,11165.0,11173.0,5656.0,5656.0,5900.0,6839.0,5656.0,6425.0,6624.0,7809.0,7754.0,7853.0,7825.0,7735.0,7754.0,7754.0,7698.0,11173.0
mean,0.096393,73.634046,0.487595,16.096483,3543.82302,17434.179632,6844.94322,1533900.0,19542.532001,40509.616031,1019044.0,9.844216,15.397016,1.693429,27.296102,4.235294,36.413464,4.357235,55.31198,28.071333
std,0.295143,6.858434,0.632026,2.793559,775.613086,2716.95516,1180.228456,163719.9,2969.753463,22496.709386,109846.3,7.298982,10.198531,2.45282,3.343853,2.604619,13.097253,2.776476,38.236892,27.020656
min,0.0,54.4,0.0,4.0,1041.0,9019.0,2219.0,291.884,8361.0,5650.0,649091.0,0.0,0.0,0.0,0.0,-12.0,0.0,-5.0,-500.0,0.0
25%,0.0,69.6,0.0,14.0,3036.75,15636.75,6042.75,1421775.0,17662.5,24046.0,943727.8,5.0,8.0,0.0,26.0,3.0,27.0,2.0,26.6667,6.0
50%,0.0,73.6,0.0,16.0,3578.0,17414.5,6935.0,1524440.0,19485.0,35967.0,1018190.0,8.0,13.0,1.0,28.0,4.0,35.0,4.0,54.5455,18.0
75%,0.0,78.4,1.0,18.0,4080.0,19178.0,7671.25,1639385.0,21602.25,51126.0,1092052.0,12.67,20.33,2.5,30.0,6.0,45.0,6.0,100.0,42.0
max,1.0,91.4,2.0,20.0,6711.0,29950.0,11207.0,2110290.0,32189.0,162729.0,1486040.0,70.0,85.0,18.0,30.0,15.0,75.0,14.0,100.0,126.0


In [13]:
meta_data = study_df.iloc[:, 0:2]
months = study_df.iloc[:, -1]
feature_set = study_df.iloc[:, 2:-1]

## Missing values

In [14]:
# Compute missingness stats per column
def count_missing(count_df):
    for column in count_df.columns:
        missing = count_df[column].isna().sum()
        print(f'Column name: {column} - missing values: {missing} - {missing / len(study_df) * 100:.1f}%')

In [15]:
# Make all missing values (-1 & -4 to NaN) equal
# -1 == confirmed missing at point of data entry
# -4 == passively’ missing or not applicable
for column in feature_set.columns:
    feature_set[column] = feature_set[column].replace([-4, -1], np.nan)

### Masking

In [16]:
missing_masks = feature_set.copy()

# Replace all ones with zeros (1 means missing in the masked feature set)
missing_masks = missing_masks.replace(1, 0)

# Set all missing values to 1, present values to 0
missing_masks = missing_masks.isnull().astype('int')

# Check if mask is correct
print(f'Correct values set in mask: {(missing_masks == 1).equals(feature_set.isna())}' )

Correct values set in mask: True


In [17]:
# One-hot encode categorical features
ohe_features = None

for column in study_df.columns[5:9]:
    encoded_feature = pd.get_dummies(study_df[column], prefix=column, dummy_na=True)
    
    # Encode missing values as all ones
    encoded_feature.loc[encoded_feature[f'{column}_nan'] == 1] = np.ones(encoded_feature.shape[1])
    
    # Encode present values as all zeros
    encoded_feature.loc[encoded_feature[f'{column}_nan'] == 1] = np.zeros(encoded_feature.shape[1])
    
    ohe_features = pd.concat([ohe_features, encoded_feature.iloc[:, :-1]], axis=1)

# Concat one-hot encoded features and drop original categorical columns
missing_masks = pd.concat([missing_masks, ohe_features], axis=1)
missing_masks = missing_masks.drop(missing_masks.columns[3:7], axis=1)

In [18]:
# Add meta data to mask dataframe
missing_masks = pd.concat([meta_data, missing_masks, months], axis=1)
missing_masks

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,Entorhinal,Fusiform,Hippocampus,ICV,MidTemp,...,PTMARRY_Unknown,PTMARRY_Widowed,PTRACCAT_Am Indian/Alaskan,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,Month
629,002_S_0295,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
630,002_S_0295,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
631,002_S_0295,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12
5723,002_S_0295,0.0,0,0,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18
632,002_S_0295,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10355,941_S_5124,0.0,0,0,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24
4298,941_S_5193,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4524,941_S_5193,0.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
9318,941_S_5193,0.0,0,0,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6


### Data imputation

In [19]:
# Check missing values per column
count_missing(feature_set)

Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 5517 - 49.4%
Column name: Fusiform - missing values: 5517 - 49.4%
Column name: Hippocampus - missing values: 5273 - 47.2%
Column name: ICV - missing values: 4334 - 38.8%
Column name: MidTemp - missing values: 5517 - 49.4%
Column name: Ventricles - missing values: 4748 - 42.5%
Column name: WholeBrain - missing values: 4549 - 40.7%
Column name: ADAS11 - missing values: 3364 - 30.1%
Column name: ADAS13 - missing values: 3419 - 30.6%
Column name: CDRSB - missing values: 3320 - 29.7%
Column name: MMSE - missing values: 3348 - 30.0%
Column name: RAVLT_forgetting - missing values: 3552 - 31.8%
Column name: RAVLT_immediate -

As can be seen, 8 records are missing APOE4 information. Upon manual inspection, it turned out that for these patients only a single measurement was available.

In [20]:
# Use zero-order interpolation on the data (execute per patient)
for pt in tqdm(study_df['PTID'].unique()):
    events = study_df.loc[study_df['PTID'] == pt]
    feature_set.loc[events.index, feature_set.columns] = events[feature_set.columns].fillna(method='ffill')

100%|██████████| 1395/1395 [00:12<00:00, 107.43it/s]


In [21]:
count_missing(feature_set)

Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 581 - 5.2%
Column name: Fusiform - missing values: 581 - 5.2%
Column name: Hippocampus - missing values: 486 - 4.3%
Column name: ICV - missing values: 17 - 0.2%
Column name: MidTemp - missing values: 581 - 5.2%
Column name: Ventricles - missing values: 123 - 1.1%
Column name: WholeBrain - missing values: 54 - 0.5%
Column name: ADAS11 - missing values: 4 - 0.0%
Column name: ADAS13 - missing values: 7 - 0.1%
Column name: CDRSB - missing values: 0 - 0.0%
Column name: MMSE - missing values: 0 - 0.0%
Column name: RAVLT_forgetting - missing values: 4 - 0.0%
Column name: RAVLT_immediate - missing values: 4 - 0.0%
Column nam

In [22]:
# Fill remaining numerical column nan values with mean of all measurements
for column in feature_set.columns[np.r_[0, 2:3, 7:22]]:
    feature_set[column].fillna(feature_set[column].mean(), inplace=True)

In [23]:
count_missing(feature_set)

Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 8 - 0.1%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 0 - 0.0%
Column name: Fusiform - missing values: 0 - 0.0%
Column name: Hippocampus - missing values: 0 - 0.0%
Column name: ICV - missing values: 0 - 0.0%
Column name: MidTemp - missing values: 0 - 0.0%
Column name: Ventricles - missing values: 0 - 0.0%
Column name: WholeBrain - missing values: 0 - 0.0%
Column name: ADAS11 - missing values: 0 - 0.0%
Column name: ADAS13 - missing values: 0 - 0.0%
Column name: CDRSB - missing values: 0 - 0.0%
Column name: MMSE - missing values: 0 - 0.0%
Column name: RAVLT_forgetting - missing values: 0 - 0.0%
Column name: RAVLT_immediate - missing values: 0 - 0.0%
Column name: RAVLT_lea

In [24]:
# Compute apoe4 stats
apoe4_stats = feature_set['APOE4'].value_counts()
apoe4_stats = apoe4_stats / len(feature_set)

In [25]:
rng = np.random.default_rng(seed=42)
nan_apoe_rows = feature_set.index[feature_set['APOE4'].isna()]

# Apply data imputation on APOE4 column. Values are replaced based on their probability of occurence
for nan_apoe in nan_apoe_rows:
    rnd = rng.random()
    
    if rnd < apoe4_stats[0]:
        feature_set.loc[nan_apoe, 'APOE4'] = 0
    elif rnd >= apoe4_stats[0] and rnd <= apoe4_stats[1]:
        feature_set.loc[nan_apoe, 'APOE4'] = 1
    else:
        feature_set.loc[nan_apoe, 'APOE4'] = 2

In [26]:
count_missing(feature_set)

Column name: AGE - missing values: 0 - 0.0%
Column name: APOE4 - missing values: 0 - 0.0%
Column name: PTEDUCAT - missing values: 0 - 0.0%
Column name: PTETHCAT - missing values: 0 - 0.0%
Column name: PTGENDER - missing values: 0 - 0.0%
Column name: PTMARRY - missing values: 0 - 0.0%
Column name: PTRACCAT - missing values: 0 - 0.0%
Column name: Entorhinal - missing values: 0 - 0.0%
Column name: Fusiform - missing values: 0 - 0.0%
Column name: Hippocampus - missing values: 0 - 0.0%
Column name: ICV - missing values: 0 - 0.0%
Column name: MidTemp - missing values: 0 - 0.0%
Column name: Ventricles - missing values: 0 - 0.0%
Column name: WholeBrain - missing values: 0 - 0.0%
Column name: ADAS11 - missing values: 0 - 0.0%
Column name: ADAS13 - missing values: 0 - 0.0%
Column name: CDRSB - missing values: 0 - 0.0%
Column name: MMSE - missing values: 0 - 0.0%
Column name: RAVLT_forgetting - missing values: 0 - 0.0%
Column name: RAVLT_immediate - missing values: 0 - 0.0%
Column name: RAVLT_lea

## Normalise numerical features and encode categorical features

In [27]:
feature_set.columns[np.r_[0:3, 7:22]]

Index(['AGE', 'APOE4', 'PTEDUCAT', 'Entorhinal', 'Fusiform', 'Hippocampus',
       'ICV', 'MidTemp', 'Ventricles', 'WholeBrain', 'ADAS11', 'ADAS13',
       'CDRSB', 'MMSE', 'RAVLT_forgetting', 'RAVLT_immediate',
       'RAVLT_learning', 'RAVLT_perc_forgetting'],
      dtype='object')

In [28]:
# Normalize numerical features
for column in feature_set.columns[np.r_[0:3, 7:22]]:
    feature_set[column] = (feature_set[column] - feature_set[column].mean()) / feature_set[column].std()

In [29]:
# One-hot encode categorical features
for column in feature_set.columns[3:7]:
    feature_set = pd.concat([feature_set, pd.get_dummies(feature_set[column], prefix=column)], axis=1)

# Drop categorical columns
feature_set = feature_set.drop(feature_set.columns[3:7], axis=1)

In [30]:
# Concat study dataframe
study_df = pd.concat([meta_data, feature_set, months], axis=1)
study_df

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,Entorhinal,Fusiform,Hippocampus,ICV,MidTemp,...,PTMARRY_Unknown,PTMARRY_Widowed,PTRACCAT_Am Indian/Alaskan,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,Month
629,002_S_0295,0.0,1.628062,0.808592,0.681395,1.450490e-01,-2.795334e-01,0.251196,0.612624,1.359999e-01,...,0,0,0,0,0,0,0,0,1,0
630,002_S_0295,0.0,1.628062,0.808592,0.681395,2.007542e-01,2.807774e-01,0.105337,0.682891,1.346479e-01,...,0,0,0,0,0,0,0,0,1,6
631,002_S_0295,0.0,1.628062,0.808592,0.681395,8.416189e-02,3.280204e-02,0.103641,0.720972,1.657440e-01,...,0,0,0,0,0,0,0,0,1,12
5723,002_S_0295,0.0,1.628062,0.808592,0.681395,8.416189e-02,3.280204e-02,0.103641,0.720972,1.657440e-01,...,0,0,0,0,0,0,0,0,1,18
632,002_S_0295,0.0,1.628062,0.808592,0.681395,4.507799e-01,3.254508e-01,0.057000,0.714953,-5.328085e-02,...,0,0,0,0,0,0,0,0,1,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10355,941_S_5124,0.0,0.447034,-0.771677,1.397328,-2.356446e-14,5.784645e-14,-1.879030,-0.261713,-8.853422e-14,...,0,0,0,0,0,0,0,0,1,24
4298,941_S_5193,0.0,-0.165351,-0.771677,-0.034538,8.497847e-01,-4.165823e-01,0.387727,-0.424726,-5.825910e-01,...,0,0,0,0,0,0,0,0,1,0
4524,941_S_5193,0.0,-0.165351,-0.771677,-0.034538,4.468935e-01,-7.092312e-01,0.015447,-0.423559,-7.461837e-01,...,0,0,0,0,0,0,0,0,1,3
9318,941_S_5193,0.0,-0.165351,-0.771677,-0.034538,4.468935e-01,-7.092312e-01,0.015447,-0.423559,-7.461837e-01,...,0,0,0,0,0,0,0,0,1,6


### Oversampling the data

In [97]:
def preprocess_oversampling(data_frame):
    features = []
    labels = []

    for name, trajectory in data_frame.groupby("PTID"):

        # Check if sampled trajectory should be labelled postive
        ad_diagnosed = trajectory[trajectory['DX'] == 1]
        label = 1 if len(ad_diagnosed) > 0 else 0

        # Collect sampled trajectory + label
        features.append(name)
        labels.append(label)
        
    labels = np.array(labels)
    features = np.array(features)
    
    # Extract postive and negative samples from the data
    bool_labels = labels == 1
    pos_features = features[bool_labels]
    neg_features = features[~bool_labels]

    pos_labels = labels[bool_labels]
    neg_labels = labels[~bool_labels]
    
    return pos_features, neg_features, pos_labels, neg_labels

def oversample_training_data(pos_features, neg_features, pos_labels, neg_labels, ratio = 1):
    rng = np.random.default_rng(seed=42)
    
    # Randomly choose positive samples based on target_ratio
    ids = np.arange(len(pos_features))
    choices = rng.choice(ids, ratio * len(neg_labels))
    
    # Sample positive features
    res_pos_features = pos_features[choices]
    res_pos_labels = pos_labels[choices]
    
    # Concatenate features and labels again
    res_features = np.concatenate([res_pos_features, neg_features])
    res_labels = np.concatenate([res_pos_labels, neg_labels])
    
    # Shuffle positive and negative instances
    order = np.arange(len(res_labels))
    rng.shuffle(order)
    res_features = res_features[order]
    res_labels = res_labels[order]
    
    return res_features, res_labels

In [98]:
pos_features, neg_features, pos_labels, neg_labels = preprocess_oversampling(study_df)
res_features, res_labels = oversample_training_data(pos_features, neg_features, pos_labels, neg_labels)