# Pre-processing

In [1]:
import pandas as pd

In [2]:
# Read TADPOLE D1 & D2
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): presumably the warning shown in this cell's output is pandas'
# DtypeWarning about mixed-type columns - confirm after re-running.
original_df = pd.read_csv("../tadpole_challenge/TADPOLE_D1_D2.csv", low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Remove left-truncated patients

In [3]:
# Select all rows whose DX_bl is 'AD'; these are treated as left-truncated patients
is_left_truncated = original_df['DX_bl'] == 'AD'
left_truncated_events = original_df[is_left_truncated]

# Share of unique patient IDs (PTID) that are left-truncated
nr_left_truncated_patients = len(pd.unique(left_truncated_events['PTID']))
nr_original_patients = len(pd.unique(original_df['PTID']))
print(f"Percentage of left-truncated patients: {nr_left_truncated_patients / nr_original_patients * 100:.2f}%")

Percentage of left-truncated patients: 19.69%


In [4]:
# Keep only the rows whose DX_bl is not 'AD', i.e. drop the left-truncated patients
is_not_truncated = original_df['DX_bl'] != 'AD'
non_truncated_events = original_df[is_not_truncated]

# Share of unique patient IDs (PTID) that remain after the removal
nr_remaining_patients = len(pd.unique(non_truncated_events['PTID']))
nr_original_patients = len(pd.unique(original_df['PTID']))
print(f"Percentage of non-truncated patients: {nr_remaining_patients / nr_original_patients * 100:.2f}%")

Percentage of non-truncated patients: 80.31%


## Select desired data columns

In [5]:
# TODO check Intracranial
# Columns retained for the study, in the order they appear in study_df
desired_columns = [
    'PTID', 'DX', 'AGE', 'APOE4', 'PTEDUCAT',
    'PTETHCAT', 'PTGENDER', 'PTMARRY', 'PTRACCAT',
    'Entorhinal', 'Fusiform', 'Hippocampus', 'ICV',
    'MidTemp', 'Ventricles', 'WholeBrain',
    'ADAS11', 'ADAS13', 'CDRSB', 'MMSE',
    'RAVLT_forgetting', 'RAVLT_immediate', 'RAVLT_learning',
    'RAVLT_perc_forgetting', 'Month',
]

# Take an independent copy so later edits to study_df never write through to
# non_truncated_events; selecting with the list already yields exactly these
# column labels in this order.
study_df = non_truncated_events.loc[:, desired_columns].copy()

study_df.head()

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,PTETHCAT,PTGENDER,PTMARRY,PTRACCAT,Entorhinal,...,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
0,011_S_0002,NL,74.3,0.0,16,Not Hisp/Latino,Male,Married,White,4177.0,...,1229740.0,10.67,18.67,0.0,28.0,6.0,44.0,4.0,54.5455,0
5,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3983.0,...,1154980.0,14.33,21.33,1.0,27.0,4.0,37.0,7.0,36.3636,0
6,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3711.0,...,1116280.0,17.33,25.33,0.5,28.0,1.0,33.0,7.0,11.1111,6
7,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3519.0,...,1117390.0,15.0,22.0,1.0,26.0,1.0,37.0,4.0,12.5,12
8,022_S_0004,MCI,67.5,0.0,10,Hisp/Latino,Male,Married,White,3764.0,...,1095210.0,20.33,28.33,1.0,27.0,2.0,44.0,8.0,16.6667,18


In [6]:
# Make label binary: only the exact value 'Dementia' becomes positive (1).
# Every other listed diagnosis, including the transition labels such as
# 'MCI to Dementia', becomes negative (0). Values not listed here are left as-is.
negative_diagnoses = [
    'MCI',
    'NL',
    'MCI to Dementia',
    'NL to MCI',
    'MCI to NL',
    'Dementia to MCI',
    'NL to Dementia',
]

binary_dx = study_df['DX'].replace('Dementia', 1)
binary_dx = binary_dx.replace(negative_diagnoses, 0)
study_df['DX'] = binary_dx

In [7]:
# Compute some statistics for verification with data from the paper
ad_patients = study_df[study_df['DX'] == 1]
nr_ad_patients = len(ad_patients['PTID'].unique())
tot_patients = len(study_df['PTID'].unique())

print(f'Percentage of patients with a stable AD diagnosis: {nr_ad_patients / tot_patients * 100:.2f}%')
# Note: the numerator is the number of AD patients (one positive event per
# patient), not the number of positive rows.
print(f'Effective percentage of measurements with positive event label: {nr_ad_patients / len(study_df) * 100:.2f}%')

# Employ label forwarding: mark all measures after the stable diagnosis of AD as positive.
# "After" is decided on 'Month', not on the row index: the index is not
# chronological within a patient (rows with a large index can belong to an
# earlier month than rows with a small index), so comparing index values could
# label a visit that happened before the first AD diagnosis as positive.

# Month of the earliest positive (DX == 1) measurement per patient
first_ad_month = ad_patients.groupby('PTID')['Month'].min()

# Broadcast that month onto every row; patients without a positive measurement
# get NaN, and any comparison against NaN is False, so they are never forwarded.
onset_month = study_df['PTID'].map(first_ad_month)
is_after_onset = study_df['Month'] > onset_month

# Only missing diagnoses are filled; explicit labels after onset are kept as they are.
needs_forwarding = is_after_onset & study_df['DX'].isna()
study_df.loc[needs_forwarding, 'DX'] = 1

Percentage of patients with a stable AD diagnosis: 17.13%
Effective percentage of measurements with positive event label: 2.14%


In [8]:
# Any diagnosis still missing at this point is treated as negative (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# study_df['DX'] selection: the in-place form operates on an intermediate
# Series and does not update study_df under pandas Copy-on-Write.
study_df['DX'] = study_df['DX'].fillna(0)
study_df['DX'].value_counts()

0.0    10096
1.0     1077
Name: DX, dtype: int64

In [9]:
# Order the rows by patient ID first, then by Month within each patient
sort_keys = ['PTID', 'Month']
study_df = study_df.sort_values(by=sort_keys)

# Show all rows of one patient as a spot check of the ordering
is_example_patient = study_df['PTID'] == '023_S_0042'
study_df[is_example_patient]

Unnamed: 0,PTID,DX,AGE,APOE4,PTEDUCAT,PTETHCAT,PTGENDER,PTMARRY,PTRACCAT,Entorhinal,...,WholeBrain,ADAS11,ADAS13,CDRSB,MMSE,RAVLT_forgetting,RAVLT_immediate,RAVLT_learning,RAVLT_perc_forgetting,Month
84,023_S_0042,0.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2784.0,...,952780.0,7.0,12.0,0.5,30.0,8.0,29.0,6.0,88.8889,0
85,023_S_0042,0.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2131.0,...,938210.0,4.67,9.67,1.0,28.0,4.0,22.0,3.0,80.0,6
86,023_S_0042,0.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2773.0,...,920277.0,11.0,15.0,2.0,24.0,7.0,30.0,1.0,100.0,12
87,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,1825.0,...,926228.0,11.67,20.67,2.0,21.0,5.0,28.0,0.0,100.0,18
88,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2346.0,...,924895.0,9.33,18.33,3.0,22.0,7.0,25.0,2.0,100.0,24
5817,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,,...,,,,,,,,,,30
89,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2350.0,...,910787.0,13.33,22.33,5.0,23.0,6.0,23.0,2.0,100.0,36
5818,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,,...,,,,,,,,,,42
90,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,2411.0,...,890128.0,20.67,30.67,4.5,23.0,4.0,20.0,0.0,100.0,54
5819,023_S_0042,1.0,72.8,0.0,18,Not Hisp/Latino,Male,Married,White,,...,,,,,,,,,,60


## Data imputation