In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the generated train / validation feature tables (train first, then val).
A_train_more, A_val_more = map(
    pd.read_csv,
    (
        '../csv/generated/A_train_label_more_features.csv',
        '../csv/generated/A_val_label_more_features.csv',
    ),
)

In [16]:
def _min_max_scale(values, lower, upper):
    """Linearly rescale ``values`` so ``lower`` maps to 0 and ``upper`` maps to 1.

    A zero range (``upper == lower``) is replaced by 1 so that constant data
    maps to 0 instead of triggering a division by zero.
    """
    span = upper - lower
    if span == 0:
        span = 1
    return (values - lower) / span


def create_modeling_df(
    df,
    nfl_df,
    train=True,
    hippo_tuple=None,
    ucb_csv='../csv/original/UCBERKELEYAV45_01-12-2021.csv'):
    """Build one modeling table from the feature frame, UCB hippocampal volumes and NFL data.

    Steps:
      1. Drop the ``Unnamed: 0`` / ``Unnamed: 0.1`` columns if present and
         rename ``WCN`` / ``CLS`` to ``amyloid_WCN`` / ``amyloid_CLS``.
      2. Left-join the left/right hippocampal volumes from the UCB CSV on
         (RID, PET Date == EXAMDATE) and min-max scale them.
      3. Left-join MMSE, research group, ADAS13 and plasma NFL columns from
         ``nfl_df`` on ``MRI Image ID (Original)``.
      4. Append one dummy column per ``Research Group`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. Must contain ``Subject ID``, ``PET Date`` and
        ``MRI Image ID (Original)``. It is not modified.
    nfl_df : pandas.DataFrame
        Must contain ``MRI Image ID (Original)``, ``MMSE Total Score``,
        ``Research Group``, ``ADAS13``, ``PLASMA_NFL`` and ``NFL_CLASS``.
    train : bool, default True
        If True, the hippocampal min/max are computed from this data.
        If False, the statistics in ``hippo_tuple`` are used instead.
    hippo_tuple : sequence of 4 numbers, optional
        ``(L_min, L_max, R_min, R_max)`` as returned by a ``train=True`` call.
        Required when ``train`` is False.
    ucb_csv : str or file-like, default is the original UCB Berkeley AV45 CSV
        Anything ``pandas.read_csv`` accepts; must provide ``RID``,
        ``EXAMDATE`` and the two hippocampus volume columns.

    Returns
    -------
    (pandas.DataFrame, tuple)
        The merged modeling frame and the ``(L_min, L_max, R_min, R_max)``
        statistics that were actually used for scaling (the freshly computed
        ones when ``train`` is True, otherwise the ones passed in).

    Raises
    ------
    ValueError
        If ``train`` is False and ``hippo_tuple`` is not given.
    """
    if not train and hippo_tuple is None:
        raise ValueError('hippo_tuple (training statistics) is required when train=False')

    # Presumably stray index columns written by an earlier to_csv; tolerate their absence.
    df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, errors='ignore')
    # RID = last underscore-separated token of the subject id, without leading zeros.
    df = df.assign(RID=df['Subject ID'].str.split('_').str[-1].str.lstrip('0'))

    # Rename some amyloid columns so they are clearer
    df = df.rename(columns={'WCN': 'amyloid_WCN', 'CLS': 'amyloid_CLS'})

    # Hippocampal volumes
    ucb = pd.read_csv(ucb_csv).loc[
        :, ['RID', 'EXAMDATE', 'LEFT_HIPPOCAMPUS_VOLUME', 'RIGHT_HIPPOCAMPUS_VOLUME']
    ]
    # RID must be a string on both sides of the join.
    ucb = ucb.assign(RID=ucb['RID'].astype('str'))

    merged_df = pd.merge(
        df,
        ucb,
        left_on=['RID', 'PET Date'],
        right_on=['RID', 'EXAMDATE'],
        how='left'
    )

    if train:
        L_min = merged_df['LEFT_HIPPOCAMPUS_VOLUME'].min()
        L_max = merged_df['LEFT_HIPPOCAMPUS_VOLUME'].max()
        R_min = merged_df['RIGHT_HIPPOCAMPUS_VOLUME'].min()
        R_max = merged_df['RIGHT_HIPPOCAMPUS_VOLUME'].max()
    else:
        # Reuse the training statistics so both splits share one scale.
        L_min, L_max, R_min, R_max = hippo_tuple[:4]

    # True min-max scaling: (x - min) / (max - min), identical for both modes.
    merged_df['LEFT_HIPPOCAMPUS_VOLUME'] = _min_max_scale(
        merged_df['LEFT_HIPPOCAMPUS_VOLUME'], L_min, L_max)
    merged_df['RIGHT_HIPPOCAMPUS_VOLUME'] = _min_max_scale(
        merged_df['RIGHT_HIPPOCAMPUS_VOLUME'], R_min, R_max)

    # The join keys are no longer needed once the volumes are attached.
    merged_df = merged_df.drop(['RID', 'EXAMDATE'], axis=1)

    # Research groups
    # 'Subject ID' is deliberately not taken from nfl_df: the left frame
    # already carries it, so no _x/_y suffix clean-up is needed afterwards.
    nfl_df = nfl_df.loc[:, ['MRI Image ID (Original)', 'MMSE Total Score', 'Research Group', 'ADAS13', 'PLASMA_NFL', 'NFL_CLASS']]

    merged_df = pd.merge(
        merged_df,
        nfl_df,
        on=['MRI Image ID (Original)'],
        how='left'
    )

    # Get dummies for research group
    # NOTE: the dummy columns are whatever groups occur in THIS frame, so a
    # train and a validation frame can end up with different dummy columns.
    dummies = pd.get_dummies(merged_df['Research Group'])

    merged_df = pd.concat([merged_df, dummies], axis=1)

    return merged_df, (L_min, L_max, R_min, R_max)

In [17]:
# NOTE(review): A_train_nfl / A_val_nfl are not created in the cells shown
# here; they must already exist in the kernel - confirm they are loaded earlier.

# Training split: the hippocampal scaling statistics are computed here ...
A_train_complete, hippo_tuple = create_modeling_df(df=A_train_more, nfl_df=A_train_nfl)

# ... and reused unchanged for the validation split.
A_val_complete, _ = create_modeling_df(
    df=A_val_more,
    nfl_df=A_val_nfl,
    train=False,
    hippo_tuple=hippo_tuple,
)

# These copies are meant for predicting MMSE and ADAS13: they are written
# before those scores are median-imputed or normalized.
A_train_complete.to_csv(
    '../csv/generated/A_train_complete_for_mmse_adas13.csv',
    index=False,
)
A_val_complete.to_csv(
    '../csv/generated/A_val_complete_for_mmse_adas13.csv',
    index=False,
)

In [20]:
# Impute and normalize the cognitive scores (MMSE, ADAS13).
#
# Every statistic below (median, min, max) is computed on the TRAINING split
# only and then applied to both splits, so validation rows are transformed
# exactly like training rows without using any validation information.
# Validation values outside the training range will fall outside [0, 1].

# --- MMSE ---
mmse_median = A_train_complete['MMSE Total Score'].median()  # NaNs are skipped

A_train_complete['MMSE Total Score'] = A_train_complete['MMSE Total Score'].fillna(mmse_median)
A_val_complete['MMSE Total Score'] = A_val_complete['MMSE Total Score'].fillna(mmse_median)

mmse_min = A_train_complete['MMSE Total Score'].min()
mmse_max = A_train_complete['MMSE Total Score'].max()
assert mmse_max > mmse_min, 'MMSE Total Score is constant or empty in the training split'

# Min-max scaling divides by the range (max - min), not by max alone.
A_train_complete['MMSE Total Score'] = \
    (A_train_complete['MMSE Total Score'] - mmse_min) / (mmse_max - mmse_min)
A_val_complete['MMSE Total Score'] = \
    (A_val_complete['MMSE Total Score'] - mmse_min) / (mmse_max - mmse_min)

# --- ADAS13 ---
# ADAS13 gaps are filled with the ADAS13 median (not the MMSE median).
adas13_median = A_train_complete['ADAS13'].median()  # NaNs are skipped

A_train_complete['ADAS13'] = A_train_complete['ADAS13'].fillna(adas13_median)
A_val_complete['ADAS13'] = A_val_complete['ADAS13'].fillna(adas13_median)

adas_min = A_train_complete['ADAS13'].min()
adas_max = A_train_complete['ADAS13'].max()
assert adas_max > adas_min, 'ADAS13 is constant or empty in the training split'

A_train_complete['ADAS13'] = \
    (A_train_complete['ADAS13'] - adas_min) / (adas_max - adas_min)
A_val_complete['ADAS13'] = \
    (A_val_complete['ADAS13'] - adas_min) / (adas_max - adas_min)

# Final modeling tables: MMSE and ADAS13 are now imputed and scaled.
A_train_complete.to_csv('../csv/generated/A_train_complete.csv', index=False)
A_val_complete.to_csv('../csv/generated/A_val_complete.csv', index=False)

In [24]:
# List every column of the completed training frame (bare expression, so the
# notebook renders it as the cell output).
A_train_complete.columns

Index(['Subject ID', 'MRI Date', 'PET Date', 'PET Type', 'Interval (day)',
       'MRI Image ID (Original)', 'PET Image ID (Standardized)',
       'PET Manufacturer', 'amyloid_WCN', 'amyloid_CLS', 'Image ID', 'Age',
       'Sex_F', 'Sex_M', 'APOE A1_2', 'APOE A1_3', 'APOE A1_4', 'APOE A2_2',
       'APOE A2_3', 'APOE A2_4', 'LEFT_HIPPOCAMPUS_VOLUME',
       'RIGHT_HIPPOCAMPUS_VOLUME', 'MMSE Total Score', 'Research Group',
       'ADAS13', 'PLASMA_NFL', 'NFL_CLASS', 'AD', 'CN', 'EMCI', 'LMCI', 'MCI',
       'SMC'],
      dtype='object')

In [None]:
# Columns used as model inputs, grouped by where they come from.
feature_cols = [
    # Age and sex columns
    'Age', 'Sex_F', 'Sex_M',
    # APOE columns
    'APOE A1_2', 'APOE A1_3', 'APOE A1_4',
    'APOE A2_2', 'APOE A2_3', 'APOE A2_4',
    # Scaled hippocampal volumes
    'LEFT_HIPPOCAMPUS_VOLUME', 'RIGHT_HIPPOCAMPUS_VOLUME',
    # Cognitive scores
    'MMSE Total Score', 'ADAS13',
    # Research Group dummy columns
    'AD', 'CN', 'EMCI', 'LMCI', 'MCI', 'SMC',
]

# Number of selected features.
print(len(feature_cols))

# Lift the column-display limit only for this one rendering of the frame.
with pd.option_context('display.max_columns', None):
    display(A_train_complete)