### Data Preparation 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import math
import os
from os import path

import pandas as pd
from tqdm import tqdm

In [2]:
def side_by_side(*objs, **kwds):
    """Print the reprs of several objects next to each other.

    Each positional argument is rendered with ``repr`` and split into
    lines; the resulting blocks are laid out in adjacent text columns.
    The optional ``space`` keyword (default 4) sets the gap between
    columns. A blank line is printed afterwards. Returns ``None``.
    """
    from pandas.io.formats.printing import adjoin

    column_gap = kwds.get('space', 4)

    rendered_blocks = []
    for item in objs:
        rendered_blocks.append(repr(item).split('\n'))

    print(adjoin(column_gap, *rendered_blocks))
    print()

#### Loading dataset 

In [3]:
### Baseline
# Three baseline tables (assessment, cognitive scores, MRI), read in that order.
bl_assessmentDF, bl_CognitiveScoresDF, bl_MRIDF = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Dataset/missing_handled_by_mean/baseline_data/Assessment_NOT_Normalized_bl.csv",
        "Dataset/missing_handled_by_mean/baseline_data/CognitiveScores_NOT_Normalized_bl.csv",
        "Dataset/missing_handled_by_mean/baseline_data/MRI_NOT_Normalized_bl.csv",
    )
)

### Time Series
# The matching time-series tables, read in the same order.
ts_assessmentDF, ts_CognitiveScoresDF, ts_MRIDF = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Dataset/missing_handled_by_mean/BL_M18_data/Assessment_NOT_Normalized_ts.csv",
        "Dataset/missing_handled_by_mean/BL_M18_data/CognitiveScores_NOT_Normalized_ts.csv",
        "Dataset/missing_handled_by_mean/BL_M18_data/MRI_NOT_Normalized_ts.csv",
    )
)

In [4]:
# Report the (rows, columns) shape of every loaded table, baseline first.
print(
    "Baseline: ",
    "[BL] Assessment (shape):       {}".format(bl_assessmentDF.shape),
    "[BL] Cognitive Scores (shape): {}".format(bl_CognitiveScoresDF.shape),
    "[BL] MRI (shape):              {}".format(bl_MRIDF.shape),
    "",
    "Time Series: ",
    "[TS] Assessment (shape):       {}".format(ts_assessmentDF.shape),
    "[TS] Cognitive Scores (shape): {}".format(ts_CognitiveScoresDF.shape),
    "[TS] MRI (shape):              {}".format(ts_MRIDF.shape),
    sep="\n",
)

Baseline: 
[BL] Assessment (shape):       (1371, 19)
[BL] Cognitive Scores (shape): (1371, 11)
[BL] MRI (shape):              (1371, 45)

Time Series: 
[TS] Assessment (shape):       (5484, 19)
[TS] Cognitive Scores (shape): (5484, 11)
[TS] MRI (shape):              (5484, 45)


In [5]:
# Per-column missing-value counts (left) next to non-null counts (right).
side_by_side(ts_assessmentDF.isna().sum(), ts_assessmentDF.count())

RID                      0    RID                      5484
DX                       0    DX                       5484
ADNI_EF                  0    ADNI_EF                  5484
ADNI_MEM                 0    ADNI_MEM                 5484
AVDEL30MIN               0    AVDEL30MIN               5484
AVTOT4                   0    AVTOT4                   5484
AVTOT5                   0    AVTOT5                   5484
AVTOT6                   0    AVTOT6                   5484
EcogptMem                0    EcogptMem                5484
EcogspDivatt             0    EcogspDivatt             5484
EcogspLang               0    EcogspLang               5484
EcogspMem                0    EcogspMem                5484
EcogspOrgan              0    EcogspOrgan              5484
EcogspPlan               0    EcogspPlan               5484
EcogspTotal              0    EcogspTotal              5484
EcogspVisspat            0    EcogspVisspat            5484
LDELTOTAL                0    LDELTOTAL 

In [6]:
# Distinct values of the DX column in the baseline assessment table.
bl_assessmentDF['DX'].unique()

array(['AD', 'sMCI', 'CN', 'pMCI'], dtype=object)

In [7]:
# Number of distinct RID values in the baseline assessment table.
bl_assessmentDF['RID'].nunique()

1371

In [8]:
# Preview the first five rows of the baseline assessment table.
bl_assessmentDF.head(n=5)

Unnamed: 0,RID,DX,ADNI_EF,ADNI_MEM,AVDEL30MIN,AVTOT4,AVTOT5,AVTOT6,EcogptMem,EcogspDivatt,EcogspLang,EcogspMem,EcogspOrgan,EcogspPlan,EcogspTotal,EcogspVisspat,LDELTOTAL,RAVLT_immediate,RAVLT_perc_forgetting
0,3,AD,-1.112,-0.848,0,5,4,3,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,2,22,100.0
1,4,sMCI,-0.929,0.096,7,8,11,6,2.212622,1.968075,1.690837,2.142118,1.675533,1.592488,1.760836,1.464665,4,37,36.3636
2,5,CN,0.338,0.589,5,8,9,4,1.670452,1.294579,1.17621,1.360968,1.168683,1.142392,1.209621,1.108819,12,37,44.4444
3,6,sMCI,-0.607,-0.242,1,7,6,4,2.212622,1.968075,1.690837,2.142118,1.675533,1.592488,1.760836,1.464665,3,30,83.3333
4,7,AD,-2.177,-1.334,1,5,4,1,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,0,17,75.0


In [9]:
# Preview the first five rows of the time-series assessment table.
ts_assessmentDF.head(n=5)

Unnamed: 0,RID,DX,ADNI_EF,ADNI_MEM,AVDEL30MIN,AVTOT4,AVTOT5,AVTOT6,EcogptMem,EcogspDivatt,EcogspLang,EcogspMem,EcogspOrgan,EcogspPlan,EcogspTotal,EcogspVisspat,LDELTOTAL,RAVLT_immediate,RAVLT_perc_forgetting
0,3,AD,-1.112,-0.848,0,5,4,3,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,2,22,100.0
1,3,AD,-0.335,-0.967,0,4,6,0,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,2,19,100.0
2,3,AD,-1.125,-0.736,0,6,7,2,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,0,31,100.0
3,3,AD,-1.125,-0.736,0,6,7,2,2.333169,3.293456,2.797922,3.504395,3.19673,2.996852,3.086731,2.817981,0,31,100.0
4,4,sMCI,-0.929,0.096,7,8,11,6,2.212622,1.968075,1.690837,2.142118,1.675533,1.592488,1.760836,1.464665,4,37,36.3636


In [10]:
# Preview the first five rows of the baseline cognitive-scores table.
bl_CognitiveScoresDF.head(n=5)

Unnamed: 0,RID,DX,ADAS 11,ADAS 13,CDGLOBAL,CDRSB,FAQTOTAL,GDTOTAL,MMSCORE,MOCA,NPISCORE
0,3,AD,22.0,31.0,1.0,6.0,10,2.0,24.0,18.8,1
1,4,sMCI,14.33,21.33,0.5,0.5,0,2.0,28.0,21.2,0
2,5,CN,8.67,14.67,0.0,0.0,0,0.0,29.0,23.7,0
3,6,sMCI,18.67,25.67,0.5,1.0,0,0.0,21.0,20.9,0
4,7,AD,27.33,40.33,1.0,6.0,17,2.0,23.0,12.7,10


In [11]:
# Preview the first five rows of the time-series cognitive-scores table.
ts_CognitiveScoresDF.head(n=5)

Unnamed: 0,RID,DX,ADAS 11,ADAS 13,CDGLOBAL,CDRSB,FAQTOTAL,GDTOTAL,MMSCORE,MOCA,NPISCORE
0,3,AD,22.0,31.0,1.0,6.0,10,2.0,24.0,18.8,1
1,3,AD,19.0,30.0,1.0,6.0,12,2.0,24.0,18.6,0
2,3,AD,24.0,35.0,0.5,3.5,17,2.0,17.0,18.7,4
3,3,AD,24.0,35.0,0.5,3.5,17,2.0,17.0,21.8,4
4,4,sMCI,14.33,21.33,0.5,0.5,0,2.0,28.0,21.2,0


In [12]:
# Preview the first five rows of the baseline MRI table.
bl_MRIDF.head(n=5)

Unnamed: 0,RID,DX,ST103CV,ST103TA,ST117CV,ST117TA,ST119CV,ST119TA,ST127SV,ST129CV,...,ST85TA,ST88SV,ST89SV,ST91CV,ST91TA,ST96SV,ST99CV,ST99TA,ST9SV,ST10CV
0,3,AD,1881,2.2,10352,2.3,1341,3.0,3155,6078,...,2.3,2376,5921,8355,2.4,28743,10387,2.5,3304,1920690
1,4,sMCI,2123,2.7,10615,2.7,1442,3.5,1336,5887,...,2.6,3419,480,10985,2.8,18952,11156,2.7,1338,1679440
2,5,CN,2305,2.7,8889,2.3,1999,3.5,1548,5733,...,2.6,3831,365,9870,2.8,16620,11579,2.9,1623,1640770
3,6,sMCI,1612,2.2,9079,2.7,1766,3.2,2651,5528,...,2.4,2657,926,6909,2.8,18023,9641,2.9,1035,1485830
4,7,AD,2312,2.8,7921,2.3,1795,3.1,1771,4596,...,2.4,3703,829,7385,2.4,9213,9069,2.5,1536,1353520


In [13]:
# Preview the first five rows of the time-series MRI table.
ts_MRIDF.head(n=5)

Unnamed: 0,RID,DX,ST103CV,ST103TA,ST117CV,ST117TA,ST119CV,ST119TA,ST127SV,ST129CV,...,ST85TA,ST88SV,ST89SV,ST91CV,ST91TA,ST96SV,ST99CV,ST99TA,ST9SV,ST10CV
0,3,AD,1881,2.2,10352,2.3,1341,3.0,3155,6078,...,2.3,2376,5921,8355,2.4,28743,10387,2.5,3304,1920690
1,3,AD,1994,2.2,10308,2.3,1159,2.5,3036,6178,...,2.2,2471,6224,8612,2.4,30036,9311,2.4,3173,1906430
2,3,AD,1776,2.0,10058,2.3,1350,2.9,2981,6045,...,2.2,2310,6501,7542,2.3,30538,9736,2.5,3235,1903820
3,3,AD,1776,2.0,10058,2.3,1350,2.9,2981,6045,...,2.2,2310,6501,7542,2.3,30538,9736,2.5,3235,1903820
4,4,sMCI,2123,2.7,10615,2.7,1442,3.5,1336,5887,...,2.6,3419,480,10985,2.8,18952,11156,2.7,1338,1679440


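#### Time Series > Statistical Data

Each RID's time-series rows are collapsed into a single row of summary statistics (mean, max, min and standard deviation) per feature, and the rows are written to one CSV file per table.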

In [17]:
def convert_column(column_name, rid, column_values):
    """Summarise one feature's values as a single-row DataFrame.

    Parameters
    ----------
    column_name : str
        Feature name; used as the prefix of the generated statistic columns.
    rid : scalar
        Identifier stored in the ``RID`` column of the result.
    column_values : pandas.Series
        The values to summarise (must support mean/min/max/std).

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns ``RID``, ``<column_name>_mean``,
        ``<column_name>_max``, ``<column_name>_min`` and
        ``<column_name>_std``, in that order.
    """
    # std() is NaN when it cannot be computed (e.g. a single observation,
    # since pandas defaults to the sample standard deviation); report 0 then.
    std_value = column_values.std()
    if math.isnan(std_value):
        std_value = 0

    # NOTE(review): the previous version also computed max - min ("amplitude")
    # but never emitted it; it is dropped here rather than added to the output
    # so the CSV schema stays unchanged. Confirm whether it was meant to be kept.
    summary = {
        "RID": rid,
        column_name + "_mean": column_values.mean(),
        column_name + "_max": column_values.max(),
        column_name + "_min": column_values.min(),
        column_name + "_std": std_value,
    }

    # Build the row in one step instead of enlarging an empty frame via .loc.
    return pd.DataFrame([summary])

In [18]:
def converter(df, filename):
    """Append one row of per-feature summary statistics to a CSV file.

    The first two columns of ``df`` are skipped; every remaining column is
    summarised with ``convert_column`` and the results are placed side by
    side in a single row keyed by the ``RID`` of the first row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise. Must contain a ``RID`` column; the RID of the
        first row is used for the output row.
    filename : str
        Destination CSV. It is created with a header row if it does not
        exist yet; otherwise the new row is appended without a header.

    Raises
    ------
    ValueError
        If ``df`` has no rows or no columns beyond the first two, because
        there is nothing to summarise (previously an opaque IndexError).
    """
    feature_columns = df.columns.to_list()[2:]

    if df.empty or not feature_columns:
        raise ValueError(
            "converter() needs at least one row and one feature column "
            "(got shape {}) for '{}'".format(df.shape, filename)
        )

    # The RID is the same for every feature, so read it once.
    rid = df['RID'].iloc[0]
    parts = [convert_column(col, rid, df[col]) for col in feature_columns]

    # Every part is a single row (index 0) carrying the same RID, so laying
    # them side by side gives the same row as chaining inner merges on RID,
    # without one merge per feature.
    stats_row = pd.concat(
        [parts[0]] + [part.drop(columns='RID') for part in parts[1:]],
        axis=1,
    )

    # Append mode also creates a missing file; only a new file gets a header.
    write_header = not path.isfile(filename)
    stats_row.to_csv(filename, mode='a', header=write_header, index=False)

In [19]:
# Three dataframes: ts_assessmentDF, ts_CognitiveScoresDF, ts_MRIDF
assessment_filename       = "data/assessment_statistics.csv"
cogniteive_score_filename = "data/cogniteive_score_statistics.csv"
mri_filename              = "data/mri_statistics.csv"

# converter() appends to a file that already exists, so start each run from a
# clean slate: without this, re-running the cell (or Restart & Run All) would
# add a second copy of every row to the statistics files.
for stats_filename in (assessment_filename, cogniteive_score_filename, mri_filename):
    os.makedirs(path.dirname(stats_filename) or ".", exist_ok=True)
    if path.isfile(stats_filename):
        os.remove(stats_filename)

# finding unique rids
unique_rids = ts_assessmentDF['RID'].unique()

# Converting: one output row per RID in each of the three statistics files.
# The RIDs of the assessment table drive all three tables so that the files
# list RIDs in the same order.
for rid in tqdm(unique_rids):
    converter(ts_assessmentDF.loc[ts_assessmentDF['RID'] == rid], assessment_filename)
    converter(ts_CognitiveScoresDF.loc[ts_CognitiveScoresDF['RID'] == rid], cogniteive_score_filename)
    converter(ts_MRIDF.loc[ts_MRIDF['RID'] == rid], mri_filename)

100%|███████████████████████████████████████████████████████████████████████████████| 1371/1371 [03:24<00:00,  6.70it/s]


In [20]:
# Keep only the identifier and the DX label of each baseline row and
# save them as the label file.
labelsDF = bl_assessmentDF.loc[:, ['RID', 'DX']]
labelsDF.to_csv("Processed_dataset/four_labels.csv", index=False)

#### End of Notebook