In [2]:
import pandas as pd
import numpy as np
import re
from os import walk, path

### Basically what I did here was take the variable labels I got from Dallan and figure out how to combine those with the original file paths so that it all matches up correctly and is ready to be used in training a CNN

In [None]:
# These are functions I pulled from matrix_script.py in order to complete this task

pattern = re.compile(r'ADNI_.+_.+_.+_.+_.+[_].+_(?:.+).+_(\d+)_.+_.+')
pattern2 = re.compile(r'I\d{4,7}')
pattern3 = re.compile(r'3d_matrix')
pattern4 = re.compile(r'MCI|AD|CN|EMCI|LMCI|SMC')
pattern5 = re.compile(r'ADNI_.+_.+_.+_.+_.+[_].+_(?:.+).+_(\d+)_.+_.+_pm.npy')
pattern6 = re.compile(r'ADNI_1\/ADNI\/(\d{3}_S_\d+)\/')

def populate_labels(paths_list, labels_list):
    labels = []
    length = len(paths_list)
    for i in range(length):
        image_id = re.findall(pattern2, paths_list[i])[0]
        labels_list_entry = next(entry for entry in labels_list if re.search(image_id, entry))
        label = re.search(pattern4, labels_list_entry)[0]
        labels.append(label)
    if len(labels) == length:
        df = pd.DataFrame({'File Path':paths_list, 'Research Group':labels})
        for i in range(len(df)):
            if (df['Research Group'][i] == 'EMCI') | (df['Research Group'][i] == 'LMCI'):
                df['Research Group'][i] = 'MCI'
            elif df['Research Group'][i] == 'SMC':
                df['Research Group'][i] = 'AD'
    else:
        print('Lengths incompatible')
    return df

def create_csv(directory1):
    paths_2d = []
    for root, dirs, files in walk(directory1):
        for name in files:
            a = 'No'
            new_path = path.join(root, name)
            if name == '.DS_Store':
                continue
            if re.search(pattern5, name):
                paths_2d.append(new_path)
    return paths_2d

In [304]:
# Get the updated data frame and original data frame

updated_diagnoses_df = pd.read_csv('../ADNI_1_updated_diagnoses.csv')
dallan_data = pd.read_csv('../Splitting_Data/Dallan_data.csv')

In [305]:
# Sort updated_diagnoses_df and dallan_data to be in the same order
# Need to first standardize datetime and rename a column

updated_diagnoses_df.rename(columns={'Acq.Date': 'Acq Date'}, inplace=True)
updated_diagnoses_df['Acq Date'] = pd.to_datetime(updated_diagnoses_df['Acq Date'])
dallan_data['Acq Date'] = pd.to_datetime(dallan_data['Acq Date'])

updated_diagnoses_df = updated_diagnoses_df.sort_values(['RID', 'Acq Date']).reset_index(drop=True)
dallan_data = dallan_data.sort_values(['RID', 'Acq Date']).reset_index(drop=True)

  updated_diagnoses_df['Acq Date'] = pd.to_datetime(updated_diagnoses_df['Acq Date'])


In [306]:
print(updated_diagnoses_df)
print(dallan_data)

       RID     Subject Group   Acq Date Visit DIAGNOSIS
0        2  011_S_0002    CN 2005-08-26    sc        CN
1        2  011_S_0002    CN 2005-08-26    sc        CN
2        3  011_S_0003    AD 2005-09-01    sc        AD
3        3  011_S_0003    AD 2005-09-01    sc        AD
4        3  011_S_0003    AD 2006-03-13   m06        AD
...    ...         ...   ...        ...   ...       ...
9161  1430  128_S_1430    AD 2008-04-04   m06        AD
9162  1430  128_S_1430    AD 2009-10-01   m24        AD
9163  1430  128_S_1430    AD 2009-10-01   m24        AD
9164  1435  041_S_1435    AD 2007-09-07    sc        AD
9165  1435  041_S_1435    AD 2007-09-07    sc        AD

[9166 rows x 6 columns]
         Subject   RID Group   Acq Date Visit            Description  \
0     011_S_0002     2    CN 2005-08-26    sc                 MPRAGE   
1     011_S_0002     2    CN 2005-08-26    sc          MPRAGE Repeat   
2     011_S_0003     3    AD 2005-09-01    sc                 MPRAGE   
3     011_S_000

In [308]:
# Ensure they are the same

(updated_diagnoses_df[['RID', 'Acq Date', 'Group', 'Subject']] == dallan_data[['RID', 'Acq Date', 'Group', 'Subject']]).sum()

RID         9166
Acq Date    9166
Group       9166
Subject     9166
dtype: int64

In [309]:
# Get original labels and updated labels

group = updated_diagnoses_df['Group'].to_list()
diagnosis = updated_diagnoses_df['DIAGNOSIS'].to_list()
diagnosis_series = updated_diagnoses_df['DIAGNOSIS']

In [313]:
# find all NA values in updated labels

na_values = diagnosis_series[diagnosis_series.isna()].index.to_list()

In [314]:
# Replace all NA updated labels with original labels
for i in na_values:
    diagnosis[i] = group[i]

In [None]:
# Put Image ID together with Diagnosis

for i in range(len(diagnosis)):
    diagnosis[i] = diagnosis[i]+dallan_data['Image Data ID'][i]

In [319]:
# Verify no more NA values and IDs were added correctly

print(diagnosis)

pd.Series(diagnosis).isna().sum()

['CNI7025', 'CNI7024', 'ADI7055', 'ADI7054', 'ADI16548', 'ADI16547', 'ADI24696', 'ADI24697', 'ADI73249', 'ADI73250', 'MCII7273', 'MCII7274', 'MCII16102', 'MCII16101', 'MCII30799', 'MCII30800', 'MCII65892', 'MCII65893', 'MCII130027', 'MCII130026', 'CNI7070', 'CNI7071', 'CNI11738', 'CNI11739', 'CNI24618', 'CNI24617', 'CNI193500', 'CNI193495', 'CNI117777', 'CNI117776', 'MCII8793', 'MCII8794', 'MCII16637', 'MCII16636', 'MCII30818', 'MCII30817', 'MCII55088', 'MCII55089', 'MCII82890', 'MCII82891', 'MCII142392', 'MCII142393', 'ADI7161', 'ADI7162', 'ADI14275', 'ADI14274', 'ADI79672', 'ADI79673', 'CNI7210', 'CNI7211', 'CNI12210', 'CNI12209', 'ADI8460', 'ADI8461', 'ADI14867', 'ADI14868', 'ADI91037', 'ADI91038', 'ADI91071', 'ADI91072', 'CNI7332', 'CNI7331', 'CNI15636', 'CNI15637', 'CNI31067', 'CNI31066', 'CNI84572', 'CNI84573', 'CNI127032', 'CNI127031', 'CNI7294', 'CNI7295', 'CNI8390', 'CNI8389', 'CNI14056', 'CNI14057', 'CNI14423', 'CNI14424', 'CNI26572', 'CNI26573', 'CNI26583', 'CNI26582', 'CNI7

0

Getting Dallan's data into a network-ready format

In [3]:
df_long = pd.read_csv('../2D_Images.csv')
long_paths_list = df_long['Paths']

In [328]:
df2 = populate_labels(paths_list=long_paths_list, labels_list=diagnosis)

In [336]:
df2.to_csv('../2D_Images_updated.csv',index=False)

In [4]:
df2 = pd.read_csv('../2D_Images_updated.csv')

In [5]:
(df_long['Labels'] == df2['Research Group']).sum()

1297430