In [8]:
#load data
import pandas as pd

path = r'Z:/03-Lab-meeting/2024-Tutorial/TADPOLE/TADPOLE_D1_D2.csv'
#low_memory=False makes pandas infer each column's dtype from the whole file instead of
#chunk by chunk, the documented remedy for mixed-type DtypeWarning on wide CSV files
#NOTE(review): assumes the warning emitted by this cell is the mixed-dtype one - confirm
data = pd.read_csv(path, low_memory=False)

  data = pd.read_csv(path)


In [9]:
#collapse the EMCI and LMCI baseline labels into a single MCI label
data['DX_bl'] = data['DX_bl'].replace({'EMCI': 'MCI', 'LMCI': 'MCI'})

#keep only the rows whose baseline diagnosis is something other than SMC
data = data[data['DX_bl'].ne('SMC')]

In [10]:
#cortical region volumes from the Desikan atlas: these four columns are left out
columns_exclude = [
    'ST123CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'ST22CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'ST64CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'ST81CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16'
]

#a column qualifies when its name contains 'CV_UCSFFSL' and it is not excluded above
cv_columns = list(
    filter(
        lambda name: 'CV_UCSFFSL' in name and name not in columns_exclude,
        data.columns,
    )
)

#divide each region volume by the baseline intracranial volume (ICV_bl), in place on `data`
for col in cv_columns:
    data[col] = data[col].div(data['ICV_bl'])

#identifier/diagnosis columns first, followed by the normalized volume columns
selected_columns = ['RID', 'VISCODE', 'DX_bl', 'DX', 'DXCHANGE', 'ICV', *cv_columns]
filtered_data = data[selected_columns]

print(filtered_data)

        RID VISCODE DX_bl        DX  DXCHANGE        ICV  \
0         2      bl    CN        NL       1.0  1984660.0   
1         3      bl    AD  Dementia       3.0  1920690.0   
2         3     m06    AD  Dementia       3.0  1906430.0   
3         3     m12    AD  Dementia       3.0  1903820.0   
4         3     m24    AD  Dementia       3.0  1903420.0   
...     ...     ...   ...       ...       ...        ...   
12736  4167     m60   MCI       NaN       NaN        NaN   
12737  4199     m60   MCI       NaN       NaN        NaN   
12738  4557     m60   MCI       NaN       NaN        NaN   
12739  4512     m48   MCI       MCI       NaN        NaN   
12740  2380     m72   MCI       MCI       NaN        NaN   

       ST102CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16  \
0                                                 NaN   
1                                            0.001698   
2                                                 NaN   
3                                            0.0015

In [11]:
#keep a patient (RID) only when two or more rows, i.e. visits, belong to it
filtered_data = filtered_data.groupby('RID').filter(lambda visits: visits.shape[0] >= 2)

#use only sessions where patients visited at six-month interval
def convert_viscode(value):
    """Convert a visit code to a number of months where possible.

    Parameters
    ----------
    value : object
        A visit code such as 'bl' or 'm06'; non-string values (for example a
        missing code read as NaN) are accepted as well.

    Returns
    -------
    object
        0 for 'bl', the integer after the leading 'm' for codes of the form
        'm<digits>', and `value` itself, unchanged, for anything else.
    """
    #missing codes arrive as NaN/None; calling .startswith on them would raise
    #AttributeError, so pass them through like any other unrecognized code
    if not isinstance(value, str):
        return value
    if value == 'bl':
        return 0
    if value.startswith('m') and value[1:].isdigit():
        return int(value[1:])
    return value
#rewrite the VISCODE column element by element through convert_viscode
filtered_data['VISCODE'] = filtered_data['VISCODE'].map(convert_viscode)

def trim_to_six_month_intervals(group):
    """Keep the leading run of visits that follows the 0, 6, 12, ... month schedule.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one patient, with a numeric 'VISCODE' column (visit month).

    Returns
    -------
    pandas.DataFrame
        `group` unchanged when its distinct visit months are exactly
        0, 6, ..., max. Otherwise only the rows whose VISCODE is not later than
        the last on-schedule visit before the first gap or off-schedule month.
        No rows at all when the earliest visit is not month 0.
    """
    viscodes = sorted(group['VISCODE'].unique())
    if not viscodes:
        #nothing to trim; max() below would raise on an empty list
        return group

    expected_viscodes = list(range(0, max(viscodes) + 1, 6))

    if viscodes == expected_viscodes:
        return group

    for i, viscode in enumerate(viscodes):
        if i >= len(expected_viscodes) or viscode != expected_viscodes[i]:
            if i == 0:
                #the very first visit is already off schedule (no month-0 visit).
                #expected_viscodes[i - 1] would wrap around to the LAST expected
                #month and keep nearly every row, so return an empty slice instead
                return group.iloc[0:0]
            return group[group['VISCODE'] <= expected_viscodes[i - 1]]
    return group

#trim every patient's visits separately. Iterating over the groups hands the helper the
#full sub-frame (RID column included); groupby.apply operating on the grouping column is
#deprecated and is slated to stop passing that column to the applied function.
trimmed_groups = [
    trim_to_six_month_intervals(group)
    for _, group in filtered_data.groupby('RID')
]

#pd.concat rejects an empty list, so fall back to an empty frame when there are no groups;
#ignore_index=True gives the same fresh 0..n-1 index as the former reset_index(drop=True)
if trimmed_groups:
    filtered_data = pd.concat(trimmed_groups, ignore_index=True)
else:
    filtered_data = filtered_data.iloc[0:0]

print(filtered_data)

       RID  VISCODE DX_bl        DX  DXCHANGE        ICV  \
0        2        0    CN        NL       1.0  1984660.0   
1        2        6    CN        NL       1.0        NaN   
2        3        0    AD  Dementia       3.0  1920690.0   
3        3        6    AD  Dementia       3.0  1906430.0   
4        3       12    AD  Dementia       3.0  1903820.0   
...    ...      ...   ...       ...       ...        ...   
6454  5251        0    AD  Dementia       3.0  1267190.0   
6455  5251        6    AD  Dementia       3.0        NaN   
6456  5252        0    AD  Dementia       3.0  1672300.0   
6457  5252        6    AD  Dementia       3.0  1672830.0   
6458  5275        0    AD  Dementia       3.0  1748210.0   

      ST102CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16  \
0                                                NaN   
1                                                NaN   
2                                           0.001698   
3                                                NaN   

  filtered_data = filtered_data.groupby('RID').apply(trim_to_six_month_intervals)


In [12]:
#for patients whose diagnosis changed, drop any data beyond the point of diagnosis change
#(a row counts as a diagnosis change when its DXCHANGE value is one of these codes)
target_dxchange = [4, 5, 6, 7, 8, 9]
def filter_group(group):
    """Sort one patient's rows by VISCODE and cut them off after the last diagnosis change.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one patient, with 'VISCODE' and 'DXCHANGE' columns.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by VISCODE. When at least one row has a DXCHANGE value in
        `target_dxchange`, only the rows up to and including the last such row
        (in sorted order) are returned; otherwise all sorted rows are returned.
    """
    ordered = group.sort_values(by='VISCODE')
    changed_rows = ordered.loc[ordered['DXCHANGE'].isin(target_dxchange)]
    last_change = changed_rows.last_valid_index()
    if last_change is None:
        return ordered
    #label-based slice: everything from the first sorted row through the last change
    return ordered.loc[:last_change]

#cut each patient's history separately. Iterating over the groups hands filter_group the
#full sub-frame (RID column included); groupby.apply operating on the grouping column is
#deprecated and is slated to stop passing that column to the applied function.
cut_groups = [
    filter_group(group)
    for _, group in filtered_data.groupby('RID')
]

#pd.concat rejects an empty list, so fall back to an empty frame when there are no groups;
#ignore_index=True gives the same fresh 0..n-1 index as the former reset_index(drop=True)
if cut_groups:
    filtered_data = pd.concat(cut_groups, ignore_index=True)
else:
    filtered_data = filtered_data.iloc[0:0]
print(filtered_data)

       RID  VISCODE DX_bl        DX  DXCHANGE        ICV  \
0        2        0    CN        NL       1.0  1984660.0   
1        2        6    CN        NL       1.0        NaN   
2        3        0    AD  Dementia       3.0  1920690.0   
3        3        6    AD  Dementia       3.0  1906430.0   
4        3       12    AD  Dementia       3.0  1903820.0   
...    ...      ...   ...       ...       ...        ...   
5691  5251        0    AD  Dementia       3.0  1267190.0   
5692  5251        6    AD  Dementia       3.0        NaN   
5693  5252        0    AD  Dementia       3.0  1672300.0   
5694  5252        6    AD  Dementia       3.0  1672830.0   
5695  5275        0    AD  Dementia       3.0  1748210.0   

      ST102CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16  \
0                                                NaN   
1                                                NaN   
2                                           0.001698   
3                                                NaN   

  filtered_data = filtered_data.groupby('RID').apply(filter_group).reset_index(drop=True)


In [13]:
#drop patients left with a single row after the previous steps, then replace every
#remaining NaN, in any column, with 0
filtered_data = (
    filtered_data
    .groupby('RID')
    .filter(lambda visits: visits.shape[0] >= 2)
    .fillna(0)
)

print(filtered_data)

       RID  VISCODE DX_bl        DX  DXCHANGE        ICV  \
0        2        0    CN        NL       1.0  1984660.0   
1        2        6    CN        NL       1.0        0.0   
2        3        0    AD  Dementia       3.0  1920690.0   
3        3        6    AD  Dementia       3.0  1906430.0   
4        3       12    AD  Dementia       3.0  1903820.0   
...    ...      ...   ...       ...       ...        ...   
5690  5231        6    AD  Dementia       3.0        0.0   
5691  5251        0    AD  Dementia       3.0  1267190.0   
5692  5251        6    AD  Dementia       3.0        0.0   
5693  5252        0    AD  Dementia       3.0  1672300.0   
5694  5252        6    AD  Dementia       3.0  1672830.0   

      ST102CV_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16  \
0                                           0.000000   
1                                           0.000000   
2                                           0.001698   
3                                           0.000000   

In [14]:
#write the preprocessed table to the working directory, without the row index
filtered_data.to_csv(path_or_buf='preprocessed_data.csv', index=False)

1. What is the difference between longitudinal and cross-sectional data, and why is longitudinal data required for this task?
- Cross-sectional data는 특정 시점에 한 번 수집된 데이터이고, longitudinal data는 동일한 대상을 시간에 따라 반복해서 수집한 데이터이다.
우리는 각 환자가 미래에 어떻게 변하는지 보고자 하므로 longitudinal data가 필요하다.

2. Why do we need to normalize volume values using ICV?
- 환자마다 머리 크기가 다르기 때문이다. ICV로 정규화하여 개인 간 크기 차이를 보정한다.

3. How many cortical regions are included after filtering?
-