The purpose of this notebook is to pair MRI and PET scans that were collected within 30 days of each other for each patient. The pairs are then grouped by the A, T, and N markers, and each group is split into training, validation, and testing sets.

The data fed into this notebook is not included in the repository and must be acquired from the ADNI database. Some parts of this code may need to be changed to properly process your own data stored in CSV files.

## Setup

In [None]:
import os
os.environ['MPLCONFIGDIR'] = "/tmp/matplotlib/graph"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
import math
from math import ceil, floor
import shutil
import random

## MRI-PET Pairing

In [None]:
# calculate the absolute number of days between two study dates
def date_diff(date1, date2):
    """Return the absolute gap, in whole days, between two dates.

    Both arguments are strings in month/day/year form ('%m/%d/%Y'),
    for example '1/31/2020'. The order of the arguments does not matter.
    """
    fmt = '%m/%d/%Y'
    first, second = (datetime.strptime(text, fmt).date() for text in (date1, date2))
    return abs((first - second).days)

# scan the total subject list and output the list of pairs between MRI and PET for each subject
def pair(file, time=30):
    '''
    Pair every PET study date of a subject with each MRI study date of the
    same subject that lies within `time` days of it (inclusive).

    file: path of a CSV with the columns 'Subject ID', 'Modality'
          ('MRI' or 'PET'), 'Study Date' (month/day/year strings) and
          'Imaging Protocol'.
    time: largest accepted gap, in days, between the two study dates.

    Returns a DataFrame with one row per accepted pair and the columns
    'Subject ID', 'MRI Date', 'PET Date', 'PET Type' and 'Interval (day)',
    followed by empty (NaN) placeholder columns for image IDs and the PET
    manufacturer. Only PET studies whose tracer is 18F-AV45 (type 'A'),
    18F-AV1451 (type 'T') or 18F-FDG (type 'N') produce rows; any other
    tracer is reported with a printed message and skipped. The printed total
    counts every candidate pair, including the skipped ones.
    '''
    pair_columns = ['Subject ID', 'MRI Date', 'PET Date', 'PET Type', 'Interval (day)']
    placeholder_columns = [
        'MRI Image ID (Original)',
        'PET Image ID (Dynamic)',
        'PET Image ID (Averaged)',
        'PET Image ID (Standardized)',
        'PET Image ID (Uniform)',
        'PET Manufacturer',
    ]
    # radiopharmaceutical -> marker letter
    pet_type_by_radio = {'18F-AV45': 'A', '18F-AV1451': 'T', '18F-FDG': 'N'}

    # read and prepare the data: drop the key prefixes so only the value remains
    df_total = pd.read_csv(file)
    protocol = df_total['Imaging Protocol']
    for tag in ('Weighting=', 'Radiopharmaceutical='):
        protocol = protocol.str.replace(tag, '', regex=False)
    df_total['Imaging Protocol'] = protocol

    interval_threshold = time  # the threshold for date difference
    list_pair = []
    num_pair = 0
    num_pair_type = dict.fromkeys(pet_type_by_radio.values(), 0)

    for subject_id in tqdm(df_total['Subject ID'].unique()):
        df_subject = df_total.loc[df_total['Subject ID'] == subject_id]
        df_subject_mri = df_subject.loc[df_subject['Modality'] == 'MRI']
        df_subject_pet = df_subject.loc[df_subject['Modality'] == 'PET']
        list_mri_date = df_subject_mri['Study Date'].unique()

        for pet_date in df_subject_pet['Study Date'].unique():
            # the tracer is taken from the first PET row recorded for this date
            same_date = df_subject_pet['Study Date'] == pet_date
            pet_radio = df_subject_pet.loc[same_date, 'Imaging Protocol'].iat[0]
            pet_type = pet_type_by_radio.get(pet_radio)
            warned = False

            for mri_date in list_mri_date:
                interval = date_diff(pet_date, mri_date)
                if interval > interval_threshold:
                    continue
                num_pair += 1
                if pet_type is None:
                    # unknown tracer: report it once per PET date instead of
                    # dropping the pair silently
                    if not warned:
                        print('PET type needs to be checked for subject {0} on study day {1}!'.format(subject_id, pet_date))
                        warned = True
                    continue
                num_pair_type[pet_type] += 1
                list_pair.append([subject_id, mri_date, pet_date, pet_type, interval])

    print('Interval = {0} days. Found {1} pairs: {2} A, {3} T, {4} N'.format(
        time, num_pair, num_pair_type['A'], num_pair_type['T'], num_pair_type['N']))

    # passing the column names here also works when no pair was found
    df_pair = pd.DataFrame(list_pair, columns=pair_columns)

    # insert some empty columns in df_pair; object dtype so that string IDs
    # can be written into them later without a dtype conflict
    for column in placeholder_columns:
        df_pair[column] = pd.Series(np.nan, index=df_pair.index, dtype=object)
    return df_pair

In [None]:
# Pairing MRI and PET using specific interval thresholds

# CSV listing the MRI and PET studies of every subject; it is read by pair()
file = 'csv/original/Total_original_12_25_2020.csv'

# keep the MRI/PET pairs whose study dates are at most 30 days apart
df_pair = pair(file, time=30)
df_pair  # last expression of the cell: shows the pairing table

## Image Matching

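Strip the leading zeros from the month and day of every acquisition date (e.g. `01/05/2020` becomes `1/5/2020`) in the MRI and PET CSV files, so that the dates can be compared as strings with the study dates in the pairing table.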

In [None]:
def _strip_zero_padding(date):
    '''
    Return a month/day/year string with one leading zero removed from the
    month and from the day, e.g. '01/05/2020' -> '1/5/2020'.
    Values that are not strings (missing dates) are returned unchanged.
    '''
    if not isinstance(date, str):
        return date
    month, day, year = date.split('/')[:3]
    if month.startswith('0'):
        month = month[1:]
    if day.startswith('0'):
        day = day[1:]
    return month + '/' + day + '/' + year


def form_date(file):
    '''
    Rewrite the 'Acq Date' column of the CSV at `file` without zero padding
    and save the result back to the same path (the file is overwritten).
    '''
    df = pd.read_csv(file)
    # one pass over the column instead of a per-row .loc assignment
    df['Acq Date'] = df['Acq Date'].map(_strip_zero_padding)
    df.to_csv(file, index=False)

In [None]:
# rewrite the 'Acq Date' column of each CSV; form_date saves the result
# back to the same path, so the three files are modified on disk
form_date('csv/original/MRI_original_1_10_2021.csv')
form_date('csv/original/PET_pre-processed_1_16_2021.csv')
form_date('csv/original/PET_manufacturer_3_10_2021.csv')

In [None]:
def _pet_id_column(description):
    '''
    Return the df_pair column that stores the image ID of a PET volume with
    this description, or None when the description is not one of the four
    processing levels that are tracked.
    '''
    if 'Co-registered Dynamic' in description:
        return 'PET Image ID (Dynamic)'
    if 'Co-registered, Averaged' in description:
        return 'PET Image ID (Averaged)'
    if ('Coreg, Avg, Standardized Image and Voxel Size' in description) and ('Early' not in description):
        return 'PET Image ID (Standardized)'
    if ('Coreg, Avg, Std Img and Vox Siz, Uniform Resolution' in description) and ('Early' not in description):
        return 'PET Image ID (Uniform)'
    return None


def match_pet_id(df_pet_, df_pair_):
    '''
    Scan all collected PET volumes (listed in df_pet_);
    If subject id and study date are matched in df_pair_, add image id to df_pair_.

    df_pet_ needs the columns 'Subject', 'Acq Date', 'Description' and
    'Image Data ID'; it is not modified. df_pair_ is updated in place (its
    index is reset at the end) and nothing is returned.
    Volumes are visited sorted by subject, date and description; when several
    volumes target the same cell, the one visited last is kept.
    '''
    id_columns = [
        'PET Image ID (Dynamic)',
        'PET Image ID (Averaged)',
        'PET Image ID (Standardized)',
        'PET Image ID (Uniform)',
    ]
    # the ID columns may start as float NaN placeholders; cast them so that
    # string image IDs can be stored without a dtype conflict
    for column in id_columns:
        if column in df_pair_.columns:
            df_pair_[column] = df_pair_[column].astype(object)

    # sort values by 'Subject', 'Acq Date' and 'Description'; sort_values
    # returns a new frame, so the loop must iterate over that result
    df_pet_sorted = df_pet_.sort_values(by=['Subject', 'Acq Date', 'Description'])

    for index, row in tqdm(df_pet_sorted.iterrows()):
        description = row['Description']
        if not isinstance(description, str):
            continue  # missing description: nothing to classify
        column = _pet_id_column(description)
        if column is None:
            continue
        # rows of df_pair_ with the same subject and PET study date
        matched = (df_pair_['Subject ID']==row['Subject']) & (df_pair_['PET Date']==row['Acq Date'])
        if matched.any():
            df_pair_.loc[matched, column] = row['Image Data ID']
    df_pair_.reset_index(drop=True, inplace=True)

In [None]:
# table of PET volumes; match_pet_id reads its 'Subject', 'Acq Date',
# 'Description' and 'Image Data ID' columns
df_pet = pd.read_csv('csv/original/PET_pre-processed_1_16_2021.csv')

In [None]:
# Match PET image IDs for df_pair
# match_pet_id updates df_pair in place and returns nothing
match_pet_id(df_pet, df_pair)
df_pair  # last expression of the cell: shows the updated table

In [None]:
df_mri = pd.read_csv('csv/original/MRI_original_1_10_2021.csv')

# number of MRI volumes per 'Description' value, counted in a single pass
desc_counts = df_mri['Description'].value_counts(sort=False)
des_list = [[des, int(count)] for des, count in desc_counts.items()]

# most frequent description first; descriptions with equal counts are ordered
# alphabetically so that the row order is the same on every run
df_mri_des = pd.DataFrame(des_list, columns=['MRI Description', 'count'])
df_mri_des.sort_values(by=['count', 'MRI Description'], ascending=[False, True], inplace=True, ignore_index=True)

In [None]:
def match_mri_id(df_mri_, df_pair_, desc):
    '''
    Scan all collected MRI volumes (listed in df_mri_);
    If subject id and study date are matched in df_pair_, add image id to df_pair_.

    Only volumes whose 'Description' equals `desc` are considered, and only
    pairs whose 'MRI Image ID (Original)' is still empty are filled, so an ID
    written earlier is never replaced. When several volumes fit the same
    pair, the first one in the row order of df_mri_ is used.
    df_pair_ is updated in place (its index is reset at the end); nothing is
    returned.
    '''
    id_column = 'MRI Image ID (Original)'
    # the ID column may start as a float NaN placeholder; cast it so that
    # string image IDs can be stored without a dtype conflict
    if id_column in df_pair_.columns:
        df_pair_[id_column] = df_pair_[id_column].astype(object)

    df_mri_desc = df_mri_.loc[df_mri_['Description']==desc]

    for index, row in df_mri_desc.iterrows():
        # rows of df_pair_ with the same subject and MRI study date
        matched = (df_pair_['Subject ID']==row['Subject']) & (df_pair_['MRI Date']==row['Acq Date'])
        # .any() is False for an empty selection, which covers "no match"
        if df_pair_.loc[matched, id_column].isnull().any():
            df_pair_.loc[matched, id_column] = row['Image Data ID']
    df_pair_.reset_index(drop=True, inplace=True)

In [None]:
# match MRI volumes through different Descriptions.

# descriptions are tried in the row order of df_mri_des; match_mri_id only
# fills pairs that are still empty, so earlier descriptions take priority
for des in df_mri_des['MRI Description']:
    match_mri_id(df_mri, df_pair, des)

## Add PET Manufacturer Info to df_pair

In [None]:
df_pet_vendor = pd.read_csv('csv/original/PET_manufacturer_3_10_2021.csv')
df_pet_vendor.dropna(subset=['Imaging Protocol'], inplace=True)

# the column may still be a float NaN placeholder; cast it once so that
# manufacturer names (strings) can be stored without a dtype conflict
df_pair['PET Manufacturer'] = df_pair['PET Manufacturer'].astype(object)

marker = 'Manufacturer='
for i, r in tqdm(df_pair.iterrows()):
    # skip pairs that already have a manufacturer
    if not df_pair.loc[[i], 'PET Manufacturer'].isnull().values.any():
        continue
    # vendor rows of the same subject acquired on the PET date of this pair
    same_scan = (df_pet_vendor['Subject ID']==r['Subject ID']) & (df_pet_vendor['Acq Date']==r['PET Date'])
    protocols = df_pet_vendor.loc[same_scan, 'Imaging Protocol']
    if protocols.empty:
        continue
    mfg_info = protocols.iat[0]  # only the first matching row is used
    # the manufacturer is everything after the first 'Manufacturer=' tag
    _, found, manufacturer = mfg_info.partition(marker)
    if found:
        df_pair.loc[[i], 'PET Manufacturer'] = manufacturer
    else:
        print('no manufacturer info.')
                

## Delete rows with NaN and reset index

In [None]:
# columns kept in the final pairing table
keep_columns = [
    'Subject ID',
    'MRI Date',
    'PET Date',
    'PET Type',
    'Interval (day)',
    'MRI Image ID (Original)',
    'PET Image ID (Standardized)',
    'PET Manufacturer',
]

# drop every pair with a missing value in one of the kept columns and renumber
# the rows; chaining builds a new frame instead of modifying a column-subset
# slice in place
df_pair = df_pair[keep_columns].dropna().reset_index(drop=True)
df_pair

## Shuffle df_pair based on subjects in order to split dataset

In [None]:
# Reorder df_pair so that the rows of one subject stay together while the
# subjects themselves appear in a seeded pseudo-random order.

# sort the IDs before shuffling: the seeded shuffle is only reproducible when
# its input order is (the iteration order of a set of strings is not)
subject_list = sorted(df_pair['Subject ID'].unique())  # the list of all subject IDs
random.Random(1).shuffle(subject_list)  # use seed to shuffle the list to get the same order every time

# position of every subject in the shuffled list
subject_rank = {subject: position for position, subject in enumerate(subject_list)}

# a stable sort on that position groups the rows by subject and keeps the
# original row order inside each subject; this replaces the row-by-row
# DataFrame.append, which no longer exists in pandas 2
row_order = np.argsort(df_pair['Subject ID'].map(subject_rank).to_numpy(), kind='stable')
df_pair_shuffle = df_pair.iloc[row_order].reset_index(drop=True)
df_pair = df_pair_shuffle
df_pair

## Split Data into PET_A, PET_T, PET_N

In [None]:
# one table per PET type, each renumbered from 0; reset_index is chained so
# that a new frame is built instead of modifying a slice of df_pair in place
df_pair_A = df_pair.loc[df_pair['PET Type']=='A'].reset_index(drop=True)

df_pair_T = df_pair.loc[df_pair['PET Type']=='T'].reset_index(drop=True)

df_pair_N = df_pair.loc[df_pair['PET Type']=='N'].reset_index(drop=True)

## Split dataset into training, validation and testing

In [None]:
# split the dataset based on the table above

train_percent = 0.70
validation_percent = 0.10
test_percent = 0.20  # not used in the computation: the test set is what remains


def _split_and_save(df_marker, marker, out_dir='csv/generated'):
    '''
    Cut df_marker by row position into train / validation / test parts and
    write each part to '<out_dir>/<marker>_<train|val|test>.csv'.

    The first int(len * train_percent) rows form the training set, the next
    int(len * validation_percent) rows the validation set, and all remaining
    rows the test set. Returns the three parts, each with a fresh index.
    '''
    # NOTE(review): the cut is made on rows, so a subject whose rows straddle
    # a cut point ends up in two splits -- confirm whether the splits must be
    # disjoint at subject level.
    n_train = int(len(df_marker) * train_percent)
    n_val = int(len(df_marker) * validation_percent)
    parts = {
        'train': df_marker[:n_train],
        'val': df_marker[n_train:n_train + n_val],
        'test': df_marker[n_train + n_val:],
    }

    os.makedirs(out_dir, exist_ok=True)  # to_csv does not create directories
    saved = []
    for split_name, part in parts.items():
        part = part.reset_index(drop=True)  # new frame, the slice is not modified
        part.to_csv('{0}/{1}_{2}.csv'.format(out_dir, marker, split_name), index=False)
        saved.append(part)
    return tuple(saved)


A_train, A_val, A_test = _split_and_save(df_pair_A, 'A')
A_train_size, A_validation_size = len(A_train), len(A_val)

T_train, T_val, T_test = _split_and_save(df_pair_T, 'T')
T_train_size, T_validation_size = len(T_train), len(T_val)

N_train, N_val, N_test = _split_and_save(df_pair_N, 'N')
N_train_size, N_validation_size = len(N_train), len(N_val)