In this file we are going to perform a stratified undersampling based on the following attributes:
- Age
- Weight
- Sex

**Importing libraries**

In [1]:
import pandas as pd
from pathlib import Path
import os
from tqdm import tqdm
import shutil
import random
import numpy as np

Getting path to the metadata directory

In [4]:
# Directory holding the ADNI metadata CSVs: <project root>/data/adni-metadata,
# where the project root is two levels above the current working directory.
# os.getcwd() is already absolute, so no extra .absolute() calls are needed.
# The result is kept as a plain string because later cells concatenate onto it.
path = os.path.join(str(Path(os.getcwd()).parent.parent), 'data', 'adni-metadata')
print(path)

d:\Projects\ADNI-brain-MRI-alzheimer-classification-and-GAN-generation\data\adni-metadata


Getting the needed attributes

In [5]:
# Load the weight table from the metadata directory; os.path.join avoids
# hard-coding the Windows path separator.
weight_df = pd.read_csv(os.path.join(path, 'weight.csv'))


In [6]:
def weight_to_cat(weight, threshold=90):
    """Map a numeric weight to a two-level category label.

    Parameters
    ----------
    weight : number
        Weight value, in the same unit as ``threshold``.
    threshold : number, optional
        Cut-off separating the two categories (default 90).

    Returns
    -------
    str
        ``'underweight'`` when ``weight < threshold``, otherwise ``'overweight'``.
        A NaN weight compares as not-less-than and therefore yields
        ``'overweight'``.
    """
    return 'underweight' if weight < threshold else 'overweight'

In [7]:
# Replace the raw 'Weight' column with its category label.
weight_df = (
    weight_df
    .assign(weight_cat=weight_df['Weight'].apply(weight_to_cat))
    .drop('Weight', axis=1)
)

In [8]:
# Load the list of downloaded images (it carries the Subject, Sex and Age
# columns used below); os.path.join avoids hard-coding the Windows separator.
age_sex_df = pd.read_csv(os.path.join(path, 'downloaded_images_subject_list.csv'))
age_sex_df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I63897,941_S_1363,MCI,F,70,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/12/2007,NiFTI,5/20/2023
1,I123812,941_S_1311,MCI,M,71,m18,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,9/21/2008,NiFTI,5/20/2023
2,I143861,941_S_1311,MCI,M,71,m24,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/31/2009,NiFTI,5/20/2023
3,I112538,941_S_1311,MCI,M,70,m12,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,6/01/2008,NiFTI,5/20/2023
4,I97327,941_S_1311,MCI,M,69,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/02/2007,NiFTI,5/20/2023


In [9]:
# Subject IDs look like '941_S_1363'; the third '_'-separated field, as an int, is stored as RID.
age_sex_df['RID'] = age_sex_df['Subject'].map(lambda subject_id: int(subject_id.split('_')[2]))

In [10]:
def age_to_cat(age):
    """Bucket an age into one of three labels.

    Returns '<70' for ages below 70, '70-80' for ages in [70, 80), and '>80'
    for everything else. Note that '>80' therefore also covers exactly 80 and
    any value that fails both comparisons (e.g. NaN).
    """
    if age < 70:
        return '<70'
    return '70-80' if 70 <= age < 80 else '>80'

In [11]:
# Label every row with its age bucket.
age_sex_df['Age_cat'] = age_sex_df['Age'].map(age_to_cat)

In [12]:
# Keep only the join key and the two stratification attributes.
age_sex_df = age_sex_df.loc[:, ['RID', 'Age_cat', 'Sex']]

In [13]:
# Inner join: only RIDs present in both tables survive.
df = pd.merge(weight_df, age_sex_df, how='inner', on='RID')

In [14]:
# Collapse to one row per RID, keeping the last merged row for each subject.
df = df.drop_duplicates(subset=['RID'], keep='last')

In [15]:
# Preview the merged table: one row per RID with its weight, age and sex categories.
df.head()

Unnamed: 0,RID,weight_cat,Age_cat,Sex
24,295,underweight,>80,M
49,413,underweight,70-80,F
53,559,overweight,>80,M
54,619,overweight,>80,M
70,685,underweight,>80,F


Getting the combinations of attributes to perform the stratified undersampling

In [16]:
# Every observed (weight_cat, Age_cat, Sex) combination, as a list of dicts.
# value_counts() orders the combinations by descending frequency.
# The columns are selected by name so the dict keys cannot be mismatched by
# column position, and no local variable shadows the builtin `tuple`.
attribute_columns = ['weight_cat', 'Age_cat', 'Sex']
categories = [
    dict(zip(attribute_columns, combination))
    for combination in df[attribute_columns].value_counts().index
]
categories


[{'weight_cat': 'underweight', 'Age_cat': '70-80', 'Sex': 'F'},
 {'weight_cat': 'underweight', 'Age_cat': '70-80', 'Sex': 'M'},
 {'weight_cat': 'underweight', 'Age_cat': '>80', 'Sex': 'M'},
 {'weight_cat': 'underweight', 'Age_cat': '>80', 'Sex': 'F'},
 {'weight_cat': 'overweight', 'Age_cat': '70-80', 'Sex': 'M'},
 {'weight_cat': 'underweight', 'Age_cat': '<70', 'Sex': 'F'},
 {'weight_cat': 'underweight', 'Age_cat': '<70', 'Sex': 'M'},
 {'weight_cat': 'overweight', 'Age_cat': '>80', 'Sex': 'M'},
 {'weight_cat': 'overweight', 'Age_cat': '<70', 'Sex': 'M'},
 {'weight_cat': 'overweight', 'Age_cat': '70-80', 'Sex': 'F'},
 {'weight_cat': 'overweight', 'Age_cat': '>80', 'Sex': 'F'}]

Getting the subjects for each class

In [18]:
# Show the current working directory; the dataset paths in the following cells are built relative to it.
os.getcwd()

'd:\\Projects\\ADNI-brain-MRI-alzheimer-classification-and-GAN-generation\\scripts\\2_class_imbalance_management'

In [19]:
# RIDs of the training subjects listed in subjects_train_ad.txt.
# The file location gets its own variable so that `path` (the metadata
# directory used by the CSV-loading cells) is not overwritten.
subjects_ad_file = os.path.join(
    str(Path(os.getcwd()).parent.parent),
    'adni-images-Full-resized160x192', 'metadata', 'subjects_train_ad.txt',
)
with open(subjects_ad_file, 'r') as f:
    # The third '_'-separated field of each line is parsed as the integer RID.
    # Blank lines (e.g. an empty last line) are skipped instead of raising
    # IndexError.
    subjects_ad = [int(line.strip().split('_')[2]) for line in f if line.strip()]

In [20]:
# RIDs of the training subjects listed in subjects_train_nor.txt.
# The file location gets its own variable so that `path` (the metadata
# directory used by the CSV-loading cells) is not overwritten.
subjects_nc_file = os.path.join(
    str(Path(os.getcwd()).parent.parent),
    'adni-images-Full-resized160x192', 'metadata', 'subjects_train_nor.txt',
)
with open(subjects_nc_file, 'r') as f:
    # The third '_'-separated field of each line is parsed as the integer RID.
    # Blank lines (e.g. an empty last line) are skipped instead of raising
    # IndexError.
    subjects_nc = [int(line.strip().split('_')[2]) for line in f if line.strip()]

In [21]:
# RIDs of the training subjects listed in subjects_train_mci.txt.
# The file location gets its own variable so that `path` (the metadata
# directory used by the CSV-loading cells) is not overwritten.
subjects_mci_file = os.path.join(
    str(Path(os.getcwd()).parent.parent),
    'adni-images-Full-resized160x192', 'metadata', 'subjects_train_mci.txt',
)
with open(subjects_mci_file, 'r') as f:
    # The third '_'-separated field of each line is parsed as the integer RID.
    # Blank lines (e.g. an empty last line) are skipped instead of raising
    # IndexError.
    subjects_mci = [int(line.strip().split('_')[2]) for line in f if line.strip()]

Determining which class has the fewest subjects

In [22]:
# Find the class with the fewest training subjects.
# NOTE: `min` shadows the Python builtin; the name is kept because the cell
# that computes the per-class ratios reads it.
class_sizes = {
    'ad': len(subjects_ad),
    'nor': len(subjects_nc),
    'mci': len(subjects_mci),
}
min = float('inf')
min_class = ''
for class_name, n_subjects in class_sizes.items():
    # Strict '<' means the first class checked wins a tie (order: ad, nor, mci).
    if n_subjects < min:
        min, min_class = n_subjects, class_name

print('Min class: ' + min_class, 'with ' + str(min) + ' subjects')

Min class: ad with 89 subjects


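The minority class is **ad**. Next, we compute for each class the ratio of subjects to keep (minority-class size divided by the class size), so that the majority classes are reduced to roughly the size of the minority class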

In [23]:
# Fraction of each class's subjects to keep: smallest class size / class size
# (1.0 for the smallest class itself). `min` is the subject count found above.
mci_ratio, nc_ratio, ad_ratio = (
    min / len(class_subject_list)
    for class_subject_list in (subjects_mci, subjects_nc, subjects_ad)
)
mci_ratio, nc_ratio, ad_ratio

(0.3296296296296296, 0.5894039735099338, 1.0)

Now, for each of the categories defined earlier we are going to take a random sample of subjects to be kept from the majority classes based on these ratios

Creating empty folders to store the new dataset

In [25]:
# Root folder of the resized image dataset, kept as a string because later
# cells concatenate onto it.
data_path = os.path.join(str(Path(os.getcwd()).parent.parent), 'adni-images-Full-resized160x192')

# Create the output folder. Only "already exists" is tolerated; any other
# failure (missing parent folder, permissions, ...) now surfaces instead of
# being reported as "already exists".
try:
    os.mkdir(os.path.join(data_path, 'train_strat_sub_under'))
except FileExistsError:
    print('train_strat_sub_under already exists')

# Mirror the class sub-folders of the unbalanced training set.
for target in os.listdir(os.path.join(data_path, 'train_unbalanced')):
    try:
        os.mkdir(os.path.join(data_path, 'train_strat_sub_under', target))
    except FileExistsError:
        print('train_strat_sub_under\\' + target + ' already exists')

train_strat_sub_under already exists


In [26]:
# Build the subject-level stratified undersampled training set.
#
# For every class folder in train_unbalanced:
#   * the minority class is copied unchanged;
#   * for the other classes, within each (weight, age, sex) category a random
#     ceil(n * class_ratio) of that class's subjects is kept, and only the
#     images of kept subjects are copied.
# Subjects of a majority class that do not appear in `df` belong to no
# category and are therefore never selected.

# Dedicated, seeded generator so the random selection is reproducible and the
# global `random` state is left untouched.
SAMPLING_SEED = 42
rng = random.Random(SAMPLING_SEED)

source_root = os.path.join(data_path, 'train_unbalanced')
dest_root = os.path.join(data_path, 'train_strat_sub_under')

# Subject list and keep-ratio per class; any other folder name falls back to
# the normal-control values, as before.
class_settings = {
    'ad': (subjects_ad, ad_ratio),
    'mci': (subjects_mci, mci_ratio),
}

# Sorted so classes are always processed (and the generator consumed) in the
# same order.
for target in sorted(os.listdir(source_root)):

    print('Creating ' + target + ' class')

    source_dir = os.path.join(source_root, target)
    dest_dir = os.path.join(dest_root, target)

    if target == min_class:  # No undersampling for the min class
        for image in tqdm(os.listdir(source_dir)):
            shutil.copy(os.path.join(source_dir, image), os.path.join(dest_dir, image))
        continue

    class_subjects, class_ratio = class_settings.get(target, (subjects_nc, nc_ratio))
    # Set for O(1) membership tests instead of scanning a list per subject.
    class_subject_set = set(class_subjects)

    tot_subjects = []
    for category in categories:
        in_category = (
            (df['weight_cat'] == category['weight_cat'])
            & (df['Age_cat'] == category['Age_cat'])
            & (df['Sex'] == category['Sex'])
        )
        subjects = [rid for rid in df.loc[in_category, 'RID'].tolist()
                    if rid in class_subject_set]
        if len(subjects) == 0:
            continue
        rng.shuffle(subjects)
        # Round up so every non-empty category keeps at least one subject.
        n_keep = int(np.ceil(len(subjects) * class_ratio))
        tot_subjects += subjects[:n_keep]

    print('Total subjects: ' + str(len(tot_subjects)))

    kept_subjects = set(tot_subjects)
    for image in tqdm(os.listdir(source_dir)):
        # The fourth '_'-separated field of the file name is parsed as the subject's RID.
        subject = int(image.split('_')[3])

        if subject in kept_subjects:
            shutil.copy(os.path.join(source_dir, image), os.path.join(dest_dir, image))
    

Creating ad class


100%|██████████| 17166/17166 [01:14<00:00, 231.75it/s]


Creating mci class
Total subjects: 94


100%|██████████| 72780/72780 [01:47<00:00, 677.90it/s] 


Creating nor class
Total subjects: 93


100%|██████████| 39689/39689 [01:43<00:00, 383.66it/s] 


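Verifying the result by counting the images in each class of the new dataset. The undersampling balances the number of subjects per class, so the image counts are only approximately the same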

In [27]:
# Print, for each class folder of the new dataset, how many image files it holds.
balanced_root = data_path + '\\train_strat_sub_under'
for target in os.listdir(balanced_root):
    n_images = len(os.listdir(balanced_root + '\\' + target))
    print(target, n_images)

ad 17166
mci 24780
nor 23928
