# Splitting data

In [56]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm

from typing import *

In [5]:
# Input locations, all relative to the notebook's working directory.
# Metadata table that is read with load_metadata.
path_to_csv: str = "./data/Data_Entry_extended.csv"
# Text files that are read line by line with load_file.
path_to_test_list: str = "./data/test_list.txt"
path_to_train_val_list: str = "./data/train_val_list.txt"

In [6]:
# Names of the finding labels. Several of them ('Edema', 'Hernia', ...) appear as
# columns of the metadata table displayed further down.
# NOTE(review): the list is passed to load_metadata, which currently ignores it.
label_classes: List[str] = ['No Finding', 'Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening', 'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

In [7]:
def load_metadata(path: str, class_labels: List[str]) -> pd.DataFrame:
    """Read the metadata CSV at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file, handed to ``pd.read_csv`` unchanged.
        class_labels: Accepted for interface compatibility; the function
            does not read it.

    Returns:
        The parsed CSV with pandas' default settings.
    """
    return pd.read_csv(path)

In [8]:
def load_file(file_path: str) -> List[str]:
    """Return the lines of a text file with trailing whitespace removed.

    Args:
        file_path: Path of the file, opened in text read mode.

    Returns:
        One string per line of the file, in file order. ``str.rstrip`` is
        applied to each line, so the newline and any other trailing
        whitespace are dropped; blank lines are kept as empty strings.
    """
    stripped: List[str] = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            stripped.append(raw_line.rstrip())
    return stripped

In [14]:
# Read the full metadata table. label_classes is passed along, but load_metadata
# does not use it.
df = load_metadata(path_to_csv, label_classes)

In [15]:
# Entries of the train/val list file, one per line; matched against the
# 'Image Index' column in the next step.
train_val_data = load_file(path_to_train_val_list)

In [17]:
# Keep only the metadata rows whose image name appears in the train/val list.
in_train_val = df['Image Index'].isin(train_val_data)
df_train_val = df.loc[in_train_val]

In [20]:
# Show the train/val subset of the metadata table.
df_train_val

Unnamed: 0.1,Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],...,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
0,0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,...,0,0,0,0,0,0,1,0,0,0
1,1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,...,0,1,0,0,0,0,1,0,0,0
2,2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,...,0,0,0,1,0,0,1,0,0,0
3,3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,...,0,0,0,0,0,0,0,0,0,0
12,12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112100,112100,00030789_000.png,Infiltration,0,30789,52,F,PA,2021,2021,...,0,0,0,0,0,0,0,0,0,0
112106,112106,00030793_000.png,Mass|Nodule,0,30793,58,F,PA,2021,2021,...,0,0,0,0,0,0,0,1,1,0
112108,112108,00030795_000.png,Pleural_Thickening,0,30795,53,F,PA,2021,2021,...,0,0,0,0,0,1,0,0,0,0
112114,112114,00030801_000.png,No Finding,0,30801,39,M,PA,2500,2048,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Distinct patient IDs in the train/val subset (np.unique returns them sorted).
# The split that follows is drawn over patients, not over individual images.
unique_patients = np.unique(df_train_val['Patient ID'])

In [25]:
# Seed NumPy's global RNG so the shuffle below is repeatable.
np.random.seed(42)
# Positions into unique_patients, shuffled in place to give a random patient order.
indices = np.arange(len(unique_patients))
np.random.shuffle(indices)

In [35]:
# The first 80 % of the shuffled patient positions (rounded down) form the
# training set; the rest form the validation set.
n_train: int = int(np.floor(0.8 * len(indices)))

training_indices, val_indices = indices[:n_train], indices[n_train:]

train_patients, val_patients = unique_patients[training_indices], unique_patients[val_indices]

# Select rows by patient membership, so all images of one patient land in the
# same split.
is_train_row = df_train_val['Patient ID'].isin(train_patients)
is_val_row = df_train_val['Patient ID'].isin(val_patients)
train_data = df_train_val[is_train_row]
val_data = df_train_val[is_val_row]

In [41]:
# Report the row counts of both splits, then their share of the train/val rows.
print(
    len(train_data),
    len(val_data),
    len(train_data) / len(df_train_val),
    len(val_data) / len(df_train_val),
    sep='\n',
)

68918
17606
0.7965188849336601
0.20348111506633998


In [42]:
# Image names of each split as plain Python lists.
train_list, val_list = (
    split["Image Index"].tolist() for split in (train_data, val_data)
)

In [43]:
# Echo the training image names.
# NOTE(review): this prints the whole list; a slice such as train_list[:10]
# would keep the output short.
train_list

['00000001_000.png',
 '00000001_001.png',
 '00000001_002.png',
 '00000002_000.png',
 '00000004_000.png',
 '00000005_000.png',
 '00000005_001.png',
 '00000005_002.png',
 '00000005_003.png',
 '00000005_004.png',
 '00000005_005.png',
 '00000005_006.png',
 '00000005_007.png',
 '00000006_000.png',
 '00000007_000.png',
 '00000008_000.png',
 '00000008_001.png',
 '00000008_002.png',
 '00000009_000.png',
 '00000010_000.png',
 '00000012_000.png',
 '00000015_000.png',
 '00000017_000.png',
 '00000017_001.png',
 '00000017_002.png',
 '00000018_000.png',
 '00000020_000.png',
 '00000020_001.png',
 '00000020_002.png',
 '00000021_000.png',
 '00000021_001.png',
 '00000022_000.png',
 '00000022_001.png',
 '00000023_000.png',
 '00000023_001.png',
 '00000023_002.png',
 '00000023_003.png',
 '00000023_004.png',
 '00000024_000.png',
 '00000025_000.png',
 '00000026_000.png',
 '00000029_000.png',
 '00000030_000.png',
 '00000030_001.png',
 '00000031_000.png',
 '00000033_000.png',
 '00000034_000.png',
 '00000034_00

In [45]:
def save_file(path_to_file: str, list_to_save: List[str]):
    """Write each string of ``list_to_save`` on its own line of a text file.

    Args:
        path_to_file: Destination path; opened in ``'w'`` mode, so an existing
            file is overwritten.
        list_to_save: Strings to write. A ``'\\n'`` is appended to every entry.
    """
    with open(path_to_file, 'w') as out:
        out.writelines(entry + '\n' for entry in list_to_save)

In [46]:
# Persist the image names of both splits, one name per line.
save_file('./data/train_list.txt', train_list)
save_file('./data/val_list.txt', val_list)

In [49]:
# Entries of the test list file, one per line. Uses the path constant defined
# with the other input paths instead of repeating the literal.
test_list = load_file(path_to_test_list)

In [58]:
# MOVING ALL FILES
# Images start out directly inside base_dir and are moved (os.rename, not
# copied) into one sub-directory per split.
base_dir: str = './data/images_resized'
base_dir_train: str = './data/images_resized/train'
base_dir_val: str = './data/images_resized/val'
base_dir_test: str = './data/images_resized/test'

for split_dir, split_images in (
    (base_dir_train, train_list),
    (base_dir_val, val_list),
    (base_dir_test, test_list),
):
    # exist_ok=True makes the cell safe to re-run, also after a partial run
    # that created only some of the directories.
    os.makedirs(split_dir, exist_ok=True)

    for img_name in tqdm(split_images):
        destination = os.path.join(split_dir, img_name)
        # Already moved by an earlier (possibly interrupted) run: nothing to do.
        if os.path.exists(destination):
            continue
        # Raises FileNotFoundError if the image is in neither location, so a
        # missing file is not silently ignored.
        os.rename(os.path.join(base_dir, img_name), destination)

100%|██████████| 68918/68918 [00:46<00:00, 1470.83it/s]
100%|██████████| 17606/17606 [00:12<00:00, 1370.93it/s]
100%|██████████| 25596/25596 [00:19<00:00, 1346.93it/s]
