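This notebook splits the labelled image data into train and test sets. The split is done by patient, so all images from one patient end up in the same set (no separate validation set is written here).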

In [1]:
# Root directory of the project data; change this to your local path.
# The label CSV is read from this directory and the train/test filepath CSVs are written to it.
ROOT_DIR = "/home/andreasabo/Documents/HNProject"

In [2]:
# Selects which subset of the images is treated as the full dataset to split.
# Supported values:
#   "at_least_one_target_and_labelled_view": the view label is neither "Missing" nor "Other"
#       AND at least one of the function/reflux/surgery labels is not "Missing"
#   "labelled_view": the view label is neither "Missing" nor "Other"
data_set_to_split_on = "at_least_one_target_and_labelled_view"
# data_set_to_split_on = "labelled_view"

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

In [4]:
# Load the label table from the project root.
# NOTE(review): appears to hold one row per image (see the columns printed below) - confirm.
csv_path = os.path.join(ROOT_DIR, "all_label_df.csv")
df = pd.read_csv(csv_path)

In [5]:
# Pick the rows that make up the full dataset to be split.
# Both options require a usable view label (neither "Missing" nor "Other").
# Add a new `elif` branch here when further dataset definitions are introduced.
if data_set_to_split_on == "labelled_view":
    frame_to_split = df[~df.view_label.isin(["Missing", "Other"])]
elif data_set_to_split_on == "at_least_one_target_and_labelled_view":
    labelled_view_df = df[~df.view_label.isin(["Missing", "Other"])]
    # Drop only the rows where all three target labels are missing,
    # i.e. keep rows that have at least one target label.
    frame_to_split = labelled_view_df[
        ~(
            (labelled_view_df.function_label == "Missing")
            & (labelled_view_df.reflux_label == "Missing")
            & (labelled_view_df.surgery_label == "Missing")
        )
    ]
else:
    raise Exception("Invalid data to split on!")


# Sanity check: report image, scan and patient totals so we can confirm
# the selected dataset is the one we expect.
frame_to_split_count = frame_to_split.groupby('subj_id')['scan_num'].agg(['nunique'])
print(f"There are {len(frame_to_split)} from {frame_to_split_count['nunique'].sum()} scans from {frame_to_split.subj_id.nunique()} patients in this dataframe.")
print(frame_to_split.columns)

There are 9581 from 230 scans from 71 patients in this dataframe.
Index(['Unnamed: 0', 'num_in_seq', 'function_label', 'image_ids', 'image_manu',
       'reflux_label', 'surgery_label', 'view_label', 'subj_id', 'scan_num',
       'image_num'],
      dtype='object')


In [6]:
# Per-patient image totals, plus each patient's share of all images in the dataset
patient_scan_details = frame_to_split.groupby('subj_id')['image_num'].agg(img_count='count')
patient_scan_details['img_percent_count'] = patient_scan_details['img_count'].div(
    patient_scan_details['img_count'].sum()
)

# For every scanner manufacturer, add a column holding the number of images
# each patient has from that manufacturer (raw counts, not percentages).
# Columns are added in descending order of overall manufacturer frequency.
machine_labels = dict(frame_to_split['image_manu'].value_counts())
for machine in machine_labels:
    this_machine = frame_to_split.loc[frame_to_split['image_manu'] == machine]
    this_machine_count = this_machine.groupby('subj_id')['image_manu'].count()
    patient_scan_details[machine] = this_machine_count

# Patients with no images from a given manufacturer come out as NaN; use 0 instead
patient_scan_details = patient_scan_details.fillna(0)
print(patient_scan_details.sum(axis = 0))

img_count                   9581.0
img_percent_count              1.0
Philips Medical Systems     5119.0
TOSHIBA_MEC                 3599.0
ATL                          260.0
ACUSON                       219.0
SAMSUNG MEDISON CO.,LTD.     133.0
TOSHIBA_MEC_US               125.0
GE Medical Systems            83.0
GE Healthcare                 43.0
dtype: float64


In [7]:
# Preview the first few rows of the selected dataset
frame_to_split.head()

Unnamed: 0.1,Unnamed: 0,num_in_seq,function_label,image_ids,image_manu,reflux_label,surgery_label,view_label,subj_id,scan_num,image_num
72,72,30,41,1068_1_31,TOSHIBA_MEC,No,No,Saggital_Right,1068,1,31
75,75,33,41,1068_1_34,TOSHIBA_MEC,No,No,Saggital_Right,1068,1,34
76,76,34,41,1068_1_35,TOSHIBA_MEC,No,No,Saggital_Right,1068,1,35
77,77,35,41,1068_1_36,TOSHIBA_MEC,No,No,Saggital_Right,1068,1,36
78,78,36,41,1068_1_37,TOSHIBA_MEC,No,No,Saggital_Right,1068,1,37


In [None]:
def calculate_scanner_proportion_diff(patient_scan_details, train, test):
    """Measure how differently the per-patient columns are distributed in train vs. test.

    For each of the two patient groups, every column of ``patient_scan_details``
    is summed over the group's patients and divided by the group's total
    ``img_count``. The result is the sum of the absolute differences between
    the two resulting proportion vectors.

    Args:
        patient_scan_details: DataFrame indexed by patient id with an
            ``img_count`` column and the other per-patient count columns.
        train: collection of patient ids in the training set.
        test: collection of patient ids in the test set.

    Returns:
        The summed absolute difference in proportions (0 means identical).
    """
    def _column_proportions(subject_ids):
        # Rows belonging to this group of patients
        rows = patient_scan_details.loc[patient_scan_details.index.isin(subject_ids)]
        column_totals = rows.sum(axis=0)
        return column_totals / rows['img_count'].sum()

    return (_column_proportions(train) - _column_proportions(test)).abs().sum()

def calculate_outcome_proportion_diff(df, train, test):
    """Measure how differently the outcome labels are distributed in train vs. test.

    The score is the sum of three terms:

    * ``reflux_label`` and ``surgery_label``: half the summed absolute
      difference between the per-category image fractions of the two sets.
      A category that occurs in only one set counts with a fraction of 0 in
      the other set.
    * ``function_label``: the absolute difference between the fractions of
      images whose value is ``"Missing"`` (only missing vs. non-missing
      matters for this label).

    Fractions are taken relative to the number of rows (images) in each set.

    Args:
        df: DataFrame with ``subj_id``, ``reflux_label``, ``surgery_label``
            and ``function_label`` columns.
        train: collection of patient ids in the training set.
        test: collection of patient ids in the test set.

    Returns:
        The summed difference over the three outcomes (0 means identical).

    Raises:
        ValueError: if either set contains no rows, since the fractions
            would be undefined.
    """
    train_set = df[df.subj_id.isin(train)]
    test_set = df[df.subj_id.isin(test)]

    train_count = len(train_set)
    test_count = len(test_set)
    if train_count == 0 or test_count == 0:
        raise ValueError("Both the train and test sets must contain at least one image.")

    outcomes = ['reflux_label', 'surgery_label', 'function_label']

    diff_sum = 0
    for outcome in outcomes:
        if outcome == "function_label":
            # For function, we are only interested in missing vs. non-missing.
            # Counting matches directly yields 0 when a set has no "Missing"
            # rows, instead of an empty selection that turns into NaN or an error.
            train_missing_frac = (train_set[outcome] == "Missing").sum() / train_count
            test_missing_frac = (test_set[outcome] == "Missing").sum() / test_count

            diff_sum += abs(train_missing_frac - test_missing_frac)
        else:
            train_fracs = train_set[outcome].value_counts() / train_count
            test_fracs = test_set[outcome].value_counts() / test_count

            # fill_value=0 keeps categories that appear in only one of the sets;
            # a plain subtraction would turn them into NaN and .sum() would skip them.
            diff_sum += train_fracs.sub(test_fracs, fill_value=0).abs().sum() / 2

    return diff_sum


def select_split(df, patient_scan_details, test_percentage=0.3, num_trials=25000):
    """Search random patient-level splits and return the most balanced one.

    Each trial splits the unique patient ids with ``train_test_split`` using the
    trial number as ``random_state``. A split is scored as the sum of the scanner
    proportion difference and the outcome proportion difference between train
    and test; the seed with the lowest score wins (the earliest seed on ties).

    Args:
        df: DataFrame of images with a ``subj_id`` column plus the outcome columns.
        patient_scan_details: per-patient DataFrame passed to
            ``calculate_scanner_proportion_diff``.
        test_percentage: passed to ``train_test_split`` as ``test_size``.
        num_trials: number of seeds (0 .. num_trials - 1) to evaluate.

    Returns:
        ``(train, test)``: arrays of patient ids for the best split found.

    Raises:
        ValueError: if ``num_trials`` is smaller than 1.
    """
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1")

    # The patient list does not change between trials, so compute it once
    patients = df.subj_id.unique()

    # Start at infinity so that the first valid score is always accepted
    lowest_diff = float("inf")
    lowest_diff_seed = 0

    # Sample many trials, and then select the one with the smallest difference in proportions
    for i in range(num_trials):
        train, test = train_test_split(patients, test_size=test_percentage, random_state=i)
        cur_diff_scanner = calculate_scanner_proportion_diff(patient_scan_details, train, test)
        cur_diff_label = calculate_outcome_proportion_diff(df, train, test)
        cur_diff = cur_diff_label + cur_diff_scanner

        # If this split has the lowest difference so far, remember its seed
        if cur_diff < lowest_diff:
            lowest_diff = cur_diff
            lowest_diff_seed = i

    print("Best config: ",lowest_diff, lowest_diff_seed)

    # Re-create the split from the best random state we found
    train, test = train_test_split(patients, test_size=test_percentage, random_state=lowest_diff_seed)
    return train, test
    
    
# Search for the most balanced patient-level split, using select_split's
# default test size and number of trials
train_subjs, test_subjs = select_split(
    df=frame_to_split,
    patient_scan_details=patient_scan_details,
)

In [None]:
def get_filepaths_to_img(dataset, image_dir):
    """Build the image file path for every row of ``dataset``.

    File names follow the pattern ``<subj_id>_<scan_num>_<image_num>.jpg``.

    Args:
        dataset: DataFrame with ``subj_id``, ``scan_num`` and ``image_num`` columns.
        image_dir: directory that contains the images, with or without a
            trailing path separator.

    Returns:
        DataFrame with a single ``paths`` column, indexed like ``dataset``.
    """
    # os.path.join(dir, "") appends a separator only when one is missing, so
    # "/x/img" and "/x/img/" both give ".../img/<file>" rather than ".../img<file>"
    image_dir = os.path.join(image_dir, "")

    filenames = (
        dataset['subj_id'].map(str)
        + "_" + dataset['scan_num'].map(str)
        + "_" + dataset['image_num'].map(str)
        + ".jpg"
    )

    filepaths = pd.DataFrame()
    filepaths['paths'] = image_dir + filenames
    return filepaths


def split_data_based_on_patients(df_to_split, image_dir, train_ids, test_ids):
    """Return the image file paths for the train patients and for the test patients.

    Args:
        df_to_split: DataFrame of images with a ``subj_id`` column and the
            columns needed by ``get_filepaths_to_img``.
        image_dir: image directory, passed through to ``get_filepaths_to_img``.
        train_ids: collection of patient ids in the training set.
        test_ids: collection of patient ids in the test set.

    Returns:
        ``(train_filepaths, test_filepaths)``: one DataFrame of paths per set.
    """
    def _paths_for(patient_ids):
        # Keep only the images belonging to the given patients
        rows = df_to_split.loc[df_to_split['subj_id'].isin(patient_ids)]
        return get_filepaths_to_img(rows, image_dir)

    return _paths_for(train_ids), _paths_for(test_ids)
    
# Turn the chosen patient split into per-image file paths
train_filepaths, test_filepaths = split_data_based_on_patients(
    df_to_split=frame_to_split,
    image_dir=os.path.join(ROOT_DIR, 'all_label_img/'),
    train_ids=train_subjs,
    test_ids=test_subjs,
)

# Save both path lists in the project root
train_filepaths.to_csv(os.path.join(ROOT_DIR, 'train_filepaths.csv'))
test_filepaths.to_csv(os.path.join(ROOT_DIR, 'test_filepaths.csv'))
