# _50/50 Split (POS/NEG) Sample Data Set Playground_

__July 8, 2019__

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
# fastai libraries
from fastai.vision import *
from fastai.callbacks.hooks import *
from fastai.utils.mem import *
from fastai.callbacks import EarlyStoppingCallback

In [2]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import os
from pathlib import Path

# Matplotlib
%matplotlib inline
%config InlineBackend.figure_format='retina'
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [30]:
# the data set is expected in a 'data' folder inside the current working directory
path = Path.cwd() / 'data'
path
# GCP path variable
#path = Config.data_path()
#path

PosixPath('/Users/joeai/Springboard/capstone_2/data')

# _Load in Data_

In [31]:
# load the training and validation label tables from their csv files
train_df, valid_df = (
    pd.read_csv(path/csv_file)
    for csv_file in ('CheXpert-v1.0-small/train.csv', 'CheXpert-v1.0-small/valid.csv')
)

In [32]:
# (rows, columns) of the training frame, then of the validation frame
print(train_df.shape, valid_df.shape, sep='\n')

(223414, 19)
(234, 19)


In [33]:
# add valid column to indicate if observations are part of validation set
# (False for every row read from train.csv, True for every row read from valid.csv);
# sample_df reads this flag to tell the two sets apart
train_df['valid'] = False
valid_df['valid'] = True

In [42]:
# extract patient id and study from Path and add them as columns
# Path looks like CheXpert-v1.0-small/<split>/<patient>/<study>/..., so after
# splitting on '/' field 2 is the patient and field 3 is the study.
# n and expand are passed by keyword: pandas >= 2.0 no longer accepts them positionally.
train_df['Patient_id'] = train_df.Path.str.split('/', n=3, expand=True)[2]
valid_df['Patient_id'] = valid_df.Path.str.split('/', n=3, expand=True)[2]

train_df['study'] = train_df.Path.str.split('/', n=4, expand=True)[3]
valid_df['study'] = valid_df.Path.str.split('/', n=4, expand=True)[3]

In [43]:
# create function to seed data (allows us to more easily reproduce the sample data set)
def seed_data(seed):
    '''Seed every random number generator this notebook draws from.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    every CUDA device), exports PYTHONHASHSEED, and configures cuDNN for
    repeatable results.

    seed: integer seed shared by all generators.
    '''
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # reaches processes launched afterwards (e.g. data-loader workers)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # seed all GPUs, not just the current one; silently ignored without CUDA
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # the cuDNN autotuner may pick a different algorithm from run to run
    torch.backends.cudnn.benchmark = False

In [44]:
def sample_df(df, sample_perc = 0.05):
    '''Draw a patient-level random sample of the training rows of `df`.

    Each distinct Patient_id among the non-validation rows is kept with
    probability `sample_perc`, so a patient's rows are either all kept or all
    dropped. Validation rows are always returned in full.

    df: frame with a boolean `valid` column and a `Patient_id` column.
    sample_perc: fraction of training patients to keep (0.0 - 1.0).

    Returns a new frame: the sampled training rows followed by every
    validation row. The draw uses NumPy's global generator, so seed it first
    for a repeatable sample.
    '''
    is_valid = df['valid'].astype(bool)
    train_only_df = df[~is_valid]
    valid_only_df = df[is_valid]
    unique_patients = train_only_df.Patient_id.unique()
    # one uniform draw per patient decides whether that patient is kept
    mask = np.random.rand(len(unique_patients)) <= sample_perc
    sample_patients = unique_patients[mask]
    # build the row filter from the same frame it is applied to
    sampled_train_df = train_only_df[train_only_df.Patient_id.isin(sample_patients)]
    return pd.concat([sampled_train_df, valid_only_df])

In [45]:
# preview the first rows; the valid, Patient_id and study columns added above appear last
train_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,valid,Patient_id,study
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0,False,patient00001,study1
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,False,patient00002,study2
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,,False,patient00002,study1
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,,False,patient00002,study1
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,,False,patient00003,study1


In [62]:
# columns at positions 5 through 18 are the pathology labels; keep their names as a plain list
pathology_list = train_df.columns[5:19].tolist()
pathology_list

['No Finding',
 'Enlarged Cardiomediastinum',
 'Cardiomegaly',
 'Lung Opacity',
 'Lung Lesion',
 'Edema',
 'Consolidation',
 'Pneumonia',
 'Atelectasis',
 'Pneumothorax',
 'Pleural Effusion',
 'Pleural Other',
 'Fracture',
 'Support Devices']

In [60]:
# replace missing pathology labels with 0 in the training frame;
# -1 entries are not touched by this step
# NOTE(review): valid_df gets no equivalent fill in the cells shown here — confirm that is intended
train_df[pathology_list] = train_df[pathology_list].fillna(0)
train_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,valid,Patient_id,study
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False,patient00001,study1
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,0.0,0.0,-1.0,1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,1.0,0.0,False,patient00002,study2
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,patient00002,study1
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,patient00002,study1
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,patient00003,study1


In [63]:
# count the remaining missing values per column to check the pathology fill
train_df.isnull().sum()

Path                              0
Sex                               0
Age                               0
Frontal/Lateral                   0
AP/PA                         32387
No Finding                        0
Enlarged Cardiomediastinum        0
Cardiomegaly                      0
Lung Opacity                      0
Lung Lesion                       0
Edema                             0
Consolidation                     0
Pneumonia                         0
Atelectasis                       0
Pneumothorax                      0
Pleural Effusion                  0
Pleural Other                     0
Fracture                          0
Support Devices                   0
valid                             0
Patient_id                        0
study                             0
dtype: int64

In [66]:
# AP/PA is the only column still reporting nulls; use an empty string as the placeholder
train_df['AP/PA'] = train_df['AP/PA'].fillna('')

In [68]:
# replicate is a project-local helper module; its source is not part of this notebook
from test_folder import replicate

# NOTE(review): judging by the messages it prints, uignore drops the rows whose
# 'Cardiomegaly' label is -1 and casts that column to int — confirm against the helper's source
train_df = replicate.uignore(train_df, 'Cardiomegaly')
train_df.head()

Dropped -1 observations.
------------------------------
Converted pathology column into integer type.
------------------------------
Shape of new dataframe: (215327, 22)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,valid,Patient_id,study
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False,patient00001,study1
1,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,0.0,0.0,0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,patient00002,study1
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,0.0,0.0,0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,patient00002,study1
3,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,patient00003,study1
4,CheXpert-v1.0-small/train/patient00004/study1/...,Female,20,Frontal,PA,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,patient00004,study1


# _Random Oversampling_

In [74]:
# Class count
# look the counts up by label rather than unpacking value_counts() positionally:
# value_counts() orders by frequency, so positional unpacking would silently swap
# the two names if class 1 ever outnumbered class 0
class_counts = train_df['Cardiomegaly'].value_counts()
count_class_0 = int(class_counts.get(0, 0))
count_class_1 = int(class_counts.get(1, 0))
print(count_class_0, count_class_1)

188327 27000


In [75]:
# Split the training rows into negatives (label 0) and positives (label 1)
df_class_0 = train_df.loc[train_df['Cardiomegaly'].eq(0)]
df_class_1 = train_df.loc[train_df['Cardiomegaly'].eq(1)]

In [76]:
# (rows, columns) of the negative frame, then of the positive frame
print(df_class_0.shape, df_class_1.shape, sep='\n')

(188327, 22)
(27000, 22)


In [78]:
# draw positives with replacement until they match the number of negatives;
# random_state pins the draw so the balanced set is identical on every run
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=42)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over['Cardiomegaly'].value_counts())

Random over-sampling:
1    188327
0    188327
Name: Cardiomegaly, dtype: int64


In [80]:
# total size after over-sampling (both classes combined)
df_test_over.shape

(376654, 22)

In [71]:
# rows labelled 1 and rows labelled 0 for Cardiomegaly, as separate frames
positive_df = train_df.loc[train_df['Cardiomegaly'].eq(1)]
negative_df = train_df.loc[train_df['Cardiomegaly'].eq(0)]

In [73]:
# (rows, columns) of the positive frame, then of the negative frame
print(positive_df.shape, negative_df.shape, sep='\n')

(27000, 22)
(188327, 22)


# _Random Undersampling_

In [81]:
# keep only as many negatives as there are positives (sampled without replacement);
# random_state pins the draw so the balanced set is identical on every run
df_class_0_under = df_class_0.sample(count_class_1, random_state=42)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under['Cardiomegaly'].value_counts())

Random under-sampling:
1    27000
0    27000
Name: Cardiomegaly, dtype: int64


In [None]:
# concat undersampled train_df and valid_df together
# df_test_under is the balanced (under-sampled) training frame built above; using
# train_df here would put the unbalanced data back in
full_df = pd.concat([df_test_under, valid_df])