In [1]:
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
from modules.dataset import Directory, Dataset, LABELS
from modules.utils import serialize_fold

2023-02-16 12:55:59.357303: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
import os

# Dataset root, resolved to an absolute path against the current working directory.
INPUT_PATH = os.path.abspath("dataset/ChestXray NIH")

# Notebook-wide settings.
SEED = 42
IMG_SIZE = 224
BATCH_SIZE = 16
# Whatever distribution strategy TensorFlow currently reports.
STRATEGY = tf.distribute.get_strategy()

In [5]:
# Load the label table (first CSV column becomes the index), store every label
# as int16, and shuffle the rows reproducibly with the shared seed.
df = shuffle(
    pd.read_csv(f"{INPUT_PATH}/preprocessed_data.csv", index_col=0).astype("int16"),
    random_state=SEED,
)

In [6]:
# Preview the shuffled label table (one row per image path, one column per class).
df

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_010/images/00022260_003.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_006/images/00012048_007.png,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
dataset/ChestXray NIH/images_007/images/00014352_005.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_003/images/00006040_001.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_005/images/00011202_002.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_009/images/00018865_040.png,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
dataset/ChestXray NIH/images_012/images/00030028_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
dataset/ChestXray NIH/images_011/images/00027679_000.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_001/images/00000211_014.png,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


### Count Samples per Class

In [7]:
# Count the rows labelled 1 for every class column.
#
# The comparison + sum is used instead of `value_counts()[1]` so that a class
# with no positive rows yields 0 rather than raising KeyError, and the column
# is named via `to_frame` because `set_axis(..., inplace=True)` is no longer
# accepted by pandas >= 2.0.
df_count_class = df.eq(1).sum().to_frame(name="Count")

# Same mapping the original loop built: class name -> single-element list.
count_class = {
    class_name: [positives]
    for class_name, positives in df_count_class["Count"].items()
}

df_count_class

Unnamed: 0,Count
No Finding,60361
Atelectasis,11559
Consolidation,4667
Infiltration,19894
Pneumothorax,5302
Edema,2303
Emphysema,2516
Fibrosis,1686
Effusion,13317
Pneumonia,1431


In [8]:
# Mean number of positive samples per class, excluding the "No Finding" row.
df_count_class.drop(index="No Finding").mean()

Count    5798.285714
dtype: float64

In [9]:
# Mean number of positive samples per class, with "No Finding" included.
df_count_class.mean()

Count    9435.8
dtype: float64

### Drop No Finding Class

In [10]:
# Index labels of every row flagged as "No Finding".
No_Finding = df.loc[df["No Finding"] == 1].index

# Everything except those rows; `drop` returns a new frame, so `df` is untouched.
tmp_df = df.drop(index=No_Finding)
tmp_df

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_006/images/00012048_007.png,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
dataset/ChestXray NIH/images_002/images/00002275_007.png,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_012/images/00030350_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_004/images/00008858_007.png,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
dataset/ChestXray NIH/images_007/images/00014663_049.png,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_009/images/00020274_002.png,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
dataset/ChestXray NIH/images_006/images/00013747_002.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
dataset/ChestXray NIH/images_009/images/00018865_040.png,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
dataset/ChestXray NIH/images_012/images/00030028_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


### Random Sampling

In [11]:
# Undersample the "No Finding" rows down to the mean number of positive samples
# per class (all classes included). On the data shown in this notebook that
# mean is 9435.8, i.e. the 9436 rows that used to be hard-coded here.
no_finding_rows = df[df["No Finding"] == 1]
no_finding_target = int(round(float(df_count_class["Count"].mean())))

random_no_finding = no_finding_rows.sample(
    # Sampling is without replacement, so never ask for more rows than exist.
    n=min(no_finding_target, len(no_finding_rows)),
    random_state=SEED,
)
random_no_finding

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_002/images/00001931_008.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_004/images/00009103_001.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_011/images/00027072_000.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_005/images/00010635_003.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_004/images/00008992_000.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_005/images/00010519_001.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_011/images/00028147_001.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_004/images/00008327_001.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_008/images/00016159_000.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Create New DataFrame

Undersample the "No Finding" class: combine the sampled "No Finding" rows with all remaining rows, then shuffle.

In [12]:
# Stack the rows with findings on top of the sampled "No Finding" rows, shuffle
# the result with the shared seed, and move the old index into a regular
# column (named "index") so the frame gets a fresh 0..n-1 index.
new_df = (
    pd.concat([tmp_df, random_no_finding])
    .sample(frac=1, random_state=SEED)
    .reset_index()
)
new_df

Unnamed: 0,index,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
0,dataset/ChestXray NIH/images_004/images/000076...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,dataset/ChestXray NIH/images_004/images/000084...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,dataset/ChestXray NIH/images_002/images/000034...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,dataset/ChestXray NIH/images_001/images/000010...,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
4,dataset/ChestXray NIH/images_001/images/000004...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61190,dataset/ChestXray NIH/images_007/images/000140...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61191,dataset/ChestXray NIH/images_009/images/000208...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
61192,dataset/ChestXray NIH/images_009/images/000204...,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
61193,dataset/ChestXray NIH/images_011/images/000279...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


Save to CSV

In [13]:
# Write the undersampled table into the dataset directory. `index=False` leaves
# out the integer row index; the image paths are already a regular column
# (named "index") and therefore become the first CSV column.
new_df.to_csv(f"{INPUT_PATH}/under_sampling_data.csv", index=False)

In [14]:
# Read the file back as a sanity check; `index_col=0` uses the first CSV
# column as the index of the loaded frame.
pd.read_csv(f"{INPUT_PATH}/under_sampling_data.csv", index_col=0)

Unnamed: 0_level_0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
dataset/ChestXray NIH/images_004/images/00007691_004.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
dataset/ChestXray NIH/images_004/images/00008468_018.png,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_002/images/00003427_000.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_001/images/00001052_001.png,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_001/images/00000459_037.png,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_007/images/00014004_047.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
dataset/ChestXray NIH/images_009/images/00020843_000.png,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
dataset/ChestXray NIH/images_009/images/00020429_003.png,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
dataset/ChestXray NIH/images_011/images/00027919_000.png,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
