## Data Preparation for Synth Paper 2022

### Loading data

In [56]:
import os
import glob
from pathlib import Path
import shutil
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
import numpy as np
import timm

import albumentations as albu
from sklearn.metrics import classification_report,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from torch.utils.data import Dataset as BaseDataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# helper function for data visualization
def visualize(label, **images):
    """PLot images in one row."""
    n = len(images)
    plt.figure(figsize=(16, 5))
    for i, (name, image) in enumerate(images.items()):
        plt.subplot(1, n, i + 1)
        plt.xticks([])
        plt.yticks([])
        #plt.title(' '.join(name.split('_')).title())
        plt.title(f'{name.title()} | GT: {label}')
        plt.imshow(image)
    plt.show()

### Data Preparation

UKU Images (old split):

In [4]:
dir_only_uku = '../../data/spaete_splits/only_uu/'

In [5]:
file_paths = [str(Path(x)) for x in glob.glob(f'{dir_only_uku}/*/*/*')]

In [6]:
df_uku = pd.DataFrame(file_paths,columns=['file_path'])

In [7]:
df_uku['file_name'] = [Path(x).name for x in file_paths]

In [8]:
df_uku['source'] = 'UKU_old_split'

In [9]:
df_uku['split'] = df_uku['file_path'].str.contains('train',regex=False)

In [10]:
df_uku['split'] = np.where(df_uku['split']==True,'train','test')

In [11]:
df_uku['class'] = df_uku['file_name'].str.extract('^(NB|C)')

In [12]:
cls_map = {'C' : 'C',
           'NB' : 'H'}

In [13]:
df_uku['class'] = df_uku['class'].map(cls_map)

In [14]:
df_uku.head()

Unnamed: 0,file_path,file_name,source,split,class
0,../../data/spaete_splits/only_uu/valid/nb/NB21...,NB219a.jpg,UKU_old_split,test,H
1,../../data/spaete_splits/only_uu/valid/nb/NB17...,NB177a.jpg,UKU_old_split,test,H
2,../../data/spaete_splits/only_uu/valid/nb/NB24...,NB242a.jpg,UKU_old_split,test,H
3,../../data/spaete_splits/only_uu/valid/nb/NB25...,NB254a.jpg,UKU_old_split,test,H
4,../../data/spaete_splits/only_uu/valid/nb/NB19...,NB195a.jpg,UKU_old_split,test,H


UKU All Images:

In [15]:
dir_uku_all = '../../data/segmentation_test/raw/'

In [16]:
file_paths = [str(Path(x)) for x in glob.glob(f'{dir_uku_all}/*')]

In [17]:
df_uku_all = pd.DataFrame(file_paths,columns=['file_path'])

In [18]:
df_uku_all['file_name'] = [Path(x).name for x in file_paths]

In [19]:
df_uku_all['source'] = 'UKU_all'

In [20]:
df_uku_all['class'] = df_uku_all['file_name'].str.extract('^(NB|P|B|V|C)')

In [21]:
cls_map = {'C' : 'C',
           'NB' : 'H',
           'V' : 'V',
           'P' : 'F',
           'B' : 'B'}

In [22]:
df_uku_all['class'] = df_uku_all['class'].map(cls_map)

In [23]:
X_train, _ = train_test_split(df_uku_all['file_name'].values,test_size=0.2,random_state=1,stratify=df_uku_all['class'].values)

In [24]:
df_uku_all['split'] = ['train' if x in X_train else 'test' for x in df_uku_all['file_name'].values]

In [25]:
df_uku_all.head()

Unnamed: 0,file_path,file_name,source,class,split
0,../../data/segmentation_test/raw/C59.png,C59.png,UKU_all,C,train
1,../../data/segmentation_test/raw/NB46.png,NB46.png,UKU_all,H,train
2,../../data/segmentation_test/raw/NB663.png,NB663.png,UKU_all,H,train
3,../../data/segmentation_test/raw/V29.png,V29.png,UKU_all,V,train
4,../../data/segmentation_test/raw/NB626.png,NB626.png,UKU_all,H,train


In [26]:
len(df_uku_all)

1082

Synthetic data:

In [27]:
dir_synth = '../../data/generated_data/cs_uu/'

In [28]:
file_paths = [str(Path(x)) for x in glob.glob(f'{dir_synth}/*/*')]

In [29]:
df_synth = pd.DataFrame(file_paths,columns=['file_path'])

In [30]:
df_synth['file_name'] = [Path(x).name for x in file_paths]

In [31]:
df_synth['source'] = 'Synth'

In [32]:
df_synth['split'] = 'train'

In [33]:
df_synth['class'] = df_synth['file_path'].str.contains('nb',regex=False)

In [34]:
df_synth['class'] = np.where(df_synth['class']==True,'H','C')

In [35]:
df_synth.head()

Unnamed: 0,file_path,file_name,source,split,class
0,../../data/generated_data/cs_uu/nb/sample_5975...,sample_5975.png,Synth,train,H
1,../../data/generated_data/cs_uu/nb/sample_7059...,sample_7059.png,Synth,train,H
2,../../data/generated_data/cs_uu/nb/sample_3288...,sample_3288.png,Synth,train,H
3,../../data/generated_data/cs_uu/nb/sample_4932...,sample_4932.png,Synth,train,H
4,../../data/generated_data/cs_uu/nb/sample_7125...,sample_7125.png,Synth,train,H


Covidx:

In [36]:
dir_covidx = '../../data/Covidx/data_with_classes/'

In [37]:
file_paths = [str(Path(x)) for x in glob.glob(f'{dir_covidx}/*/*/*')]

In [38]:
df_covidx = pd.DataFrame(file_paths,columns=['file_path'])

In [39]:
df_covidx['file_name'] = [Path(x).name for x in file_paths]

In [40]:
df_covidx['source'] = 'Covidx'

In [41]:
df_covidx['split'] = df_covidx['file_path'].str.contains('train',regex=False)

In [42]:
df_covidx['split'] = np.where(df_covidx['split']==True,'train','test')

In [43]:
df_covidx['class'] = df_covidx['file_path'].str.contains('normal',regex=False)

In [44]:
df_covidx['class'] = np.where(df_covidx['class']==True,'H','C')

In [45]:
df_covidx.head()

Unnamed: 0,file_path,file_name,source,split,class
0,../../data/Covidx/data_with_classes/valid/norm...,99432aa3-8d61-4ff2-a79a-f0a0218d6fa2.png,Covidx,test,H
1,../../data/Covidx/data_with_classes/valid/norm...,27b32500-40ef-4c14-ab88-ace5e9ceea27.png,Covidx,test,H
2,../../data/Covidx/data_with_classes/valid/norm...,3c8cc0d1-be7f-493d-9570-542d851f623b.png,Covidx,test,H
3,../../data/Covidx/data_with_classes/valid/norm...,6804008c-94eb-4870-ba6e-b06514073e71.png,Covidx,test,H
4,../../data/Covidx/data_with_classes/valid/norm...,8989e25c-a698-48fc-b428-fff56931fc8f.png,Covidx,test,H


Merge all dataframes:

In [46]:
df = pd.concat([df_uku,df_uku_all,df_synth,df_covidx],ignore_index=True)

Finally encode labels:

In [47]:
df['class'] = df['class'].astype('category')
df['label_encoded'] = df['class'].cat.codes.astype('int64')

In [48]:
df[df.isnull().any(axis=1)]

Unnamed: 0,file_path,file_name,source,split,class,label_encoded


In [49]:
df.sample(5)

Unnamed: 0,file_path,file_name,source,split,class,label_encoded
24645,../../data/Covidx/data_with_classes/train/norm...,7c6d38a6-bbc3-4dd5-b4ad-bc8a7758eeaf.png,Covidx,train,H,3
31245,../../data/Covidx/data_with_classes/train/norm...,b7b0bdad-ee4b-4381-af19-501eecb6e847.png,Covidx,train,H,3
7181,../../data/generated_data/cs_uu/nb/sample_8496...,sample_8496.png,Synth,train,H,3
3526,../../data/generated_data/cs_uu/nb/sample_6530...,sample_6530.png,Synth,train,H,3
19368,../../data/generated_data/cs_uu/covid/sample_8...,sample_8821.png,Synth,train,C,1


In [50]:
#df.to_csv('synth_paper_data.csv',index=False)

Move files for image generation:

In [51]:
df_synth = df.loc[df['source']=='UKU_all'].copy()

In [54]:
df_synth['target_path'] = df_synth.agg(lambda x: f"../../data/synth_paper2_cs/{x['split']}/{x['class']}/{x['file_name']}",axis=1)

In [55]:
df_synth

Unnamed: 0,file_path,file_name,source,split,class,label_encoded,target_path
333,../../data/segmentation_test/raw/C59.png,C59.png,UKU_all,train,C,1,../../data/synth_paper2_cs/train/C/C59.png
334,../../data/segmentation_test/raw/NB46.png,NB46.png,UKU_all,train,H,3,../../data/synth_paper2_cs/train/H/NB46.png
335,../../data/segmentation_test/raw/NB663.png,NB663.png,UKU_all,train,H,3,../../data/synth_paper2_cs/train/H/NB663.png
336,../../data/segmentation_test/raw/V29.png,V29.png,UKU_all,train,V,4,../../data/synth_paper2_cs/train/V/V29.png
337,../../data/segmentation_test/raw/NB626.png,NB626.png,UKU_all,train,H,3,../../data/synth_paper2_cs/train/H/NB626.png
...,...,...,...,...,...,...,...
1410,../../data/segmentation_test/raw/P29.png,P29.png,UKU_all,train,F,2,../../data/synth_paper2_cs/train/F/P29.png
1411,../../data/segmentation_test/raw/NB364.png,NB364.png,UKU_all,train,H,3,../../data/synth_paper2_cs/train/H/NB364.png
1412,../../data/segmentation_test/raw/NB22.png,NB22.png,UKU_all,train,H,3,../../data/synth_paper2_cs/train/H/NB22.png
1413,../../data/segmentation_test/raw/NB669.png,NB669.png,UKU_all,test,H,3,../../data/synth_paper2_cs/test/H/NB669.png


In [63]:
for source,dest in tqdm(zip(df_synth['file_path'],df_synth['target_path'])):
    if not Path(dest).parent.exists():
        Path(dest).parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, dest)

1082it [00:08, 120.63it/s]
