# Workbook to Resize & Organize Images

In this workbook, every image from the data library that is larger than 1024x1024 is center-cropped (to 3048x3048 if it is larger than 3048x3048, otherwise to 1024x1024), and all images are then resized to 1024x1024.

All images smaller than 1024 pixels in either dimension are discarded before processing.

The dataset contains a mix of resolutions ranging from 300x300 up to more than 4000x4000 (the largest images are roughly 6700x4500). Dropping all images with a width or height below 1024 reduced the overall dataset size from ~70k images to ~48k images.

These 1024x1024 images were then used to evaluate the model.

In [1]:
import cv2
import pandas as pd
import seaborn as sns
import glob
from matplotlib import pyplot
import os
import shutil
import numpy as np
import multiprocessing

In [2]:
### Load the complete metadata table (one row per image) ###
all_data_df = pd.read_csv(filepath_or_buffer='FinalTrain.csv')
# Bare last expression so the notebook renders the frame.
all_data_df

Unnamed: 0,index,filename,target,width,height
0,0,ISIC_2637011_2020_1_31256,0.0,1024,1024
1,1,ISIC_0015719_2020_1_2963,0.0,1024,1024
2,2,ISIC_0052212_2020_1_14821,0.0,1024,1024
3,3,ISIC_0068279_2020_1_23495,0.0,1024,1024
4,4,ISIC_0074268_2020_1_30557,0.0,1024,1024
...,...,...,...,...,...
69336,145,ISIC_0015443_2017_3_145,0.0,6601,4401
69337,146,ISIC_0015445_2017_3_146,0.0,6641,4440
69338,147,ISIC_0015483_2017_3_147,0.0,6668,4459
69339,148,ISIC_0015496_2017_3_148,0.0,6688,4459


In [5]:
#### Minimum accepted resolution (pixels) ###
target_height = 1024
target_width = 1024

#### Keep only the rows whose image is at least target_height x target_width ####

# Both conditions are combined into a single boolean mask; the callable form of
# .loc avoids introducing extra names into the notebook namespace.
target_res_df = all_data_df.loc[
    lambda frame: (frame['height'] >= target_height) & (frame['width'] >= target_width)
]
target_res_df.describe()

Unnamed: 0,index,target,width,height
count,47555.0,47555.0,47555.0,47555.0
mean,16565.485543,0.16381,1155.267038,1097.98835
std,8901.381062,0.370107,694.519839,403.063547
min,0.0,0.0,1024.0,1024.0
25%,9872.5,0.0,1024.0,1024.0
50%,17339.0,0.0,1024.0,1024.0
75%,23283.5,0.0,1024.0,1024.0
max,33125.0,1.0,6748.0,4499.0


In [6]:
from PIL import Image
def resize_crop_and_save(file_list, target_dir, source_dir='FullData/',
                         output_size=1024, large_crop_size=3048):
    """Center-crop each listed JPEG to a square, resize it, and save it.

    For every name in ``file_list`` the image ``source_dir + name + '.jpg'``
    is opened and processed as follows:

    * if BOTH sides exceed ``large_crop_size``, the central
      ``large_crop_size`` x ``large_crop_size`` square is kept;
    * otherwise, if either side exceeds ``output_size``, the central
      ``output_size`` x ``output_size`` square is kept (shrunk to the shorter
      side if the image is smaller than that in one dimension);
    * the result is resized to ``output_size`` x ``output_size`` and written
      to ``target_dir + name + '.jpg'``.

    Parameters
    ----------
    file_list : iterable of str
        Image names without directory and without the ``.jpg`` extension.
    target_dir : str
        Prefix prepended to each output file name (e.g. ``'1024x1024/'``).
        The containing directory is created if it does not exist.
    source_dir : str, optional
        Prefix prepended to each input file name. Defaults to ``'FullData/'``.
    output_size : int, optional
        Side length in pixels of the saved square image. Defaults to 1024.
    large_crop_size : int, optional
        Side length of the center crop taken from images whose width and
        height both exceed this value. Defaults to 3048.

    Returns
    -------
    None
    """
    for file in file_list:
        source_path = source_dir + file + '.jpg'
        output_path = target_dir + file + '.jpg'

        # target_dir is used as a plain string prefix, so derive the directory
        # from the final path rather than assuming the prefix is a directory.
        output_parent = os.path.dirname(output_path)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)

        # The context manager guarantees the source file handle is released
        # even if cropping, resizing or saving fails.
        with Image.open(source_path) as image:
            width, height = image.size

            # The large crop is only valid when it fits in both dimensions;
            # deciding on width alone would push the crop box past the top and
            # bottom borders of wide-but-short images.
            if width > large_crop_size and height > large_crop_size:
                crop_size = large_crop_size
            else:
                crop_size = output_size
            # Never request a square larger than the image itself.
            crop_size = min(crop_size, width, height)

            if width > output_size or height > output_size:
                left = (width - crop_size) // 2
                top = (height - crop_size) // 2
                cropped = image.crop((left, top, left + crop_size, top + crop_size))
            else:
                cropped = image

            cropped.resize((output_size, output_size)).save(output_path)
            
            
        

In [7]:
from multiprocessing import Process
import os
from PIL import Image

### Use 16 worker processes to perform the crop, resize and save operation to speed up runtime

### MultiProcessing Center Crop & Resize ###
num_cores = 16
output_dir = '1024x1024/'

# Create the output directory up front so the workers do not fail on a missing path.
os.makedirs(output_dir, exist_ok=True)

# One chunk of filenames per worker; np.array_split tolerates uneven division.
splits_by_cores = target_res_df['filename'].to_numpy()
splits_by_cores = np.array_split(splits_by_cores, num_cores)

# Keep a handle to every worker: joining only the loop variable `p` would wait
# on the last-started process alone and leave the others unjoined.
processes = []
for i in range(0, num_cores):
    p = Process(target=resize_crop_and_save, args=(splits_by_cores[i], output_dir))
    p.start()
    processes.append(p)

for p in processes:
    p.join()

# A worker that raised exits with a non-zero code; surface that here instead of
# silently continuing with an incomplete image directory.
failed_workers = [index for index, p in enumerate(processes) if p.exitcode != 0]
if failed_workers:
    raise RuntimeError(
        'resize_crop_and_save failed in worker(s): ' + str(failed_workers)
    )

In [42]:
# Runs resize_crop_and_save on the first 30 entries of test['filename'],
# writing the results under the '600x600/' prefix.
# NOTE(review): `test` is not defined in any cell visible in this notebook and
# this cell's execution count (42) is out of sequence, so it presumably relied
# on kernel state from a cell that was removed - confirm before re-running.
# NOTE(review): resize_crop_and_save resizes to 1024x1024, so the directory
# name '600x600/' does not match the size of the saved images - confirm intent.
resize_crop_and_save(test['filename'].to_numpy()[:30], '600x600/')

In [8]:
# Persist the metadata of the retained (>= 1024x1024) images; the DataFrame
# index is written as the first, unnamed column.
target_res_df.to_csv(path_or_buf='1024x1024_train.csv')