In [1]:
from PIL import Image, ImageOps
import os
import sys
import pandas as pd
import numpy as np
from glob import glob
from os.path import isfile, join
from sklearn.model_selection import StratifiedKFold
import arff

# Treat the parent of the current working directory as the project root and
# put it first on sys.path.
main_dir = os.path.abspath(os.pardir)
sys.path.insert(0, main_dir)

In [2]:
# Raw input folders. Every *_pth string ends with os.sep because later cells
# build file paths by plain string concatenation.
raw_imgs_pth = os.path.join(main_dir, "data", "raw")+os.sep
train_pth = os.path.join(raw_imgs_pth, "train")+os.sep
tst_pth = os.path.join(raw_imgs_pth, "test")+os.sep

# Interim output folders for the preprocessed images.
intrm_img_trn_pth = os.path.join(main_dir, "data", "interim", "train")+os.sep
intrm_img_tst_pth = os.path.join(main_dir, "data", "interim", "test")+os.sep

intrm_img_trn_fnames_pth = os.path.join(intrm_img_trn_pth, "filenames")+os.sep
intrm_img_trn_data_pth = os.path.join(intrm_img_trn_pth, "data")+os.sep

intrm_img_tst_fnames_pth = os.path.join(intrm_img_tst_pth, "filenames")+os.sep
intrm_img_tst_data_pth = os.path.join(intrm_img_tst_pth, "data")+os.sep

paths = [intrm_img_trn_pth, intrm_img_tst_pth, intrm_img_trn_fnames_pth,
         intrm_img_trn_data_pth, intrm_img_tst_fnames_pth, intrm_img_tst_data_pth]
for pth in paths:
    os.makedirs(pth, exist_ok=True)

# Full paths of every raw .jpg (train first, then test) and, in the same
# order, their bare file names.
img_pth = pd.Series(glob(train_pth+"*.jpg")+glob(tst_pth+"*.jpg"))
# os.path.basename strips the directory with the platform's own separator;
# splitting on a literal backslash only worked on Windows.
images_lst = [os.path.basename(file) for file in img_pth]
print(len(images_lst))

10236


In [3]:
# degree_rot = 5
# n_rot = int(360 / degree_rot)

In [4]:
# strftime pattern used when printing the start and finish times of long-running cells.
print_timestamp = "%Y/%m/%d %H:%M:%S"

In [13]:
# Converts each raw image to greyscale, rotates landscape images to portrait,
# resizes to the target width, crops a fixed margin, normalises intensity and
# saves the result into the interim train/test folders.
max_size = (480, 640)  # (width, height) the crop box is computed against
box_coord = (20, 40)   # (horizontal, vertical) margin removed from each side
(left, upper, right, lower) = (box_coord[0], box_coord[1], max_size[0]-box_coord[0], max_size[1]-box_coord[1])

start_time = pd.to_datetime("now")
print("Start time:", start_time.strftime(print_timestamp))

n_images = len(images_lst)
for i, (src_path, image_fname) in enumerate(zip(img_pth, images_lst)):
    print("Progress: "+str(int(round((i+1)/n_images*100, 0)))+"%", end="\r")
    try:
        # convert() returns a new in-memory image, so the source file can be
        # closed as soon as the conversion is done.
        with Image.open(src_path) as src_img:
            img = src_img.convert("L")
    except Exception as error:
        # Unreadable/corrupt files are reported and skipped (best effort).
        print(image_fname, repr(error))
        continue

    # Rotate if necessary, resize, crop, normalise
    s = img.size
    if s[0] > s[1]:
        img = img.rotate(90, expand=1)
        s = img.size
        print(src_path, "was rotated. Check if it has succeded manually (exif).")
    # Scale so the width matches max_size[0]; the height follows the aspect
    # ratio and is therefore not guaranteed to equal max_size[1].
    ratio = max_size[0]/s[0]
    width, height = int(s[0]*ratio), int(s[1]*ratio)

    img = img.resize((width, height), Image.LANCZOS)
    img = img.crop((left, upper, right, lower))

    # Normalise intensity
    img = ImageOps.equalize(img)
    img = ImageOps.autocontrast(img)

    # Training images go to the interim train folder and test images to the
    # interim test folder; validation images are written to both.
    # (The two destinations were previously swapped.)
    if ("trn_" in image_fname) or ("vld_" in image_fname):
        img.save(intrm_img_trn_pth+image_fname, quality=95)  # .replace(".jpg", "_0.jpg")
    if ("tst_" in image_fname) or ("vld_" in image_fname):
        img.save(intrm_img_tst_pth+image_fname, quality=95)

    # Disabled augmentation (rotations and mirrored rotations), kept for reference.
#     if "tst_" not in image_fname:
#         for i in range(1, n_rot):
#             img_copy = img.rotate(degree_rot*i)
#             for interim_path in interim_paths:
#                 img_copy.save(interim_path+image_fname.replace(".jpg", "_"+str(i))+".jpg", quality=95)

#         img = ImageOps.mirror(img)
#         for interim_path in interim_paths:
#             img.save(interim_path+image_fname.replace(".jpg", "_flip_0")+".jpg", quality=95)
#         for i in range(1, n_rot):
#             img_copy = img.rotate(degree_rot*i)
#             for interim_path in interim_paths:
#                 img_copy.save(interim_path+image_fname.replace(".jpg", "_flip_"+str(i))+".jpg", quality=95)

print()

finish_time = pd.to_datetime("now")
elapsed_time = finish_time - start_time
# total_seconds() includes whole days; .seconds alone would wrap after 24 h.
print("Finish time:", str(finish_time.strftime(print_timestamp))+". This has taken "+str(round(elapsed_time.total_seconds() / 60, 0))+" minutes.")

Start time: 2020/05/24 23:08:21
Progress: 0%
Finish time: 2020/05/24 23:08:21. This has taken 0.0 minutes.


In [5]:
def prep_filenames(path, export=True, extra_str=""):
    """Build a filename/class table for the ``.jpg`` files directly inside ``path``.

    The class label is the first ``t<digit>`` token found in the file name
    (for example ``t1``). File names without such a token get a missing label,
    which is then cast with ``astype(str)``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive; sub-directories are ignored).
    export : bool, default True
        When True, also write the table to
        ``<path>/complete_fnames<extra_str>.arff``.
    extra_str : str, default ""
        Text inserted into the exported file name; unused when ``export`` is False.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename`` and ``class``, ordered by file name.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the table
    # (and any seeded split built from it) reproducible across machines.
    jpg_names = sorted(
        f for f in os.listdir(path)
        if f.endswith(".jpg") and isfile(join(path, f))
    )
    images_df = pd.DataFrame(jpg_names, columns=["filename"])
    # Raw string avoids the invalid "\d" escape; expand=False yields a Series.
    images_df["class"] = (
        images_df["filename"].str.extract(r"(t\d{1})", expand=False).astype(str)
    )

    if export:
        # join() works whether or not `path` ends with a separator.
        arff.dump(join(path, 'complete_fnames'+extra_str+'.arff')
                  , images_df.values
                  , relation='kaggle_ccs'
                  , names=list(images_df.columns))
    return images_df

In [6]:
# Build the filename/class tables for the interim train and test folders.
# With export=False no ARFF file is written, so extra_str has no effect here.
img_fnames_trn_df = prep_filenames(intrm_img_trn_pth, export=False, extra_str="_train_")
img_fnames_tst_df = prep_filenames(intrm_img_tst_pth, export=False, extra_str="_test_")

In [8]:
def printDivisors(n, min_folds=5, max_folds=10):
    """Find how many items to drop so the rest splits evenly into folds.

    Starting from ``n`` and counting down, returns the first count that has a
    divisor between ``min_folds`` and ``max_folds`` (inclusive); for that
    count the smallest such divisor is chosen.

    Parameters
    ----------
    n : int
        Number of available items.
    min_folds, max_folds : int, default 5 and 10
        Inclusive range of acceptable fold counts.

    Returns
    -------
    tuple of (int, int)
        ``(drop_n, folds)``: how many items to discard and the fold count
        that divides the remaining ``n - drop_n`` items exactly.

    Raises
    ------
    ValueError
        If the fold range is invalid or ``n`` is smaller than ``min_folds``
        (previously this case silently returned ``None``).
    """
    if min_folds < 1 or max_folds < min_folds:
        raise ValueError("Need 1 <= min_folds <= max_folds, got "
                         + str(min_folds) + " and " + str(max_folds) + ".")
    if n < min_folds:
        raise ValueError("Need at least " + str(min_folds)
                         + " items to form folds, got " + str(n) + ".")

    # n_w == min_folds always succeeds (it divides itself), so this loop
    # returns before the range is exhausted.
    for n_w in range(n, min_folds - 1, -1):
        for i in range(min_folds, min(max_folds, n_w) + 1):
            if n_w % i == 0:
                drop_n = n-n_w
                print("Int divisor for", n_w, "is", str(i)+". Drop", drop_n,"image groups.")
                return drop_n, i

In [9]:
# printDivisors returns (count to drop, fold count); cv_folds is read as a
# global by stratified_kfolding.
# NOTE(review): drp_img_n is not used anywhere in the visible cells - confirm
# whether the drop is meant to be applied.
drp_img_n, cv_folds = printDivisors(len(img_fnames_trn_df))

Int divisor for 6730 is 5. Drop 0 image groups.


In [22]:
def stratified_kfolding(filenames_s, class_s, export=True, n_splits=None):
    """Split filenames into stratified train/validation folds.

    Parameters
    ----------
    filenames_s : pandas.Series
        File names; its ``name`` becomes the first exported column.
    class_s : pandas.Series
        Class label per file, aligned with ``filenames_s``; used for
        stratification and exported as the second column.
    export : bool, default True
        When True, write ``f<k>_fnames_train.arff`` and ``f<k>_fnames_val.arff``
        for every fold ``k`` into ``intrm_img_trn_fnames_pth``.
    n_splits : int, optional
        Number of folds. Defaults to the notebook-level ``cv_folds``.

    Returns
    -------
    dict
        Maps fold number to the ``(train_idx, val_idx)`` positional index
        arrays produced by ``StratifiedKFold.split``.
    """
    if n_splits is None:
        n_splits = cv_folds
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    kfold_ind = {}

    for n, (train_idx, val_idx) in enumerate(skf.split(filenames_s, class_s)):
        kfold_ind[n] = (train_idx, val_idx)
        # split() yields positions, not labels, so select with .iloc on the
        # series that were actually passed in (not on a global frame).
        train_set = pd.concat([filenames_s.iloc[train_idx],
                               class_s.iloc[train_idx]], axis=1)
        val_set = pd.concat([filenames_s.iloc[val_idx],
                             class_s.iloc[val_idx]], axis=1)

        if export:
            print("Exporting fold", str(n), "to", intrm_img_trn_fnames_pth)

            arff.dump(intrm_img_trn_fnames_pth+"f"+str(n)+"_fnames_train.arff"
                      , train_set.values
                      , relation="f"+str(n)+"_train"
                      , names=list(train_set.columns))

            arff.dump(intrm_img_trn_fnames_pth+"f"+str(n)+"_fnames_val.arff"
                      , val_set.values
                      , relation="f"+str(n)+"_val"
                      , names=list(val_set.columns))

    return kfold_ind

In [23]:
# Build the stratified folds for the training filenames; with export=False no
# ARFF files are written.
stratified_kfolding(img_fnames_trn_df["filename"], img_fnames_trn_df["class"], export=False)