In [1]:
from PIL import Image, ImageOps
import os
import sys
import pandas as pd
import numpy as np
from os.path import isfile, join
import arff

# Resolve the parent of the current working directory and put it first on the
# module search path.
main_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path[:0] = [main_dir]

In [2]:
# Raw images are read from <main_dir>/data/raw. Processed images are written to
# two copies of <main_dir>/data/interim whose paths have "D:" swapped for "G:"
# and then "G:" swapped for "H:". Every path keeps a trailing separator so file
# names can be appended by plain string concatenation.
raw_imgs_pth = os.path.join(main_dir, "data", "raw") + os.sep
intrm_img_pth = os.path.join(main_dir, "data", "interim").replace("D:", "G:") + os.sep
intrm_img_pth2 = intrm_img_pth.replace("G:", "H:")

interim_paths = [intrm_img_pth, intrm_img_pth2]
for interim_path in interim_paths:
    os.makedirs(interim_path, exist_ok=True)

# Keep only regular files in the raw directory whose name ends in ".jpg".
images_lst = [
    entry
    for entry in os.listdir(raw_imgs_pth)
    if os.path.isfile(os.path.join(raw_imgs_pth, entry)) and entry.endswith(".jpg")
]

In [3]:
# Angular step, in degrees, between successive rotated copies of an image,
# and the number of such steps in a full turn.
degree_rot = 5
n_rot = 360 // degree_rot

In [4]:
# strftime pattern used when printing the start and finish times of the run.
print_timestamp = "%Y/%m/%d %H:%M:%S"

In [5]:
# Converts each raw image to greyscale, resizes, crops and normalises it, then
# saves it (plus rotated and mirrored copies for non-test images) to every
# interim directory.
max_size = (480, 640)   # (width, height) the resized image is expected to have
box_coord = (20, 40)    # (horizontal, vertical) margin removed from each side
(left, upper, right, lower) = (box_coord[0], box_coord[1], max_size[0]-box_coord[0], max_size[1]-box_coord[1])


def _save_to_interim(image, out_name):
    """Save `image` as a JPEG named `out_name` in every interim directory."""
    for interim_path in interim_paths:
        image.save(interim_path+out_name, quality=95)


def _save_rotations(image, stem, tag):
    """Save copies of `image` rotated by degree_rot*k for k = 1 .. n_rot-1.

    Each copy is written as "<stem><tag><k>.jpg"; the unrotated image (k = 0)
    is not written here.
    """
    for step in range(1, n_rot):
        _save_to_interim(image.rotate(degree_rot*step), stem+tag+str(step)+".jpg")


start_time = pd.to_datetime("now")
print("Start time:", start_time.strftime(print_timestamp))

for img_idx, image_fname in enumerate(images_lst):
    print("Progress: "+str(int(round(((img_idx+1)/((len(images_lst)+1))*100), 0)))+"%", end="\r")
    try:
        # The context manager closes the source file as soon as the greyscale
        # copy has been made, instead of leaving it open until garbage collection.
        with Image.open(raw_imgs_pth+image_fname) as raw_img:
            img = raw_img.convert("L")
    except Exception as error:
        # Best effort: report the unreadable file and carry on with the rest.
        print(image_fname, repr(error))
        continue

    # Rotate landscape images to portrait, then scale so the width is max_size[0].
    s = img.size
    if s[0] > s[1]:
        img = img.rotate(90, expand=1)
        s = img.size
    ratio = max_size[0]/s[0]
    width, height = int(s[0]*ratio), int(s[1]*ratio)

    img = img.resize((width, height), Image.LANCZOS)
    # NOTE(review): the crop box is fixed, so it is only centred when the
    # resized image is exactly max_size (i.e. a 3:4 source) - confirm the inputs.
    img = img.crop((left, upper, right, lower))

    # Normalise intensity
    img = ImageOps.equalize(img)
    img = ImageOps.autocontrast(img)

    # images_lst only holds names ending in ".jpg", so strip exactly that suffix
    # rather than replacing every ".jpg" occurrence inside the name.
    stem = image_fname[:-len(".jpg")]
    _save_to_interim(img, stem+"_0.jpg")

    # Files whose name contains "tst_" are saved once, without augmentation.
    if "tst_" not in image_fname:
        _save_rotations(img, stem, "_")

        img = ImageOps.mirror(img)
        _save_to_interim(img, stem+"_flip_0.jpg")
        _save_rotations(img, stem, "_flip_")

print()

finish_time = pd.to_datetime("now")
elapsed_time = finish_time - start_time
# total_seconds() covers the whole duration; .seconds is only the sub-day
# component and would under-report a run lasting more than 24 hours.
print("Finish time:", str(finish_time.strftime(print_timestamp))+". This has taken "+str(round(elapsed_time.total_seconds() / 60, 0))+" minutes.")

Start time: 2020/05/12 12:58:56
Progress: 100%
Finish time: 2020/05/12 15:36:40. This has taken 158.0 minutes.


In [6]:
# Build a table of the processed JPEG files in the first interim directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# table (and anything exported from it) reproducible.
interim_jpgs = sorted(
    f for f in os.listdir(intrm_img_pth)
    if isfile(join(intrm_img_pth, f)) and f.endswith(".jpg")
)
images_df = pd.DataFrame(interim_jpgs, columns=["filename"])
# The class label is the first "t<digit>" token in the file name. A raw string
# avoids the invalid "\d" escape, and expand=False yields a Series rather than
# a one-column DataFrame.
images_df["class"] = images_df["filename"].str.extract(r"(t\d)", expand=False).astype("category")
images_df.head()

Unnamed: 0,filename,class
0,trn_t1_1131_0.jpg,t1
1,trn_t1_1131_1.jpg,t1
2,trn_t1_1131_10.jpg,t1
3,trn_t1_1131_11.jpg,t1
4,trn_t1_1131_12.jpg,t1


In [7]:
# Export the filename/class table as "filenames.arff" into each interim directory.
for interim_path in interim_paths:
    arff_file = interim_path + "filenames.arff"
    arff.dump(
        arff_file,
        images_df.values,
        relation="kaggle_ccs",
        names=images_df.columns,
    )