# Parse
In this notebook I will import different datasets. Then I will compare them and finally I will select the one to be used in the project.

In [128]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from PIL import Image
import matplotlib.pyplot as plt
from skimage import data
from skimage.color import rgb2gray
from tqdm import tqdm

In [162]:
# root folder that contains the dataset directories; the trailing slash matters
# because the value is joined to other strings by plain concatenation
path = "../../dataset/"
# name of the dataset sub-folder the images are read from
dataset = "mirflickr25k"
# folder where the preprocessed .npz files are written (also concatenated as-is)
output_path = "../../"

 ## Preprocessing

In [72]:
def plot_comparison(img_resized,img_grayscaled):
    """Display the resized image next to its grayscale version.

    Draws a single 1x2 figure: the left panel shows ``img_resized`` and the
    right panel shows ``img_grayscaled`` rendered with the gray colormap.
    """
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))

    # one entry per panel: (image, title, extra imshow keyword arguments)
    panels = (
        (img_resized, "Resized", {}),
        (img_grayscaled, "Grayscale", {"cmap": plt.cm.gray}),
    )
    for panel_ax, (image, title, imshow_kwargs) in zip(axes.ravel(), panels):
        panel_ax.imshow(image, **imshow_kwargs)
        panel_ax.set_title(title)

    fig.tight_layout()
    plt.show()

In [98]:
def preprocess_img(img_original,W,H):
    """Resize an image to a fixed size and derive its normalized grayscale version.

    Preprocessing:
    - resizing
    - grayscaling
    - normalizing
    - check tensor format

    Parameters
    ----------
    img_original : numpy.ndarray
        Pixel array of the source image, in a layout accepted by
        ``PIL.Image.fromarray``.
    W : int
        Target width in pixels.
    H : int
        Target height in pixels.

    Returns
    -------
    tuple
        ``(img_resized, img_grayscaled)``: the resized RGB image with shape
        ``(H, W, 3)`` and its grayscale version as float32 in [0, 1] with shape
        ``(H, W, 1)``.
    """
    # force three channels so grayscale / RGBA inputs do not break the steps below
    img_rgb = Image.fromarray(img_original).convert("RGB")
    # resize to fixed size; PIL takes (width, height) and the resulting array is
    # laid out as (H, W, 3), so no reshape is needed (reshaping to (W, H, 3)
    # would scramble the pixels whenever W != H)
    img_resized = np.asarray(img_rgb.resize((W,H),resample=Image.BICUBIC))
    # grayscale + normalize: rgb2gray already rescales uint8 input to floats in
    # [0, 1], so dividing by 255 again would squash the values into [0, 1/255]
    img_grayscaled = rgb2gray(img_resized).astype("float32")
    # tensor format: add the trailing single-channel axis
    img_grayscaled = img_grayscaled[..., np.newaxis]
    # uncomment to inspect the result visually:
    # plot_comparison(img_resized,img_grayscaled)
    return (img_resized,img_grayscaled)

In [164]:
# defining some pre-processing parameters

N = 500 # number of images to pre-process
W = 128 # width to rescale
H = 128 # height to rescale

block = 250 # number of images accumulated before they are written to disk

# os.listdir returns names in arbitrary order, so sort them to make the
# img_paths[:N] subset reproducible across runs and machines; match the
# extension at the end of the name rather than anywhere inside it
img_paths = sorted(img for img in os.listdir(path+dataset) if img.endswith(".jpg"))

In [165]:
# sanity check: report how many image files were found and the block size in use
print(f"Found {len(img_paths)} images\nReady to preprocess images in blocks of {block} units")

Found 25000 images
Ready to preprocess images in blocks of 250 units


In [170]:
# array containers
resized = np.array([])
grayscaled = np.array([])

for i,img_path in tqdm(enumerate(img_paths[:N]),total=N):       
    # read image file with "Pillow"
    PIL_img = Image.open(f"{path}{dataset}/{img_path}")                
    # preprocess
    img_resized, img_grayscaled = preprocess_img(np.asarray(PIL_img),W,H)   
    # append to global arrays
    resized = np.append(resized,img_resized)
    grayscaled = np.append(grayscaled,img_grayscaled)

    if i>1 and (i+1) % block == 0:
        
        # reshaping
        resized = resized.reshape(block,W,H,3)
        grayscaled = grayscaled.reshape(block,W,H,1)
        
        # save to npy file
        np.savez_compressed(output_path+"resized.npz", resized)
        np.savez_compressed(output_path+"grayscaled.npz", grayscaled)
        
        # reset array containers
        resized = np.array([])
        grayscaled = np.array([])

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:23<00:00, 21.27it/s]
