# 2. Data Preparation

The methods provided here allow you to convert the image data into NumPy arrays and to store them in npz format. Having the data prepared as NumPy arrays makes it much easier to work with scikit-learn, because we can simply load the data and train models.

In [1]:
# import libraries
import PIL as pil
import os as os
import numpy as np
import pandas as pd
import humanize as hm

from skimage import data, io, filters, transform
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Dataset and preprocessing configuration shared by the cells below.

# Presumably the total number of categories in the dataset -- TODO confirm.
# Not read by the visible cells (classes_to_data takes its own n_classes argument).
n_classes = 256
# Presumably the total number of images in the dataset -- TODO confirm.
image_count = 30607
# Scale factor applied to the source image size (0.3 == 30%).
image_resize_factor = 0.3
# Target size in pixels, truncated to int. 382 and 345 are presumably the
# width and height of the images in the input folder -- TODO confirm.
image_width = int(382 * image_resize_factor)
image_height = int(345 * image_resize_factor)
# Number of values in one resized image with 3 colour channels.
image_pixels = image_height * image_width * 3
# Bare expression: shows the value as the notebook cell output.
image_pixels

35226

In [3]:
def class_to_data(path, label):
    """
    Converts the images of one class in a given path into two numpy arrays.

    Every file in ``path`` whose name ends with '.jpg' is read in sorted
    order and resized to ``(image_width, image_height)``. Images that do not
    have exactly three colour channels, and files that cannot be read, are
    reported on stdout and skipped.

    Args:
        path (str): The path of the folder which contains the images of one class.
        label (int): The label for the images of the class.

    Returns:
        X (numpy.ndarray): Array of shape (n_images, image_width, image_height, 3)
            containing one resized image per row, with the floating point
            values returned by ``skimage.transform.resize``.
        y (numpy.ndarray): Array of shape (n_images,) containing ``label``
            once per image.
    """
    resized_images = []

    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.jpg'):
            continue

        # Built outside the try block so it is always available for messages.
        image_path = os.path.join(path, filename)
        try:
            im = io.imread(image_path)
        except Exception as error:
            # Best effort: one unreadable file must not abort the whole class,
            # but report the real cause instead of blaming the colour mode.
            print('Error: Could not read image:', image_path, error)
            continue

        # Grayscale images come back 2-D and RGBA images have 4 channels;
        # neither fits the (width, height, 3) layout, so skip them explicitly.
        if im.ndim != 3 or im.shape[2] != 3:
            print('Error: Image is not RGB:', image_path)
            continue

        resized_images.append(
            transform.resize(im, (image_width, image_height), mode='reflect')
        )

    if not resized_images:
        # Empty result keeps the per-image shape so callers can still
        # concatenate it with the data of other classes.
        return (np.empty((0, image_width, image_height, 3), dtype=np.int8),
                np.empty(0, dtype=np.int8))

    # Stack once at the end: appending to a numpy array inside the loop would
    # copy all previously collected images on every iteration.
    X = np.stack(resized_images)
    y = np.full(len(resized_images), label)
    return X, y

In [4]:
def classes_to_data(path, n_classes=1):
    """
    Converts classes of images in a given path into two numpy arrays.

    The sub-folders of ``path`` are visited in sorted order; the first one is
    assigned label 1, the second label 2, and so on. Entries of ``path`` that
    are not directories are ignored. The current label is printed before each
    class is processed, as a progress indicator.

    Args:
        path (str): The path of the folder which contains one sub-folder per class.
        n_classes (int): The maximum number of classes to use. Values of 0 or
            less yield an empty result.

    Returns:
        X (numpy.ndarray): Array containing an image in each row.
        y (numpy.ndarray): Array containing a label in each row.
    """
    # Only directories can be class folders; a stray file (e.g. .DS_Store)
    # would otherwise make os.listdir() fail inside class_to_data.
    class_paths = [
        class_path
        for class_path in (
            os.path.join(path, entry) for entry in sorted(os.listdir(path))
        )
        if os.path.isdir(class_path)
    ]

    image_chunks = []
    label_chunks = []
    # max(..., 0) keeps a negative n_classes from slicing from the end.
    for label, class_path in enumerate(class_paths[:max(n_classes, 0)], start=1):
        print(label)
        images, targets = class_to_data(class_path, label)
        image_chunks.append(images)
        label_chunks.append(targets)

    if not image_chunks:
        # Nothing to load: return empty arrays with the per-image shape.
        return (np.empty((0, image_width, image_height, 3), dtype=np.int8),
                np.empty(0, dtype=np.int8))

    # Concatenate once at the end: appending to a numpy array per class would
    # re-copy the whole accumulated dataset on every iteration.
    X = np.concatenate(image_chunks, axis=0)
    y = np.concatenate(label_chunks)
    return X, y

Generate a new, smaller dataset that contains only the first 10 classes. For performance reasons we resize the images to 30% of their original size. Once we have found a good model, we will take the whole dataset and train the model in the Google Cloud.

In [5]:
# Load the images (X) and labels (y) of up to 100 class folders, taken in
# sorted order from ../data/data_resized/.
X, y = classes_to_data('../data/data_resized/', n_classes=100)

1
Error: Image is not RGB: ../data/data_resized/001.ak47/001_0016.jpg
2
3
4
5
6
Error: Image is not RGB: ../data/data_resized/006.basketball-hoop/006_0004.jpg
Error: Image is not RGB: ../data/data_resized/006.basketball-hoop/006_0005.jpg
Error: Image is not RGB: ../data/data_resized/006.basketball-hoop/006_0007.jpg
Error: Image is not RGB: ../data/data_resized/006.basketball-hoop/006_0063.jpg
Error: Image is not RGB: ../data/data_resized/006.basketball-hoop/006_0075.jpg
7
Error: Image is not RGB: ../data/data_resized/007.bat/007_0082.jpg
8
Error: Image is not RGB: ../data/data_resized/008.bathtub/008_0137.jpg
9
10
11
12
13
Error: Image is not RGB: ../data/data_resized/013.birdbath/013_0020.jpg
14
Error: Image is not RGB: ../data/data_resized/014.blimp/014_0022.jpg
15
16
17
Error: Image is not RGB: ../data/data_resized/017.bowling-ball/017_0070.jpg
18
19
20
Error: Image is not RGB: ../data/data_resized/020.brain-101/020_0009.jpg
Error: Image is not RGB: ../data/data_resized/020.brain-10

MemoryError: 

Save the 10 classes together with their labels as an npz file:

In [None]:
# Store both arrays in a single .npz archive. They are passed positionally,
# so numpy saves them under the keys 'arr_0' (X) and 'arr_1' (y).
np.savez('../data/image_data_257', X, y)