# Feature Engineering Notebook

Install additional Python packages

In [2]:
# Pinned to the version this notebook was last run with, so re-runs resolve the same package
!pip install opencv-python==4.1.1.26

Collecting opencv-python
[?25l  Downloading https://files.pythonhosted.org/packages/5e/7e/bd5425f4dacb73367fddc71388a47c1ea570839197c2bcad86478e565186/opencv_python-4.1.1.26-cp36-cp36m-manylinux1_x86_64.whl (28.7MB)
[K     |████████████████████████████████| 28.7MB 10.4MB/s eta 0:00:01
Installing collected packages: opencv-python
Successfully installed opencv-python-4.1.1.26


Import libraries

In [3]:
# Importing standard libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

from tqdm import tqdm
import cv2
from sklearn.utils import shuffle
from botocore.client import Config
import ibm_boto3

Define the credentials needed to access an IBM Cloud Object Storage bucket

In [4]:
# The code was removed by Watson Studio for sharing.

Download the raw data into a dictionary

In [5]:
# Fetch the raw dataset pickle from the bucket into the local working directory.
# `cos` and `credentials` come from the hidden credentials cell above.
cos.download_file(Bucket=credentials['BUCKET'], Key='raw_data.pickle', Filename='raw_data.pickle')   

# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted bucket.
# encoding='latin1' is presumably needed because the pickle was written by Python 2 - confirm.
with open('raw_data.pickle', 'rb') as f:
    raw_data = pickle.load(f, encoding='latin1')  # dictionary type

We use a set of functions to augment the amount of data in the training set. These functions have been adapted from https://www.kaggle.com/valentynsichkar/traffic-signs-preprocessed#datasets_preparing.py

In [6]:
# Defining function for equalization training dataset
def equalize_training_dataset(x_train, y_train):
    """Oversample under-represented labels with randomly augmented copies.

    Every label is topped up to ``int(mean examples per label * 2.5)``
    examples. Each added example is a random existing image of that label
    (``random_image``) passed through ``transformation_brightness_rotation``.
    Labels that already reach the target are left untouched.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training images, indexed by example along the first axis.
    y_train : numpy.ndarray
        1-D array of non-negative integer labels (required by ``np.bincount``),
        aligned with ``x_train``.

    Returns
    -------
    tuple
        ``(x_train, y_train)`` with the augmented examples appended after the
        original ones. The inputs are returned unchanged when no label needs
        additional examples.
    """
    # Getting number of examples for every label
    number_of_examples_for_every_label = np.bincount(y_train)
    # All label values 0 .. max(y_train)
    number_of_labels = np.arange(len(number_of_examples_for_every_label))

    # Target size of every label: 2.5 x the average label size.
    # Computed once because it does not change while iterating.
    target_number_of_examples = int(np.mean(number_of_examples_for_every_label) * 2.5)

    # Collecting all new images and labels, then concatenating once at the end.
    # Appending to the full arrays after every label would copy the whole
    # dataset on each iteration, and appending an empty batch to an N-D image
    # array raises ValueError (dimension mismatch).
    x_new = []
    y_new = []

    # Iterating over all labels
    # Showing progress bar with 'tqdm'
    for i in tqdm(number_of_labels):
        # Calculating how many examples need to be added for current label
        # A non-positive value yields an empty range, i.e. nothing is added
        number_of_examples_to_add = target_number_of_examples - number_of_examples_for_every_label[i]

        # Getting random image from current label
        # Transforming it and adding to the collected arrays
        for _ in range(number_of_examples_to_add):
            getting_random_image = random_image(x_train, y_train, i)
            x_new.append(transformation_brightness_rotation(getting_random_image))
            y_new.append(i)

    # Nothing to add: every label already has at least the target size
    if not x_new:
        return x_train, y_train

    x_train = np.concatenate((x_train, np.array(x_new)), axis=0)
    y_train = np.concatenate((y_train, np.array(y_new)), axis=0)

    return x_train, y_train

# Defining function for getting random image of one label
def random_image(x_train, y_train, y_number):
    """Return one uniformly chosen image that carries the label ``y_number``.

    Parameters
    ----------
    x_train : numpy.ndarray
        Images, indexed by example along the first axis.
    y_train : numpy.ndarray
        1-D array of labels aligned with ``x_train``.
    y_number : int
        Label to draw an image for.

    Returns
    -------
    numpy.ndarray
        A copy of the selected image (safe to modify without touching
        ``x_train``).

    Raises
    ------
    ValueError
        If no example in ``y_train`` has the label ``y_number``.
    """
    # Positions in 'y_train' that hold the needed label
    image_indexes = np.flatnonzero(y_train == y_number)
    if image_indexes.size == 0:
        raise ValueError('no examples with label {} to sample from'.format(y_number))
    # Upper bound of 'np.random.randint' is exclusive, so passing the number of
    # examples makes every one of them (including the last) selectable and also
    # works for labels with a single example
    random_index = np.random.randint(0, image_indexes.size)
    # Indexing only the chosen image avoids copying every image of the label
    return x_train[image_indexes[random_index]].copy()

# Defining function for transformation: rotation first, then brightness
def transformation_brightness_rotation(image):
    """Apply a random rotation followed by a random brightness change."""
    rotated = rotation_changing(image)
    return brightness_changing(rotated)

# Defining function for changing brightness
def brightness_changing(image):
    """Scale the brightness of an RGB image by a random factor.

    The image is converted to HSV, its Value channel is multiplied by a factor
    drawn uniformly from [0.25, 1.25), and the result is converted back to RGB.

    Parameters
    ----------
    image : numpy.ndarray
        RGB image accepted by ``cv2.cvtColor`` (this notebook passes uint8 images).

    Returns
    -------
    numpy.ndarray
        New RGB image with changed brightness; the input is not modified.
    """
    # Converting firstly image from RGB to HSV
    image_hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
    # Defining random value for changing brightness
    random_brightness = 0.25 + np.random.uniform()
    # Scaling the Value channel (result is floating point)
    scaled_value = image_hsv[:, :, 2] * random_brightness
    # For integer images the factor can push bright pixels above the dtype
    # maximum; without clipping the cast back would wrap them around to dark values
    if np.issubdtype(image_hsv.dtype, np.integer):
        limits = np.iinfo(image_hsv.dtype)
        scaled_value = np.clip(scaled_value, limits.min, limits.max)
    image_hsv[:, :, 2] = scaled_value
    # Converting HSV changed image to RGB
    image_rgb = cv2.cvtColor(image_hsv, cv2.COLOR_HSV2RGB)
    # Returning image with changed brightness
    return image_rgb

# Defining function for changing rotation of image
def rotation_changing(image, angle_range=25):
    """Rotate an image about its centre by a random angle.

    Parameters
    ----------
    image : numpy.ndarray
        Image whose first two axes are (rows, columns).
    angle_range : float, optional
        Total width of the angle interval in degrees. The angle is drawn
        uniformly from ``[-angle_range / 2, angle_range / 2)``. Defaults to 25.

    Returns
    -------
    numpy.ndarray
        Rotated image with the same width and height as the input.
    """
    # Defining angle rotation, symmetric around zero.
    # 'np.random.uniform(angle_range)' would set only 'low' (high stays 1.0),
    # which skews the interval, so both bounds are passed explicitly.
    angle_rotation = np.random.uniform(-angle_range / 2, angle_range / 2)
    # Getting height and width of image (works with or without a channel axis)
    rows, columns = image.shape[:2]
    # Implementing rotation
    # Calculating Affine Matrix: rotation about the centre, scale factor 1
    affine_matrix = cv2.getRotationMatrix2D((columns / 2, rows / 2), angle_rotation, 1)
    # Warping original image with Affine Matrix
    rotated_image = cv2.warpAffine(image, affine_matrix, (columns, rows))
    # Returning rotated image
    return rotated_image


We perform data augmentation on the training data set

In [7]:
# Seeding NumPy's global random generator so the random augmentation below
# produces the same dataset on every run
np.random.seed(0)

# Implementing equalization of training dataset
# Images are cast to uint8 before augmentation
# (presumably because the OpenCV colour conversion expects 8-bit input - confirm)
x_train, y_train = equalize_training_dataset(raw_data['x_train'].astype(np.uint8), raw_data['y_train'])

# Putting loaded and equalized data into the dictionary
# Equalization is done only for training dataset
d_loaded = {'x_train': x_train, 'y_train': y_train,
            'x_validation': raw_data['x_validation'], 'y_validation': raw_data['y_validation'],
            'x_test': raw_data['x_test'], 'y_test': raw_data['y_test'],
            'labels': raw_data['labels']}

100%|██████████| 43/43 [01:04<00:00,  1.78s/it]


In [8]:
# Checkpoint: serialise the equalized dataset dictionary to a local pickle file
with open('data_equalized.pickle', 'wb') as f:
    pickle.dump(d_loaded, f)
    # Releasing memory
    del d_loaded
    
# Upload the local pickle file to the bucket (it is a pickle, not a feather object)
cos.upload_file(Filename='data_equalized.pickle', Bucket=credentials['BUCKET'], Key='data_equalized.pickle') 

In [9]:
# Fetch the equalized dataset checkpoint from the bucket into the working directory
cos.download_file(Bucket=credentials['BUCKET'], Key='data_equalized.pickle', Filename='data_equalized.pickle')   

# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted bucket.
with open('data_equalized.pickle', 'rb') as f:
    d_loaded = pickle.load(f, encoding='latin1')  # dictionary type

We shuffle the training data. We also shuffle the validation and test data, although this is not necessary.

In [10]:
# Shuffling images and labels of every split together, with a fixed seed
for split_name in ('train', 'validation', 'test'):
    images_key = 'x_' + split_name
    labels_key = 'y_' + split_name
    d_loaded[images_key], d_loaded[labels_key] = shuffle(
        d_loaded[images_key], d_loaded[labels_key], random_state=0)

We transpose the image data so that the color channels come first

In [11]:
# Showing the shape of the image array of every split
for images_key in ('x_train', 'x_validation', 'x_test'):
    print(d_loaded[images_key].shape)

(86989, 32, 32, 3)
(4410, 32, 32, 3)
(12630, 32, 32, 3)


In [12]:
# Moving the channel axis in front of the spatial axes: axes (0, 1, 2, 3) -> (0, 3, 1, 2)
for images_key in ('x_train', 'x_validation', 'x_test'):
    d_loaded[images_key] = np.transpose(d_loaded[images_key], (0, 3, 1, 2))

In [13]:
# Showing the shape of the image array of every split after the transpose
for images_key in ('x_train', 'x_validation', 'x_test'):
    print(d_loaded[images_key].shape)

(86989, 3, 32, 32)
(4410, 3, 32, 32)
(12630, 3, 32, 32)


As the input data consists only of unstructured pixel color values, we can normalize it by dividing by 255 instead of calculating the mean and standard deviation.

In [14]:
# Checkpoint: serialise the shuffled, channel-first dataset dictionary to a local pickle file
with open('data_shuffled.pickle', 'wb') as f:
    pickle.dump(d_loaded, f)
    # Releasing memory
    del d_loaded
    
# Upload the local pickle file to the bucket (it is a pickle, not a feather object)
cos.upload_file(Filename='data_shuffled.pickle', Bucket=credentials['BUCKET'], Key='data_shuffled.pickle') 

In [15]:
# Fetch the shuffled dataset checkpoint from the bucket into the working directory
cos.download_file(Bucket=credentials['BUCKET'], Key='data_shuffled.pickle', Filename='data_shuffled.pickle')   

# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted bucket.
with open('data_shuffled.pickle', 'rb') as f:
    d_loaded = pickle.load(f, encoding='latin1')  # dictionary type

In [16]:
# Scaling the images of every split by 1/255 after converting them to float32
for images_key in ('x_train', 'x_validation', 'x_test'):
    images_as_float = d_loaded[images_key].astype(np.float32)
    d_loaded[images_key] = images_as_float / 255.0

In [17]:
# Checkpoint: serialise the shuffled and 1/255-scaled dataset dictionary to a local pickle file
with open('data_shuffled_scaled255.pickle', 'wb') as f:
    pickle.dump(d_loaded, f)
    # Releasing memory
    del d_loaded
    
# Upload the local pickle file to the bucket (it is a pickle, not a feather object)
cos.upload_file(Filename='data_shuffled_scaled255.pickle', Bucket=credentials['BUCKET'], Key='data_shuffled_scaled255.pickle') 