# ETL notebook

Import libraries

In [None]:
# Importing standard libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

from botocore.client import Config
import ibm_boto3

Define the credentials used to access an IBM Cloud storage bucket

In [None]:
# The code was removed by Watson Studio for sharing.

We define a function that reads RGB image data from a pickle file provided by the customer.

In [3]:
# Defining function for loading dataset from 'pickle' file
def load_rgb_data(file):
    """
    Load an RGB traffic sign dataset from a 'pickle' file.

    The file is expected to contain a dictionary with four keys:
        'features' - is a 4D array with raw pixel data of the traffic sign images,
                     (number of examples, width, height, channels).
        'labels'   - is a 1D array containing the label id of the traffic sign image,
                     file label_names.csv contains id -> name mappings.
        'sizes'    - is a 2D array containing arrays (width, height),
                     representing the original width and height of the image.
        'coords'   - is a 2D array containing arrays (x1, y1, x2, y2),
                     representing coordinates of a bounding frame around the image.

    Parameters
    ----------
    file : path of the 'pickle' file, passed as is to 'open()'.

    Returns
    -------
    x : the 'features' array converted to np.float32,
        for train = (34799, 32, 32, 3)
    y : the 'labels' array as stored, for train = (34799,)
    s : the 'sizes' array as stored, for train = (34799, 2)
    c : the 'coords' array as stored, for train = (34799, 4)

    Raises
    ------
    KeyError : if any of the four keys is missing from the loaded dictionary.
    """
    # Opening 'pickle' file and getting the dictionary
    # NOTE: 'pickle.load' can execute arbitrary code while unpickling,
    # so only files coming from a trusted source should be passed here
    with open(file, 'rb') as f:
        d = pickle.load(f, encoding='latin1')  # we use 'latin1' for python3

    # The file is not needed any more, the arrays are taken from the dictionary
    # Method 'astype()' is used for converting ndarray from int to float
    # It is needed to divide float by float when applying Normalization
    x = d['features'].astype(np.float32)
    y = d['labels']
    s = d['sizes']
    c = d['coords']

    # Returning ready data
    return x, y, s, c

The customer has provided three separate sets for training, validation and testing. We download them from the bucket and load each one into NumPy arrays.

In [4]:
# Downloading the three dataset files and the class names file from the bucket
# Every object is stored locally under the same name as its key
for object_name in ('train.pickle', 'valid.pickle', 'test.pickle', 'label_names.csv'):
    cos.download_file(Bucket=credentials['BUCKET'], Key=object_name, Filename=object_name)

# Loading rgb data from training, validation and test datasets
# Every call gives back features, labels, sizes and coords of one dataset
x_train, y_train, s_train, c_train = load_rgb_data('train.pickle')
x_validation, y_validation, s_validation, c_validation = load_rgb_data('valid.pickle')
x_test, y_test, s_test, c_test = load_rgb_data('test.pickle')

The customer has provided a separate CSV file with the description of every class (traffic sign description). We load it into a pandas DataFrame.

In [5]:
# Reading the comma-separated class names file into a DataFrame
labels = pd.read_csv('label_names.csv', sep=',')

We collect the loaded features and labels of every set, together with the class names, in a single dictionary

In [6]:
# Collecting features and labels of every dataset together with the class names
# The arrays are stored exactly as they were loaded above
d_loaded = dict(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
    x_test=x_test,
    y_test=y_test,
    labels=labels,
)

We save the dictionary with all the data into a pickle file and upload it to the bucket

In [8]:
# Writing the dictionary with the loaded data into a local 'pickle' file
with open('raw_data.pickle', 'wb') as f:
    pickle.dump(d_loaded, f)

# Dropping the dictionary name once it has been written to disk
del d_loaded

# Uploading the 'pickle' file to the bucket under the same name
cos.upload_file(
    Filename='raw_data.pickle',
    Bucket=credentials['BUCKET'],
    Key='raw_data.pickle',
)