In [2]:
import os
import pandas as pd
import numpy as np

from keras.initializers import he_uniform
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.regularizers import l2
from keras.layers import Lambda, Dense, Dropout, Flatten, BatchNormalization, Activation
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# One seed for the whole notebook: it fixes NumPy's global RNG here and is
# reused as `random_state` by the stratified splits further down.
seed = 11
np.random.seed(seed)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1. Getting Data

1.1 Fetched datasets from local files

In [3]:
# Root folder of the input data; every other path is derived from it.
dataset_path = '../datasets/'

# Labelled training CSV and the CSV that is later loaded as the evaluation data.
train_path, test_path = (
    os.path.join(dataset_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Sub-folder from which the augmentation CSVs are read.
augmentation_path = os.path.join(dataset_path, 'augmentations')

In [4]:
# Load both CSV files (training file first, then the evaluation file).
train_data, eval_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check on the size of the training frame.
print(train_data.shape)

(42000, 785)


1.2 Transformed datasets to deep learning friendly format

In [5]:
# Turn both frames into float32 NumPy matrices (same row/column layout as the CSVs).
train_set, eval_set = (
    frame.values.astype(np.float32) for frame in (train_data, eval_data)
)

1.3 Created dev and test sets

In [7]:
# Stratified 70/30 shuffle split; column 0 is passed as the stratification target,
# the remaining columns as the features.
split = StratifiedShuffleSplit(n_splits=1, test_size=.3, random_state=seed)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(train_set[:, 1:], train_set[:, 0]))

# NOTE: `train_set` is overwritten, so re-running this cell on its own would
# split the already reduced array again.
test_set = train_set[test_index]
train_set = train_set[train_index]

In [8]:
# Cut the held-out rows in half (stratified on column 0): one half becomes the
# dev set, the other half stays as the test set.
split = StratifiedShuffleSplit(n_splits=1, test_size=.5, random_state=seed)

# A single split is requested, so take the only (dev, test) index pair.
dev_index, test_index = next(split.split(test_set[:, 1:], test_set[:, 0]))

# NOTE: `test_set` is overwritten, so this cell is not safe to re-run in isolation.
dev_set = test_set[dev_index]
test_set = test_set[test_index]

In [9]:
# Report the shape of each split, one per line.
print('\n'.join(
    '{} {}'.format(caption, part.shape)
    for caption, part in (
        ('train set:\t', train_set),
        ('test set:\t', test_set),
        ('dev set:\t', dev_set),
    )
))

train set:	 (29400, 785)
test set:	 (6300, 785)
dev set:	 (6300, 785)


1.4 Rebuilt `train_data` from the training split only, removing the instances that now belong to the test and dev sets, so the network never learns from them.

In [10]:
# Re-wrap the training split in a DataFrame that reuses the original column labels;
# from here on `train_data` only holds the rows kept in `train_set`.
train_data = pd.DataFrame(
    data=train_set,
    columns=train_data.columns,
)

 __Notes__:
 
 1. Successful set up of dev and test set.
 2. Training set will be extended using augmented data 
 3. Training Dev set will be created

## 2. Data Augmentation

#### 2.1 Shifted Augmentations

2.1.1 Fetched shifted augmentations from local file

In [12]:
# Location of the augmentation CSV inside the augmentation folder.
shifted_class_every_aug_filepath = os.path.join(augmentation_path, 'class-every.csv')

# Load the augmented rows; they are appended to the training frame in a later cell.
shifted_class_every_aug_data = pd.read_csv(
    filepath_or_buffer=shifted_class_every_aug_filepath
)

2.1.2 Appended shifted augmentation to train set

In [13]:
# Stack the augmented rows under the training rows.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` performs the same row-wise stacking (original row labels are kept).
# NOTE: `train_data` is overwritten, so re-running this cell adds the augmentations again.
train_data = pd.concat([train_data, shifted_class_every_aug_data])

# Refresh the float32 matrix so that it includes the augmented rows.
train_set = train_data.values.astype(np.float32)

#### 2.2 Created train and train-dev set

In [14]:
# Hold out 12% of the (augmented) training rows as the train-dev set,
# stratified on column 0.
split = StratifiedShuffleSplit(n_splits=1, test_size=.12, random_state=seed)

# Only one split is generated, so take the single (train, train-dev) index pair.
train_index, train_dev_index = next(split.split(train_set[:, 1:], train_set[:, 0]))

# NOTE: `train_set` is overwritten, so re-running this cell alone shrinks it again.
train_dev_set = train_set[train_dev_index]
train_set = train_set[train_index]

In [15]:
# Row counts of every split and their sum.
train_size = len(train_set)
train_dev_size = len(train_dev_set)
dev_size = len(dev_set)
test_size = len(test_set)

total_size = train_dev_size + train_size + dev_size + test_size

# Column vectors (shape (5, 1)), one entry per table row.
labels = np.array(['training set', 'training dev set', 'dev set', 'test set', 'total']).reshape(-1, 1)
sizes = np.array([train_size, train_dev_size, dev_size, test_size, total_size]).reshape(-1, 1)

# Scale to percent first and round afterwards. Rounding to two decimals and then
# multiplying by 100 can leave values such as 28.999999..., which the integer
# cast would truncate to 28.
perc = np.round(sizes / total_size * 100).astype(np.int32)

# Build the table from typed columns so 'Size' and 'Perc' stay numeric
# (stacking them with the string labels would turn every column into strings).
pd.DataFrame(
    {'Set': labels.ravel(), 'Size': sizes.ravel(), 'Perc': perc.ravel()},
    columns=['Set', 'Size', 'Perc'],
)

Unnamed: 0,Set,Size,Perc
0,training set,141215,82
1,training dev set,19257,11
2,dev set,6300,4
3,test set,6300,4
4,total,173072,100


## 3. Preprocessing Data

### 3.1 Normalization

__Multiple approaches for normalization can be used:__

1. Min-Max Scaling, mostly (0,1)
   
2. Standardize the pixel distribution to zero mean and unit variance

__Tip: scale data to activation function's range__
    

#### 3.1.1 Min-Max Scaling

In [7]:
def min_max_scale(set_, mn=0, mx=255):
    """Linearly rescale ``set_`` so that ``mn`` maps to 0 and ``mx`` maps to 1.

    Parameters
    ----------
    set_ : scalar or array-like supporting ``-`` and ``/``
        Values to rescale.
    mn : number, default 0
        Input value that is mapped to 0.
    mx : number, default 255
        Input value that is mapped to 1.

    Returns
    -------
    ``(set_ - mn) / (mx - mn)``. Nothing is clipped, so inputs outside
    ``[mn, mx]`` end up outside ``[0, 1]``.
    """
    shifted = set_ - mn
    value_range = mx - mn
    return shifted / value_range

In [17]:
# Scale every column except column 0 of each split, writing the result back in place.
# NOTE: the arrays are modified in place, so running this cell twice scales the data twice.
for subset in (train_set, train_dev_set, dev_set, test_set):
    subset[:, 1:] = min_max_scale(subset[:, 1:])

In [8]:
# The evaluation matrix is scaled as a whole (all columns), using the same 0..255 range.
# NOTE: `eval_set` is overwritten, so re-running this cell scales it a second time.
eval_set = min_max_scale(eval_set, mn=0, mx=255)

Saving the scaled sets to the payload directory

In [9]:
# Output folder for the saved .npz payloads, and a short tag for the min-max scaled
# variant (the tag is not used by the cells visible here).
payload_path, scaled_path = '../payload', 'mnmxs'

In [19]:
# np.savez does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs(payload_path, exist_ok=True)
filepath = os.path.join(payload_path, 'normal-payload.npz')

# Store the four scaled splits in one archive, keyed by their variable names.
np.savez(
    filepath,
    train_set=train_set,
    test_set=test_set,
    dev_set=dev_set,
    train_dev_set=train_dev_set,
)

In [10]:
# np.savez does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs(payload_path, exist_ok=True)
filepath = os.path.join(payload_path, 'normal-eval-payload.npz')

# Store the scaled evaluation matrix in its own archive under the key 'eval_set'.
np.savez(filepath, eval_set=eval_set)

#### 3.1.2 Standardization