# DATA AUGMENTATION

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import fetch_openml
import warnings 
warnings.filterwarnings("ignore")

In [17]:
## Loading MNIST dataset
## Fetches the OpenML dataset named "mnist_784" (version 1) via scikit-learn.
## NOTE(review): presumably downloads on first use and is cached afterwards - confirm.
mnist = fetch_openml("mnist_784", version=1)

In [18]:
## Pull the feature table and the label column out of the fetched dataset
X = mnist["data"]
y = mnist["target"]

In [19]:
## The labels are strings, so cast them to unsigned 8-bit integers
y = y.astype(np.uint8)

In [20]:
## Split into training and test sets: the first 60000 samples are used for
## training, everything after them for testing
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [47]:
## Convert the train/test splits to plain NumPy arrays.
## np.asarray is used instead of .to_numpy() so that this cell is idempotent:
## once a split is already an ndarray (which has no .to_numpy()), re-running
## the cell simply returns it unchanged instead of raising AttributeError.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [30]:
## Scratch check: list replication yields four copies of the first label
4*[y_train[0]]

[5, 5, 5, 5]

In [61]:
## Display the training features (NumPy abbreviates large arrays with "...")
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
## Alias (not a copy) of y_train, used by the concatenation experiment below
g = y_train

In [33]:
## Scratch check: appending a 4-element list of labels to the label array
## returns a new, longer array (the result is only displayed, not stored)
np.concatenate((g, 4*[y_train[0]]))

array([5, 0, 4, ..., 5, 5, 5], dtype=uint8)

In [22]:
from scipy.ndimage import shift

In [49]:
## Confirm the element type of the labels after the uint8 cast
type(y_train[0])

numpy.uint8

In [69]:
def shiftingImage(image, down, right):
    """Return four shifted copies of a flattened 28x28 image.

    The input is viewed as a 28x28 grid and translated in each of the four
    axis directions. Pixels that enter the frame from outside are filled
    with 0 (``mode="constant"``, ``cval=0.0``) for every direction.

    Parameters
    ----------
    image : array-like with 784 elements
        Flattened 28x28 image. Any array-like is accepted, not only ndarrays.
    down : int or float
        Magnitude of the vertical shift; used as ``+down`` for the "down"
        copy and ``-down`` for the "up" copy.
    right : int or float
        Magnitude of the horizontal shift; used as ``+right`` for the
        "right" copy and ``-right`` for the "left" copy.

    Returns
    -------
    numpy.ndarray
        Array of shape (4, 784) whose rows are, in order, the image shifted
        right, left, down and up.
    """
    grid = np.asarray(image).reshape(28, 28)

    # (row offset, column offset) for each copy; the order defines the row
    # order of the returned array.
    offsets = (
        [0, right],    # right
        [0, -right],   # left
        [down, 0],     # down
        [-down, 0],    # up
    )

    # The fill behaviour is spelled out for all four shifts so none of them
    # depends on the library's implicit default.
    shifted_rows = [
        shift(grid, offset, mode="constant", cval=0.0).reshape(1, 784)
        for offset in offsets
    ]

    return np.concatenate(shifted_rows, axis=0)

In [28]:
## Number of training images (rows of X_train)
X_train.shape[0]

60000

In [42]:
## X_train is converted to a NumPy array in an earlier cell, and ndarrays have
## no .to_numpy(), so calling it here fails when the notebook is run top to
## bottom. np.asarray works whether X_train is still a DataFrame or already
## an ndarray.
b = np.asarray(X_train)
b

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [72]:
## Scratch check: collect the four shifted rows of the first image in a list,
## then add the same four rows a second time (8 entries in total)
t = shiftingImage(X_train[0], 1, 1)
to = list(t)
for i in t:
    to += [i]
    

In [73]:
## Number of rows collected in the scratch list
len(to)

8

In [80]:
## Build the augmented training set: every original image, followed by four
## one-pixel shifted copies (right, left, down, up) of each image in row order.
X_train_shifted = [img for img in X_train]

## Iterate over however many rows X_train actually has instead of a
## hard-coded 60000.
for row in range(len(X_train)):
    ## shiftingImage returns a (4, 784) array; extend() appends its 4 rows.
    X_train_shifted.extend(shiftingImage(X_train[row], 1, 1))

## Labels: the originals, then each label repeated 4 times so that it lines up
## with the 4 shifted copies appended above. Building this once with np.repeat
## replaces one np.concatenate per image, which re-copied the whole growing
## label array on every iteration (quadratic work).
y_train_shifted = np.concatenate((y_train, np.repeat(y_train, 4)))

In [81]:
## Materialise the augmented features and labels as NumPy arrays
X_train_shifted, y_train_shifted = (
    np.array(X_train_shifted),
    np.array(y_train_shifted),
)

In [82]:
## Shape of the augmented features: original rows plus four shifted copies of each
X_train_shifted.shape

(300000, 784)

In [83]:
## Shape of the augmented labels; the length must match the rows of X_train_shifted
y_train_shifted.shape

(300000,)

In [79]:
## Original (un-augmented) label count, for comparison
y_train.shape

(60000,)

In [84]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
from sklearn.neighbors import KNeighborsClassifier

In [87]:
## k-nearest-neighbours classifier using 4 neighbours and weights='distance'
## (per scikit-learn, closer neighbours get more influence on the vote)
knn_clf = KNeighborsClassifier(n_neighbors= 4, weights = 'distance')

In [88]:
## Fit KNN on the augmented training set
knn_clf.fit(X_train_shifted, y_train_shifted)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [89]:
## Predict labels for the test images (the test set itself is not augmented)
y_pred_knn = knn_clf.predict(X_test)

In [90]:
from sklearn.metrics import accuracy_score

In [91]:
## Accuracy of the KNN predictions against the true test labels
accuracy_score(y_test, y_pred_knn)

0.9763

We can observe an improvement in the accuracy score: it rose from 97.14% with the original dataset to 97.63% with data augmentation.

# Let's use the Random Forest Classifier

In [92]:
## Random forests are stochastic; fix the seed so the fitted model, and the
## accuracy computed from it, are reproducible from one run to the next.
rf_clf = RandomForestClassifier(random_state=42)

In [93]:
## Fit the random forest on the same augmented training set used for KNN
rf_clf.fit(X_train_shifted, y_train_shifted)

RandomForestClassifier()

In [94]:
## Predict labels for the test images with the random forest
y_pred_rf = rf_clf.predict(X_test)

In [95]:
## Accuracy of the random-forest predictions against the true test labels
accuracy_score(y_test, y_pred_rf)

0.9806

The Random Forest (an ensemble technique based on bagging) performs very well on the test data. Its accuracy (98.06%) exceeds that of the KNN classifier (97.63%) by 0.43 percentage points.