# Data Dictionary:
## **1. Download Data**
> train_dataset    |   torchvision.datasets.mnist.FashionMNIST<br>
eval_dataset       |   torchvision.datasets.mnist.FashionMNIST

## **2. Prepare Data**


> train_images     |   numpy array (60000, 784)
<br>train_labels   |   numpy array (60000, )
<br>eval_images    |   numpy array (10000, 784)
<br>eval_labels    |   numpy array (10000, )
<br> standardized_train_images | numpy array (60000, 784)
<br> standardized_eval_images |   numpy array (10000, 784)




# **To-Do**
1. Test PCA with sharpened data
2. Test Feature selection on PCs

3. Add F1 score and confusion matrix to the evaluation output

In [0]:
# Disabled scratch cell. If uncommented, the loop below appends to a list
# forever and never terminates, so memory use grows without bound.
# NOTE(review): presumably kept to deliberately exhaust RAM on a hosted
# runtime — confirm the intent or delete the cell.
# a = []
# while(1):
#     a.append('1')

# Importing of Libraries

In [2]:
import torchvision
from PIL import Image

import pandas as pd
import numpy as np
import scipy as sp
from scipy import ndimage
from scipy.stats import norm
import scipy.signal as sig

from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from skimage import io
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import time, random, math


  import pandas.util.testing as tm


In [0]:
%matplotlib inline

# Functions

In [0]:
def print_time_taken(start_time):
    """Print the wall-clock time elapsed since ``start_time``.

    Parameters
    ----------
    start_time : float
        A timestamp previously obtained from ``time.time()``.

    The elapsed time is reported as whole minutes plus the remaining
    (fractional) seconds. Nothing is returned.
    """
    elapsed = time.time() - start_time
    # math.floor keeps the minute count an int so it prints as "2", not "2.0".
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed % 60
    print('time taken:', str(whole_minutes), 'min ', str(leftover_seconds), 's')

# Data Extraction

In [5]:
# Fetch Fashion-MNIST into the current directory (downloaded only if missing).
# train=True selects the training split.
train_dataset = torchvision.datasets.FashionMNIST(
    root='./',
    train=True,
    download=True,
)

# train=False selects the held-out evaluation split.
eval_dataset = torchvision.datasets.FashionMNIST(
    root='./',
    train=False,
    download=True,
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./FashionMNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./FashionMNIST/raw/train-images-idx3-ubyte.gz to ./FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./FashionMNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./FashionMNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./FashionMNIST/raw/t10k-labels-idx1-ubyte.gz




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./FashionMNIST/raw
Processing...
Done!


In [0]:
# Human-readable class names keyed by the integer label (0-9); the position
# of each name in the list is its label id.
labels_dict = dict(enumerate([
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]))

# Data Preparation

**Reshape data from 28x28 into 784**

In [0]:
# Flatten each 28x28 training image into a single 784-value row (one row per
# image) and pull the labels out as a plain NumPy vector.
train_images = train_dataset.data.numpy().reshape(len(train_dataset.data), 28 * 28)
train_labels = train_dataset.targets.data.numpy()

In [8]:
# Sanity check: expect (60000, 784), i.e. one flattened 784-pixel row per training image.
train_images.shape

(60000, 784)

In [9]:
# Sanity check: expect (60000,), i.e. one label per training image.
train_labels.shape

(60000,)

In [0]:
# Same flattening for the evaluation split: one 784-value row per image,
# labels as a plain NumPy vector.
eval_images = eval_dataset.data.numpy().reshape(len(eval_dataset.data), 28 * 28)
eval_labels = eval_dataset.targets.data.numpy()

In [11]:
# Sanity check: expect (10000, 784), i.e. one flattened 784-pixel row per evaluation image.
eval_images.shape

(10000, 784)

In [12]:
# Sanity check: expect (10000,), i.e. one label per evaluation image.
eval_labels.shape

(10000,)

**Standardize data**

In [0]:
# Learn the per-pixel mean and standard deviation from the TRAINING images
# only, then apply those same statistics to the evaluation images.
# Fitting a second scaler on the evaluation set (as was done before) scales
# the two splits differently and leaks evaluation statistics into the
# preprocessing, so scores would not reflect performance on unseen data.
scaler = StandardScaler()
standardized_train_images = scaler.fit_transform(train_images)
standardized_eval_images = scaler.transform(eval_images)

# Support Vector Machine

In [0]:
# Fixed seed handed to the random_state arguments below so repeated runs give the same results.
# NOTE(review): a seed is not something to optimise; 42 looks like an arbitrary choice — confirm.
random_seed = 42

**Run SVM on Untreated Data**

In [0]:
# svm = SVC(kernel='linear', random_state=random_seed)

In [0]:
# cvs = cross_val_score(svm, train_images, train_labels,scoring='precision_macro')
# print(np.mean(cvs))

In [0]:
# start = time.time()

In [0]:
# svm.fit(train_images,train_labels)

In [0]:
# y_pred_1 = svm.predict(eval_images)

# print('Model classification accuracy: {}%'.format(str(metrics.accuracy_score(eval_labels, y_pred_1) * 100)))
# f1 = f1_score(eval_labels, y_pred_1,average='macro')
# print('F1 score: %f' % f1)

# print_time_taken(start)

**Run SVM on Standardized Data**

In [0]:
# Linear-kernel support vector classifier used for the standardized-data experiment.
svm = SVC(kernel='linear', random_state=random_seed)

In [21]:
# Cross-validate on the STANDARDIZED training images, which is what this
# section evaluates (the raw `train_images` were passed here before).
# The mean macro-averaged precision across the folds is printed.
# NOTE(review): the scaler was fitted on the full training set before
# splitting, so the folds share scaling statistics; wrap scaler + SVC in a
# Pipeline if strictly leak-free CV scores are needed.
cvs = cross_val_score(svm, standardized_train_images, train_labels, scoring='precision_macro')
print(np.mean(cvs))

SyntaxError: ignored

In [0]:
# Start the wall-clock timer; print_time_taken(start) reads it after fit + predict.
start = time.time()

In [0]:
# Train the SVM on the standardized training images.
svm.fit(standardized_train_images,train_labels)

In [0]:
# Predict the evaluation set with the SVM trained on standardized pixels,
# then report accuracy (as a percentage), macro-averaged F1 and elapsed time.
y_pred_2 = svm.predict(standardized_eval_images)

print('Model classification accuracy: {}%'.format(
    str(100 * metrics.accuracy_score(y_true=eval_labels, y_pred=y_pred_2))))
f1 = f1_score(y_true=eval_labels, y_pred=y_pred_2, average='macro')
print('F1 score: %f' % f1)

print_time_taken(start)

**Run SVM with PCA**

In [0]:
# Keep the 10 leading principal components. random_state pins the result
# when scikit-learn selects a randomized SVD solver for data of this size.
pca = PCA(n_components=10, random_state=random_seed)

# Learn the projection from the TRAINING images only and reuse it for the
# evaluation images. Calling fit_transform on the evaluation set (as was done
# before) refits PCA and yields components in a different basis, so a
# classifier trained on pca_train_images cannot be applied meaningfully
# to pca_eval_images.
pca_train_images = pca.fit_transform(standardized_train_images)
pca_eval_images = pca.transform(standardized_eval_images)

In [0]:
# Fresh linear-kernel SVC for the PCA experiment; rebinding `svm` replaces the previously fitted model.
svm = SVC(kernel='linear', random_state=random_seed)

In [0]:
# Restart the wall-clock timer for the PCA experiment; read later by print_time_taken(start).
start = time.time()

In [0]:
# Train the SVM on the PCA-projected training images.
svm.fit(pca_train_images,train_labels)

In [0]:
# Predict the PCA-projected evaluation set, then report accuracy (as a
# percentage), macro-averaged F1 and elapsed time.
y_pred_3 = svm.predict(pca_eval_images)

print('Model classification accuracy: {}%'.format(
    str(100 * metrics.accuracy_score(y_true=eval_labels, y_pred=y_pred_3))))
f1 = f1_score(y_true=eval_labels, y_pred=y_pred_3, average='macro')
print('F1 score: %f' % f1)

print_time_taken(start)

**SVM GridSearchCV**

Testing Linear C Values

In [0]:
# Candidate hyper-parameter values. Only the linear kernel is searched for
# now; gamma_values and degree_values are referenced solely by the disabled
# kernel grids kept below for later experiments.
c_values = [0.0001, 0.01, 0.1, 1, 10]
gamma_values = ['auto', 'scale']
degree_values = np.arange(1, 3)  # -> array([1, 2])
# kernel_types = ['linear','rbf','poly','sigmoid']
# param_grid = dict(kernel=kernel_types, gamma=gamma_range, C=c_range,degree=degree_values)
tuning_param_grid = [
    dict(kernel=['linear'], C=c_values),
    #  {'kernel':['rbf'], 'C':c_values, 'gamma':gamma_values}]
    #  {'kernel':['poly'],'degree':degree_values, 'C':c_values, 'gamma':gamma_values}]
    #  {'kernel':['sigmoid'], 'C':c_values, 'gamma':gamma_values}]
]

# Base estimator whose parameters the search overrides, plus a seeded
# stratified splitter: 5 random splits with 20% of each held out.
svm = SVC()
cv = StratifiedShuffleSplit(random_state=random_seed, test_size=0.2, n_splits=5)

In [0]:
# Start the wall-clock timer for the grid search; read later by print_time_taken(start).
start = time.time()

In [0]:
# Exhaustive search over tuning_param_grid using the splitter in `cv`,
# fitted on the standardized training images.
grid = GridSearchCV(
    estimator=svm,
    param_grid=tuning_param_grid,
    cv=cv,
    n_jobs=-1,    # run on all available cores
    verbose=10,   # log progress for each fit
)
grid.fit(standardized_train_images, train_labels)

In [0]:
# Report the winning estimator and its score on the held-out evaluation set.
# The fitted search object is `grid` and the evaluation data are
# `standardized_eval_images` / `eval_labels`; the names used here before
# (`grid_rbf`, `test`, `testt`) are not defined anywhere in this notebook
# and raised NameError.
print("The best classifier is: ", grid.best_estimator_)
print('Score of best classifier :', grid.score(standardized_eval_images, eval_labels))
print_time_taken(start)

In [0]:
# grid.best_params_.get("kernel")
# grid.best_params_.get("gamma")
# grid.best_params_.get("C")
# grid.best_params_.get("degree")
# grid.best_params_

In [0]:
# for param, score in zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score']):
#     print(param, score)

**Recursive feature elimination**


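Sketch, not yet run: it needs `from sklearn.ensemble import RandomForestClassifier`, and `X`, `y` must be replaced with the training features and labels.

```python
m = RFECV(RandomForestClassifier(), scoring='accuracy')
m.fit(X, y)
```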