# Large Scale Data Analysis - Assignment 1

## Task 1 - Implement image classifier with scikit-learn

In [1]:
# quick imports
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the train and test arrays from disk
train_all = np.load('fashion_train.npy')
test_all = np.load('fashion_test.npy')

# The last column of each array is the label; all other columns are features
train_featues, train_labels = train_all[:, :-1], train_all[:, -1]
test_features, test_labels = test_all[:, :-1], test_all[:, -1]

In [3]:
# import sklearn elements for preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# for classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# for validation and evaluation
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

Before classification I used a combination of standard scaling and Principal Component Analysis (PCA) to normalize the data to some degree and to keep only a smaller number of features instead of the original 28 * 28 for each picture. I experimented with different numbers of PCA components, and performance kept increasing until the number of components reached 50.

In [4]:
# Preprocessing: standardise every feature column, then project the data onto
# 50 principal components.
# random_state pins PCA's randomized SVD solver (which svd_solver="auto" may
# select for data of this size) so re-running the notebook gives the same components.
# NOTE(review): the scaler and PCA are fitted on the full training set and the
# cross-validation cells reuse this already-transformed data, so every
# validation fold has influenced the preprocessing.
preprocessing_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=50, random_state=42)),
])

processed_train_data = preprocessing_pipeline.fit_transform(train_featues)

I trained four classifiers (SVM, RandomForest, DecisionTree, LogisticRegression) and examined how their performance changed with different parameters. I compared their accuracy scores from 5-fold cross-validation. SVM with a polynomial kernel and RandomForest with a maximum tree depth of 10 seemed to be the best candidates. For SVC I also tried a Gaussian kernel, but it was outperformed by the polynomial kernel with the chosen parameters. LogisticRegression worked much better when I applied the 'l2' penalty term.

In [5]:
# L2-penalised logistic regression, scored by accuracy over 5 cross-validation folds
lr_classifier = LogisticRegression(tol=0.00001, penalty='l2')
cross_val_score(
    estimator=lr_classifier,
    X=processed_train_data,
    y=train_labels,
    scoring='accuracy',
    cv=5,
)

array([0.8215, 0.818 , 0.8195, 0.825 , 0.819 ])

In [6]:
# RandomForestClassifier with cross validation.
# random_state fixes the forest's internal randomness (bootstrap samples and
# per-split feature subsets) so the fold accuracies are repeatable across runs.
rf_classifier = RandomForestClassifier(max_depth=10, random_state=42)
cross_val_score(rf_classifier, processed_train_data, train_labels, cv=5, scoring='accuracy')

array([0.8385, 0.832 , 0.839 , 0.841 , 0.8395])

In [7]:
# DecisionTreeClassifier with cross validation.
# random_state makes the tree deterministic: scikit-learn permutes the features
# at every split, so without a seed two runs can produce different trees.
dc_classifier = DecisionTreeClassifier(max_depth=10, random_state=42)
cross_val_score(dc_classifier, processed_train_data, train_labels, cv=5, scoring='accuracy')

array([0.764 , 0.773 , 0.7635, 0.77  , 0.7675])

In [8]:
# SVM classifier (degree-3 polynomial kernel), scored by accuracy over 5 cross-validation folds
sv_classifier = SVC(C=5, kernel='poly', degree=3, coef0=2)
cross_val_score(
    estimator=sv_classifier,
    X=processed_train_data,
    y=train_labels,
    scoring='accuracy',
    cv=5,
)

array([0.8675, 0.8535, 0.864 , 0.8535, 0.8535])

In [9]:
# Apply the already-fitted scaler + PCA to the test features (transform only, no refitting)
processed_test_data = preprocessing_pipeline.transform(X=test_features)

For the final evaluation I trained the classifiers with the selected parameters on the whole training data and measured their accuracy, macro precision, macro recall and confusion matrix.
The best results were produced by the SVM classifier, which performed very well on class 1 and class 3.

In [10]:
# Logistic regression: fit on the processed training data, then score the test predictions
lr_test_y = lr_classifier.fit(processed_train_data, train_labels).predict(processed_test_data)
print(
    f"acc = {accuracy_score(test_labels, lr_test_y)}",
    f"precision = {precision_score(test_labels, lr_test_y, average= 'macro')}",
    f"recall = {recall_score(test_labels, lr_test_y, average = 'macro')}",
    confusion_matrix(test_labels, lr_test_y),
    sep="\n",
)

acc = 0.8122
precision = 0.8107132210685485
recall = 0.8122
[[810   2  12  64 112]
 [  5 953  10  27   5]
 [ 23   4 808  18 147]
 [ 32  17   7 881  63]
 [169   5 164  53 609]]


In [11]:
# SVM: fit on the processed training data, then score the test predictions
sv_test_y = sv_classifier.fit(processed_train_data, train_labels).predict(processed_test_data)
print(
    f"acc = {accuracy_score(test_labels, sv_test_y)}",
    f"precision = {precision_score(test_labels, sv_test_y, average= 'macro')}",
    f"recall = {recall_score(test_labels, sv_test_y, average = 'macro')}",
    confusion_matrix(test_labels, sv_test_y),
    sep="\n",
)

acc = 0.845
precision = 0.8440280874600662
recall = 0.845
[[824   6  17  38 115]
 [  4 966   4  20   6]
 [ 30   4 845  23  98]
 [ 35   9  13 910  33]
 [155   5 122  38 680]]


In [12]:
# Random forest: fit on the processed training data, then score the test predictions
rf_test_y = rf_classifier.fit(processed_train_data, train_labels).predict(processed_test_data)
print(
    f"acc = {accuracy_score(test_labels, rf_test_y)}",
    f"precision = {precision_score(test_labels, rf_test_y, average= 'macro')}",
    f"recall = {recall_score(test_labels, rf_test_y, average = 'macro')}",
    confusion_matrix(test_labels, rf_test_y),
    sep="\n",
)

acc = 0.8222
precision = 0.8207599478550769
recall = 0.8221999999999999
[[810   2  15  63 110]
 [  7 936   9  41   7]
 [ 18   0 849  20 113]
 [ 28  11   5 912  44]
 [186   1 163  46 604]]


In [13]:
# Decision tree: fit on the processed training data, then score the test predictions
dc_test_y = dc_classifier.fit(processed_train_data, train_labels).predict(processed_test_data)
print(
    f"acc = {accuracy_score(test_labels, dc_test_y)}",
    f"precision = {precision_score(test_labels, dc_test_y, average= 'macro')}",
    f"recall = {recall_score(test_labels, dc_test_y, average = 'macro')}",
    confusion_matrix(test_labels, dc_test_y),
    sep="\n",
)

acc = 0.771
precision = 0.7744361861506388
recall = 0.771
[[722   9  21  88 160]
 [ 11 906  17  46  20]
 [ 24   7 751  37 181]
 [ 30  14  19 863  74]
 [160   8 164  55 613]]


## Task 2 - Keras for image classification

For this task I implemented a simple network with two dense layers (each followed by a dropout layer) plus a softmax layer at the end. I got the best results using ReLU as the activation function for both dense layers; the hyperbolic tangent gave worse results.
To avoid overfitting I chose to use dropout; without it the network tended to overfit after epoch 20.
The reason I did not move to a more advanced network (CNN) is that I wanted to see the power of a simple network, and even this network outperformed the best ML classifier, with 86% accuracy versus 84% for the polynomial-kernel SVM.

In [14]:
# Divide by 255 so the inputs are scaled to 0-1 (assumes 8-bit pixel values — TODO confirm)
train_featues_scaled = train_featues / 255.0
test_features_scaled = test_features / 255.0

# Hold out every training row from index 7000 onwards as a development set
train_featues_scaled_train, train_featues_scaled_dev = np.split(train_featues_scaled, [7000])
train_labels_train, train_labels_dev = np.split(train_labels, [7000])

In [15]:
import tensorflow as tf
from tensorflow import keras

In [16]:
# Fully connected network: 784 inputs -> 300 -> 100 -> 5-way softmax,
# with 10% dropout after each hidden layer
model = keras.models.Sequential()
model.add(keras.layers.Dense(300, activation="relu", input_shape=[784]))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(5, activation="softmax"))

In [17]:
# Sparse categorical cross-entropy loss with a plain SGD optimiser, tracking accuracy.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
# rejected by newer Keras releases.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=["accuracy"],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 505       
Total params: 266,105
Trainable params: 266,105
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for 30 epochs on the training split, validating on the development split each epoch
history = model.fit(
    x=train_featues_scaled_train,
    y=train_labels_train,
    validation_data=(train_featues_scaled_dev, train_labels_dev),
    epochs=30,
    verbose=0,
)

In [19]:
# Fit for another 30 epochs on the complete training set. `model` is not
# re-created here, so training continues from its current weights, and the
# previous `history` object is overwritten.
history = model.fit(
    x=train_featues_scaled,
    y=train_labels,
    epochs=30,
    verbose=0,
)

In [20]:
# Evaluate on the test set; with metrics=["accuracy"] the result is [loss, accuracy]
acc = model.evaluate(x=test_features_scaled, y=test_labels, verbose=0)

In [21]:
# `acc` is the list returned by model.evaluate: test loss first, then test accuracy
print(acc)

[0.3916806790351868, 0.8588]
