# Large Scale Data Analysis
## Assignment 1

## Task 1 - Implement image classifier with scikit-learn

In [15]:
# quick imports
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
# Load the train and test arrays from disk and split each into features
# (every column but the last) and labels (the last column).
# NOTE: the misspelt name `train_featues` is kept on purpose, later cells use it.
train_all = np.load('fashion_train.npy')
train_featues, train_labels = train_all[:, :-1], train_all[:, -1]

test_all = np.load('fashion_test.npy')
test_features, test_labels = test_all[:, :-1], test_all[:, -1]

In [17]:
# import sklearn elements for preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# for classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# for validation and evaluation
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

Before classification I used a combination of standard scaling and Principal Component Analysis (PCA) to normalize the data to some degree and to keep only a smaller number of features instead of the original 28 * 28 values of each picture. I experimented with different numbers of PCA components and observed an increase in performance until I had raised the number of components to 50.

In [26]:
# Preprocessing: standardise every feature, then project onto the 50 leading
# principal components.
# random_state pins the PCA result whenever scikit-learn picks its randomized
# SVD solver, so re-running the notebook reproduces the same projection.
# NOTE(review): the pipeline is fitted on the whole training set, so the
# cross-validation cells further down score folds whose scaler/PCA statistics
# were computed with the held-out fold included - confirm this is acceptable.
preprocessing_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=50, random_state=42))
])

processed_train_data = preprocessing_pipeline.fit_transform(train_featues)

I trained 4 classifiers (SVM, RandomForest, DecisionTree, LogisticRegression) and also examined how their performance changed with different parameters. I compared their accuracy scores from 5-fold cross-validation. SVM with a polynomial kernel and RandomForest with a maximum tree depth of 10 seemed to be the best candidates. For the SVC I also tried a Gaussian kernel, but it was outperformed by the polynomial kernel with the chosen parameters. LogisticRegression worked much better when I applied the 'l2' penalty term.

In [27]:
# Logistic regression (L2 penalty, tolerance 1e-5) scored by 5-fold
# cross-validated accuracy on the preprocessed training data.
lr_classifier = LogisticRegression(penalty='l2', tol=1e-5)
lr_cv_scores = cross_val_score(
    lr_classifier,
    processed_train_data,
    train_labels,
    cv=5,
    scoring='accuracy',
)
lr_cv_scores

array([0.82  , 0.8135, 0.82  , 0.822 , 0.82  ])

In [28]:
# RandomForestClassifier with cross validation
# random_state fixes the bootstrap samples and feature sub-sampling, so the
# fold scores (and the later test evaluation of this estimator) are
# reproducible across notebook runs.
rf_classifier = RandomForestClassifier(max_depth = 10, random_state = 42)
cross_val_score(rf_classifier, processed_train_data, train_labels, cv = 5, scoring='accuracy')

array([0.8375, 0.8325, 0.8395, 0.838 , 0.828 ])

In [29]:
# DecisionTreeClassifier with cross validation
# random_state makes the tree deterministic: scikit-learn permutes the
# candidate features at each split, so equally good splits can otherwise be
# resolved differently from run to run.
dc_classifier = DecisionTreeClassifier(max_depth=10, random_state=42)
cross_val_score(dc_classifier, processed_train_data, train_labels, cv = 5, scoring='accuracy')

array([0.762 , 0.776 , 0.7695, 0.7725, 0.766 ])

In [30]:
# SVM classifier (degree-3 polynomial kernel) scored by 5-fold
# cross-validated accuracy on the preprocessed training data.
sv_classifier = SVC(
    kernel='poly',
    degree=3,
    coef0=2,
    C=5,
)
sv_cv_scores = cross_val_score(
    sv_classifier,
    processed_train_data,
    train_labels,
    cv=5,
    scoring='accuracy',
)
sv_cv_scores

array([0.8635, 0.853 , 0.867 , 0.8505, 0.8515])

In [31]:
# process testing data: apply the already-fitted scaler + PCA pipeline to the
# test features (transform only, the pipeline is not re-fitted here)
processed_test_data = preprocessing_pipeline.transform(test_features)

For the final evaluation I trained the classifiers with the chosen parameters on the whole training data and measured their accuracy, macro precision, macro recall and confusion matrix on the test data.
The best results were produced by the SVM classifier, with very high performance on class 1 and class 3.

In [37]:
# Final evaluation of logistic regression: fit on the full preprocessed
# training set, predict the test set, then report the metrics.
lr_classifier.fit(processed_train_data, train_labels)
lr_test_y = lr_classifier.predict(processed_test_data)

lr_test_acc = accuracy_score(test_labels, lr_test_y)
lr_test_precision = precision_score(test_labels, lr_test_y, average='macro')
lr_test_recall = recall_score(test_labels, lr_test_y, average='macro')

print(f"acc = {lr_test_acc}")
print(f"precision = {lr_test_precision}")
print(f"recall = {lr_test_recall}")
print(confusion_matrix(test_labels, lr_test_y))

acc = 0.8122
precision = 0.8104489697732291
recall = 0.8122
[[810   3  12  63 112]
 [  5 953  10  27   5]
 [ 23   4 811  18 144]
 [ 32  16   8 882  62]
 [170   6 165  54 605]]


In [36]:
# Final evaluation of the SVM: fit on the full preprocessed training set,
# predict the test set, then report the metrics.
sv_classifier.fit(processed_train_data, train_labels)
sv_test_y = sv_classifier.predict(processed_test_data)

sv_test_acc = accuracy_score(test_labels, sv_test_y)
sv_test_precision = precision_score(test_labels, sv_test_y, average='macro')
sv_test_recall = recall_score(test_labels, sv_test_y, average='macro')

print(f"acc = {sv_test_acc}")
print(f"precision = {sv_test_precision}")
print(f"recall = {sv_test_recall}")
print(confusion_matrix(test_labels, sv_test_y))

acc = 0.8448
precision = 0.8438225297590956
recall = 0.8448
[[820   6  14  35 125]
 [  5 966   2  22   5]
 [ 32   4 847  22  95]
 [ 35   7  10 915  33]
 [157   7 123  37 676]]


In [38]:
# Final evaluation of the random forest: fit on the full preprocessed
# training set, predict the test set, then report the metrics.
rf_classifier.fit(processed_train_data, train_labels)
rf_test_y = rf_classifier.predict(processed_test_data)

rf_test_acc = accuracy_score(test_labels, rf_test_y)
rf_test_precision = precision_score(test_labels, rf_test_y, average='macro')
rf_test_recall = recall_score(test_labels, rf_test_y, average='macro')

print(f"acc = {rf_test_acc}")
print(f"precision = {rf_test_precision}")
print(f"recall = {rf_test_recall}")
print(confusion_matrix(test_labels, rf_test_y))

acc = 0.8278
precision = 0.8267267677707185
recall = 0.8278000000000001
[[813   1  13  67 106]
 [  3 936   8  45   8]
 [ 15   0 861  23 101]
 [ 30   9   6 913  42]
 [177   0 160  47 616]]


In [39]:
# Final evaluation of the decision tree: fit on the full preprocessed
# training set, predict the test set, then report the metrics.
dc_classifier.fit(processed_train_data, train_labels)
dc_test_y = dc_classifier.predict(processed_test_data)

dc_test_acc = accuracy_score(test_labels, dc_test_y)
dc_test_precision = precision_score(test_labels, dc_test_y, average='macro')
dc_test_recall = recall_score(test_labels, dc_test_y, average='macro')

print(f"acc = {dc_test_acc}")
print(f"precision = {dc_test_precision}")
print(f"recall = {dc_test_recall}")
print(confusion_matrix(test_labels, dc_test_y))

acc = 0.7684
precision = 0.7702856724341037
recall = 0.7684
[[725   5  26  67 177]
 [ 12 906  16  48  18]
 [ 31   6 756  42 165]
 [ 37  29  21 853  60]
 [164  10 159  65 602]]


## Task 2 - Keras for image classification

For this task I implemented a simple network with two dense layers (each followed by a dropout layer) plus a softmax layer at the end. I got the best results using ReLU as the activation function for both dense layers; the hyperbolic tangent gave worse results.
To avoid overfitting I chose to use dropout; without it the net was likely to overfit after epoch 20.
The reason I have not moved to a more advanced network (CNN) is that I am still in the process of deepening my Keras knowledge, and even this network outperformed the best ML classifier, with 86% accuracy versus 84% for the polynomial-kernel SVM.

In [52]:
# Rescale the inputs by dividing by 255.0 (maps to 0-1 assuming 8-bit pixel
# values - TODO confirm the dtype/range of the stored arrays).
train_featues_scaled = train_featues / 255.0
test_features_scaled = test_features / 255.0

# Keep the first 7000 training rows for fitting and hold out the remaining
# rows as a development set.
dev_start = 7000
train_featues_scaled_train, train_featues_scaled_dev = (
    train_featues_scaled[:dev_start, :],
    train_featues_scaled[dev_start:, :],
)
train_labels_train, train_labels_dev = (
    train_labels[:dev_start],
    train_labels[dev_start:],
)

In [54]:
import tensorflow as tf
from tensorflow import keras

In [99]:
# Fully connected network built layer by layer:
# 784 inputs -> Dense(300, relu) -> Dropout(0.1) -> Dense(100, relu)
# -> Dropout(0.1) -> Dense(5, softmax)
model = keras.models.Sequential()
model.add(keras.layers.Dense(300, activation="relu", input_shape=(784,)))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(5, activation="softmax"))

In [105]:
# Compile with sparse categorical cross-entropy (labels are passed as class
# indices, not one-hot vectors), plain SGD and accuracy as the tracked metric.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 300)               235500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 100)               30100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 5)                 505       
Total params: 266,105
Trainable params: 266,105
Non-trainable params: 0
_________________________________________________________________


In [106]:
# Train for 30 epochs on the training split, scoring the held-out development
# split after every epoch; per-epoch logging is silenced.
history = model.fit(
    train_featues_scaled_train,
    train_labels_train,
    epochs=30,
    validation_data=(train_featues_scaled_dev, train_labels_dev),
    verbose=0,
)

In [107]:
# Fit for 30 epochs on the complete training set (no validation data this time).
# NOTE(review): `model` was already fitted in the previous cell and is not
# re-created in between, so this presumably continues from those weights
# rather than training from scratch - confirm that is intended.
# NOTE(review): this rebinds `history`, discarding the History object returned
# by the previous fit (the one holding the validation curves).
history = model.fit(train_featues_scaled, train_labels, epochs = 30, verbose = 0)

In [108]:
# Evaluate the trained network on the scaled test features.
# NOTE(review): the model was compiled with metrics=["accuracy"], so `acc`
# presumably holds [loss, accuracy] rather than a bare accuracy value -
# verify before reporting it.
acc = model.evaluate(test_features_scaled, test_labels, verbose = 0)