In [1]:
import os
import cv2
import pickle
import numpy as np
import pdb
import requests
from collections import defaultdict
import random 
import time

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import *

from functools import wraps
from time import time as _timenow 
from sys import stderr


## Load CIFAR-10 Data

In [2]:
def load_cifar():
    
    trn_data, trn_labels, tst_data, tst_labels = [], [], [], []
    def unpickle(file):
        with open(file, 'rb') as fo:
            data = pickle.load(fo, encoding='latin1')
        return data
    
    for i in trange(1):
        batchName = './data/data_batch_{0}'.format(i + 1)
        unpickled = unpickle(batchName)
        trn_data.extend(unpickled['data'])
        trn_labels.extend(unpickled['labels'])
    unpickled = unpickle('./data/test_batch')
    tst_data.extend(unpickled['data'])
    tst_labels.extend(unpickled['labels'])
    return np.array(trn_data), np.array(trn_labels), np.array(tst_data), np.array(tst_labels)




## Image preprocessing

In [3]:
def image_prep(train_data, mean_img):
    ''' pre-processes the given image
        performs mean normalization and other such operations'''
    train_data = train_data - mean_img
    train_data = train_data.astype(np.float)
    train_data = train_data * (1 / 255.0)
    return train_data

## Dimensionality reduction using PCA

In [4]:


def reduce_dim(**kwargs):
    ''' performs dimensionality reduction'''
    if kwargs['method'] == 'pca':
        pca = PCA(n_components=kwargs['n_components'])
        transformed_data = pca.fit_transform(kwargs['data'])
        return transformed_data, pca
    if kwargs['method'] == 'lda':
        lda = LDA(n_components=kwargs['n_components'])
        transformed_data = lda.fit_transform(kwargs['data'], kwargs['labels'])
        return transformed_data, lda

        

## Classification using kernel SVM

In [39]:
def classify(X, y, X_val, y_val, **kwargs):
    ''' trains a classifier by taking input features
        and their respective targets and returns the trained model'''
    if kwargs['method'] == 'SVM':
        model = SVC(C=kwargs['C'], kernel='linear')
    elif kwargs['method'] == 'SVM RBF':
        model = SVC(C=kwargs['C'], kernel='rbf', gamma=kwargs['gamma'])
    elif kwargs['method'] == 'DT':
        model = DTree(criterion=kwargs['criterion'], max_depth=kwargs['max_depth'])
    elif kwargs['method'] == 'LogisticRegression':
        model = LogisticRegression()
    else:
        raise ValueError("Not a valid method")
    model.fit(X, y)
    train_acc = model.score(X, y)
    val_acc = model.score(X_val, y_val)
    return model, train_acc, val_acc


## Evaluation 

In [6]:
def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='micro')
    acc = accuracy_score(target, predicted)
    return f1, acc

In [7]:
def test(X, y, model):
    '''takes test data and trained classifier model,
    performs classification and prints accuracy and f1-score'''
    prediction = model.predict(X)
    f1, acc = evaluate(y, prediction)
    return f1, acc

In [8]:
def load_and_format_data():
    trn_data, trn_labels, tst_data, tst_labels = load_cifar()
    mean_img = np.mean(trn_data, axis=0)
    trn_data, tst_data = image_prep(trn_data, mean_img), image_prep(tst_data, mean_img)
    X_train, X_val, y_train, y_val = train_test_split(trn_data, trn_labels, test_size = 0.20) 
    return X_train, y_train, X_val, y_val, tst_data, tst_labels


In [9]:
X_train, y_train, X_val, y_val, X_test, y_test = load_and_format_data()

100%|██████████| 1/1 [00:00<00:00,  1.24it/s]


# Experiments:

We will now try and determine the best possible combination of features as well as models with various hyperparameters for obtaining the best test accuracy on the dataset. We will try the possible combinations below along with a discussion on overfitting and the various results based on varying the hyperparameters/features in each case. The best parameters based on validation scores are selected for obtainging the test accuracy. A summarized table will be presented at the end of the experiments.

### With Logistic Regression based classifier and raw data

In [64]:
model_params = {'method': "LogisticRegression"}
model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
print("Logistic Regression + raw data:  train={}, val={}".format(train_acc, val_acc))

Logistic Regression + raw data:  train=0.7045, val=0.3195


**The model overfits on the raw data when no reduction is done.**

**Test Accuracy:**

In [65]:
f1score, accuracy = test(X_test, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.3308   Accuracy: 0.3308


### With Logistic Regression based classifier and LDA

In [67]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
model_params = {'method': "LogisticRegression"}

n_components = [5, 9]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
    print("Logistic Regression + LDA: n_components={}, train={}, val={}".format(nc, train_acc, val_acc))



Logistic Regression + LDA: n_components=5, train=0.653375, val=0.2125
Logistic Regression + LDA: n_components=9, train=0.8515, val=0.2185


**Model seems to overfit for LDA. This is because the reduction in LDA can have a maximum of num_classes-1 as the reduced dimension, for a small number of classes, this is not sufficient to capture the proper structure of the data and hence this leads to overfitting.**

**The Test Accuracy for the model is:**

In [68]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 9
X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))



Test - F1 score: 0.2374   Accuracy: 0.2374


### With Logistic Regression based classifier and PCA

In [50]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
model_params = {'method': "LogisticRegression"}

n_components = [50, 100, 250, 500, 1000]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
    print("Logistic Regression + PCA: n_components={}, train={}, val={}".format(nc, train_acc, val_acc))

Logistic Regression + PCA: n_components=50, train=0.38725, val=0.3725
Logistic Regression + PCA: n_components=100, train=0.424, val=0.3875
Logistic Regression + PCA: n_components=250, train=0.4715, val=0.3725
Logistic Regression + PCA: n_components=500, train=0.5315, val=0.353
Logistic Regression + PCA: n_components=1000, train=0.630125, val=0.326


**With less components, we see that the model trains relatively well on both training and validation data. As the number of components increase, we can see that the model starts overfitting on training data. 100 components seem to be the best with relatively close training and validation scores.**

**The Test Accuracy for the model is:**

In [63]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 100
X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.3818   Accuracy: 0.3818


### With Soft Linear SVM and raw data

In [10]:
model_params = {'method': "SVM"}
C = [1e-4, 1e-2, 0.1, 1, 10]

for c in C:
    model_params['C'] = c
    model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
    print("Soft Linear SVM + raw data: C={}, train={}, val={}".format(c, train_acc, val_acc))

Soft Linear SVM + raw data: C=0.0001, train=0.3065, val=0.2735
Soft Linear SVM + raw data: C=0.01, train=0.515125, val=0.37
Soft Linear SVM + raw data: C=0.1, train=0.6925, val=0.3315
Soft Linear SVM + raw data: C=1, train=0.939375, val=0.289
Soft Linear SVM + raw data: C=10, train=0.999875, val=0.2875


**The penalty parameter determines the width of the margin, a larger margin allows more misclassification while training, this causes the model to perform well on the training data, but not on the testing/validation data. A good value of C would be one where training and validation accuracy is in the same region.**

**Test Accuracy:**

In [13]:
model_params['C'] = 1e-4
model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
f1score, accuracy = test(X_test, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.2123   Accuracy: 0.2123


### With Soft Linear SVM and LDA

In [16]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
model_params = {'method': "SVM"}

n_components = [5, 9]
C = [1e-4, 1e-2, 0.1, 1, 10]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for c in C:
        model_params['C'] = c
        model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
        print("Soft SVM + LDA: n_components={}, C={}, train={}, val={}".format(nc, c, train_acc, val_acc))

Soft SVM + LDA: n_components=5, C=0.0001, train=0.1125, val=0.105
Soft SVM + LDA: n_components=5, C=0.01, train=0.6415, val=0.207
Soft SVM + LDA: n_components=5, C=0.1, train=0.6495, val=0.212
Soft SVM + LDA: n_components=5, C=1, train=0.66, val=0.218
Soft SVM + LDA: n_components=5, C=10, train=0.648, val=0.2065
Soft SVM + LDA: n_components=9, C=0.0001, train=0.111, val=0.102
Soft SVM + LDA: n_components=9, C=0.01, train=0.871, val=0.221
Soft SVM + LDA: n_components=9, C=0.1, train=0.8775, val=0.216
Soft SVM + LDA: n_components=9, C=1, train=0.8535, val=0.2305
Soft SVM + LDA: n_components=9, C=10, train=0.867, val=0.2195


**Model seems to overfit for LDA except for a very small margin. This is because the reduction in LDA can have a maximum of num_classes-1 as the reduced dimension, for a small number of classes, this is not sufficient to capture the proper structure of large dimensional data such as images and hence this leads to overfitting.**

**The Test Accuracy for the model is:**

In [18]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 5
X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

model_params = {'method': "SVM", 'C': 1e-4}
X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.2378   Accuracy: 0.2378


### With Soft Linear SVM and PCA

In [20]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
model_params = {'method': "SVM"}

n_components = [50, 100, 250, 500, 1000]
C = [1e-4, 1e-2, 0.1, 1, 10]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for c in C:
        model_params['C'] = c
        model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
        print("Soft SVM + PCA: n_components={}, C={}, train={}, val={}".format(nc, c, train_acc, val_acc))

Soft SVM + PCA: n_components=50, C=0.0001, train=0.2125, val=0.191
Soft SVM + PCA: n_components=50, C=0.01, train=0.4465, val=0.329
Soft SVM + PCA: n_components=50, C=0.1, train=0.4955, val=0.304
Soft SVM + PCA: n_components=50, C=1, train=0.5125, val=0.314
Soft SVM + PCA: n_components=50, C=10, train=0.479, val=0.2895
Soft SVM + PCA: n_components=100, C=0.0001, train=0.2215, val=0.181
Soft SVM + PCA: n_components=100, C=0.01, train=0.49, val=0.3315
Soft SVM + PCA: n_components=100, C=0.1, train=0.6255, val=0.3135
Soft SVM + PCA: n_components=100, C=1, train=0.7125, val=0.2945
Soft SVM + PCA: n_components=100, C=10, train=0.736, val=0.278
Soft SVM + PCA: n_components=250, C=0.0001, train=0.184, val=0.171
Soft SVM + PCA: n_components=250, C=0.01, train=0.5745, val=0.3295
Soft SVM + PCA: n_components=250, C=0.1, train=0.7935, val=0.298
Soft SVM + PCA: n_components=250, C=1, train=0.977, val=0.2835
Soft SVM + PCA: n_components=250, C=10, train=1.0, val=0.2665
Soft SVM + PCA: n_components=

**With less components, we see that the model trains relatively well on both training and validation data. As the number of components increase, we can see that the model starts overfitting on training data. 100 components seem to be the best with relatively close training and validation scores. Overfitting can also be seen when larger margins are present in the model. A margin with penalty 0.01 seems to be doing relatively well across all cases.**

**The Test Accuracy for the model is:**

In [21]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 100
model_params = {'method': "SVM", 'C': 0.01}

X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.3494   Accuracy: 0.3494


### With SVM RBF and raw data

In [24]:
model_params = {'method': "SVM RBF"}
C = [1e-2, 0.1, 1]
gamma = [3e-4, 1e-1, 1, 10]

for c in C:
    for g in gamma:
        model_params['C'] = c
        model_params['gamma'] = g
        model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
        print("SVM RBF + raw data: C={}, gamma={}, train={}, val={}".format(c, g, train_acc, val_acc))

SVM RBF + raw data: C=0.01, gamma=0.0003, train=0.1095, val=0.1015
SVM RBF + raw data: C=0.1, gamma=0.0003, train=0.161, val=0.1295
SVM RBF + raw data: C=1, gamma=0.0003, train=0.3635, val=0.29
SVM RBF + raw data: C=0.01, gamma=0.1, train=0.1095, val=0.091
SVM RBF + raw data: C=0.01, gamma=1, train=0.1075, val=0.0985
SVM RBF + raw data: C=0.01, gamma=10, train=0.223, val=0.105
SVM RBF + raw data: C=0.1, gamma=0.1, train=0.113, val=0.105
SVM RBF + raw data: C=0.1, gamma=1, train=0.213, val=0.167
SVM RBF + raw data: C=0.1, gamma=10, train=0.118, val=0.1015
SVM RBF + raw data: C=1, gamma=0.1, train=1.0, val=0.102
SVM RBF + raw data: C=1, gamma=1, train=1.0, val=0.0925
SVM RBF + raw data: C=1, gamma=10, train=1.0, val=0.1015


**The penalty parameter determines the width of the margin, a larger margin allows more misclassification while training, this causes the model to perform well on the training data, but not on the testing/validation data. A good value of C would be one where training and validation accuracy is in the same region. The value of gamma determines the deviation of the RBF kernel. A good value of gamma would also be one where training and validation accuracy is in the same region.**

**Test Accuracy:**

In [25]:
model_params['C'] = 1
model_params['gamma'] = 3e-4
model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
f1score, accuracy = test(X_test, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.308   Accuracy: 0.308


### With SVM RBF and LDA

In [26]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
model_params = {'method': "SVM RBF"}

C = [1e-2, 0.1, 1]
gamma = [3e-4, 1e-1, 1]

n_components = [9]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for c in C:
        for g in gamma:
            model_params['C'] = c
            model_params['gamma'] = g
            model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
            print("SVM RBF + LDA: n_components={}, C={}, gamma={}, train={}, val={}".format(nc, c, g, train_acc, val_acc))

SVM RBF + LDA: n_components=9, C=0.01, gamma=0.0003, train=0.112, val=0.104
SVM RBF + LDA: n_components=9, C=0.01, gamma=0.1, train=0.171, val=0.102
SVM RBF + LDA: n_components=9, C=0.01, gamma=1, train=0.118, val=0.0925
SVM RBF + LDA: n_components=9, C=0.1, gamma=0.0003, train=0.1185, val=0.1015
SVM RBF + LDA: n_components=9, C=0.1, gamma=0.1, train=0.87, val=0.2195
SVM RBF + LDA: n_components=9, C=0.1, gamma=1, train=0.1065, val=0.1105
SVM RBF + LDA: n_components=9, C=1, gamma=0.0003, train=0.8475, val=0.2165
SVM RBF + LDA: n_components=9, C=1, gamma=0.1, train=0.9145, val=0.211
SVM RBF + LDA: n_components=9, C=1, gamma=1, train=1.0, val=0.127


**Model seems to overfit for LDA except for a very small margin or a small value of gamma. This is because the reduction in LDA can have a maximum of num_classes-1 as the reduced dimension, for a small number of classes, this is not sufficient to capture the proper structure of large dimensional data such as images and hence this leads to overfitting. A smaller gamma also seems to fit well for the low dimensional data.**

**The Test Accuracy for the model is:**

In [27]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 9
X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

model_params = {'method': "SVM RBF", 'gamma': 3e-4, 'C': 0.1}
X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.1413   Accuracy: 0.1413


### With SVM RBF and PCA

In [28]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
model_params = {'method': "SVM RBF"}

C = [1e-2, 0.1, 1]
gamma = [3e-4, 1e-1, 1]

n_components = [50, 100, 250, 500]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for c in C:
        for g in gamma:
            model_params['C'] = c
            model_params['gamma'] = g
            model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
            print("SVM RBF + PCA: n_components={}, C={}, gamma={}, train={}, val={}".format(nc, c, g, train_acc, val_acc))

SVM RBF + PCA: n_components=50, C=0.01, gamma=0.0003, train=0.1075, val=0.0985
SVM RBF + PCA: n_components=50, C=0.01, gamma=0.1, train=0.1165, val=0.0925
SVM RBF + PCA: n_components=50, C=0.01, gamma=1, train=0.115, val=0.0925
SVM RBF + PCA: n_components=50, C=0.1, gamma=0.0003, train=0.177, val=0.143
SVM RBF + PCA: n_components=50, C=0.1, gamma=0.1, train=0.111, val=0.1015
SVM RBF + PCA: n_components=50, C=0.1, gamma=1, train=0.106, val=0.1015
SVM RBF + PCA: n_components=50, C=1, gamma=0.0003, train=0.3525, val=0.297
SVM RBF + PCA: n_components=50, C=1, gamma=0.1, train=1.0, val=0.1525
SVM RBF + PCA: n_components=50, C=1, gamma=1, train=1.0, val=0.1
SVM RBF + PCA: n_components=100, C=0.01, gamma=0.0003, train=0.11, val=0.091
SVM RBF + PCA: n_components=100, C=0.01, gamma=0.1, train=0.115, val=0.0985
SVM RBF + PCA: n_components=100, C=0.01, gamma=1, train=0.1095, val=0.091
SVM RBF + PCA: n_components=100, C=0.1, gamma=0.0003, train=0.182, val=0.1515
SVM RBF + PCA: n_components=100, C=

**With less components, we see that the model trains relatively well on both training and validation data. As the number of components increase, we can see that the model starts overfitting on training data. 50 components seem to be the best with relatively close training and validation scores. Overfitting can also be seen when larger margins are present in the model. A combination of gamma=0.0003 and C=1 seems to be working the best in all cases.**

**The Test Accuracy for the model is:**

In [29]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 50
model_params = {'method': "SVM RBF", 'C': 1, 'gamma':0.0003}

X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.3077   Accuracy: 0.3077


### With DTree and raw data

In [32]:
model_params = {'method': "DT", 'criterion': 'gini'}
max_depth = [2, 5, 10, 50, 100]

for m in max_depth:
    model_params['max_depth'] = m
    model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
    print("DTree + raw data: max_depth={}, train={}, val={}".format(m, train_acc, val_acc))

DTree + raw data: max_depth=2, train=0.20025, val=0.181
DTree + raw data: max_depth=5, train=0.300375, val=0.2445
DTree + raw data: max_depth=10, train=0.587375, val=0.2475
DTree + raw data: max_depth=50, train=1.0, val=0.236
DTree + raw data: max_depth=100, train=1.0, val=0.237



**Max Depth determines the maximum depth of the decision tree, as the depth of the tree increases, we see that more and more training samples are accurately classified, however this also leads to overfitting as the each node contains fewer samples and it is less generalized. A depth of 5 seems to be working well for the raw data.**

**Test Accuracy:**

In [33]:
model_params['max_depth'] = 5

model, train_acc, val_acc = classify(X_train, y_train, X_val, y_val, **model_params)
f1score, accuracy = test(X_test, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.2678   Accuracy: 0.2678


### With DTree and LDA

In [34]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
model_params = {'method': "DT", 'criterion': 'gini'}

max_depth = [2, 5, 10, 50, 100]
n_components = [5, 9]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for m in max_depth:
        model_params['max_depth'] = m
        model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
        print("DTree + LDA: n_components={}, max_depth={}, train={}, val={}".format(nc, m, train_acc, val_acc))

DTree + LDA: n_components=5, max_depth=2, train=0.337375, val=0.15
DTree + LDA: n_components=5, max_depth=5, train=0.575875, val=0.195
DTree + LDA: n_components=5, max_depth=10, train=0.714125, val=0.2035
DTree + LDA: n_components=5, max_depth=50, train=1.0, val=0.214
DTree + LDA: n_components=5, max_depth=100, train=1.0, val=0.2105
DTree + LDA: n_components=9, max_depth=2, train=0.337375, val=0.15
DTree + LDA: n_components=9, max_depth=5, train=0.7425, val=0.2
DTree + LDA: n_components=9, max_depth=10, train=0.863625, val=0.191
DTree + LDA: n_components=9, max_depth=50, train=1.0, val=0.189
DTree + LDA: n_components=9, max_depth=100, train=1.0, val=0.186


**Model seems to overfit for LDA except for trees with small depth. This is because the reduction in LDA can have a maximum of num_classes-1 as the reduced dimension, for a small number of classes, this is not sufficient to capture the proper structure of large dimensional data such as images and hence this leads to overfitting. Smaller trees also seems to fit well for the low dimensional data.**

**The Test Accuracy for the model is:**

In [35]:
reduction_params = {'method': "lda", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 9
X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

model_params = {'method': "DT", 'max_depth': 2, 'criterion': 'gini'}
X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.2032   Accuracy: 0.2032


### With DTree and PCA

In [37]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
model_params = {'method': "DT", 'criterion': 'gini'}

max_depth = [2, 5, 10, 50, 100]

n_components = [50, 100, 250, 500]

for nc in n_components:
    reduction_params['n_components'] = nc
    X_reduced, fmodel = reduce_dim(**reduction_params)
    X_val_reduced = fmodel.transform(X_val)
    for m in max_depth:
        model_params['max_depth'] = m
        model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)
        print("DTree + PCA: n_components={}, max_depth={}, train={}, val={}".format(nc, m, train_acc, val_acc))

DTree + PCA: n_components=50, max_depth=2, train=0.19525, val=0.1835
DTree + PCA: n_components=50, max_depth=5, train=0.289, val=0.242
DTree + PCA: n_components=50, max_depth=10, train=0.550125, val=0.2655
DTree + PCA: n_components=50, max_depth=50, train=1.0, val=0.237
DTree + PCA: n_components=50, max_depth=100, train=1.0, val=0.231
DTree + PCA: n_components=100, max_depth=2, train=0.19525, val=0.1835
DTree + PCA: n_components=100, max_depth=5, train=0.289375, val=0.2425
DTree + PCA: n_components=100, max_depth=10, train=0.5605, val=0.253
DTree + PCA: n_components=100, max_depth=50, train=1.0, val=0.226
DTree + PCA: n_components=100, max_depth=100, train=1.0, val=0.2315
DTree + PCA: n_components=250, max_depth=2, train=0.19525, val=0.1835
DTree + PCA: n_components=250, max_depth=5, train=0.289125, val=0.2405
DTree + PCA: n_components=250, max_depth=10, train=0.56175, val=0.259
DTree + PCA: n_components=250, max_depth=50, train=1.0, val=0.2225
DTree + PCA: n_components=250, max_depth=

**Model seems to be training well when number of components increase, which will always be the case because features are independent of each other in DTrees. The more features, the better the model seems to perform. The model starts overfitting at larger depths. 5 seems to be a good depth without causing overfitting.**

**The Test Accuracy for the model is:**

In [38]:
reduction_params = {'method': "pca", "data": X_train, "labels": y_train}
reduction_params['n_components'] = 500
model_params = {'method': "DT", 'criterion': 'gini', 'max_depth': 5}

X_reduced, fmodel = reduce_dim(**reduction_params)
X_val_reduced = fmodel.transform(X_val)
model, train_acc, val_acc = classify(X_reduced, y_train, X_val_reduced, y_val, **model_params)

X_test_reduced = fmodel.transform(X_test)
f1score, accuracy = test(X_test_reduced, y_test, model)
print('Test - F1 score: {}   Accuracy: {}'.format(f1score, accuracy))

Test - F1 score: 0.2586   Accuracy: 0.2586


# Strategies to avoid overfitting

We see in many of the cases, the model overfits the training data and has a poor validation/test accuracy. There are a few ways in which this can be avoided:

- Use larger training datasets. More variety in the training data causes models to learn underlying representations of the data rather than find patterns in the training set. This is true for any model that learns parameters for classification(MLP/SVM) or uses the training data for classifying the test set(KNN).
- Use models that generalise well. The more generic the model, the better is it's ability to learn the structure of the data. Simple models tend to overfit on complex data relatively easily.
- Use regularization. Regularization causes the model parameters to be constrained and hence forces the model to learn much simpler solutions, rather than complex patterns that will overfit the training data.
- Use data augmentation to increase the variety of samples in the dataset. Rotating images, scaling, cropping all can be used to create larger datasets with more variety for each class, thus avoiding overfitting.



# Summary

The following section contains the summarized results for the different experiments along with the best parameters and test accuracy for each of them. The discussion of the experiment and the reason as to why those parameters were chosen were previously discussed in the subsequent experiment section.

### Logistic Regression parameter selection

| Feature | Reduction Parameters | Model Parameters | Training | Validation |
|---------|----------------------|------------------|----------|------------|
| Raw     | None                 | None             | 0.7045   | 0.3195     |
| LDA     | n_components=5       | None             | 0.653375 | 0.2125     |
| LDA     | n_components=5       | None             | 0.8515   | 0.2185     |
| PCA     | n_components=50      | None             | 0.38725  | 0.3725     |
| PCA     | n_components=10      | None             | 0.424    | 0.3875     |
| PCA     | n_components=25      | None             | 0.4715   | 0.3725     |
| PCA     | n_components=50      | None             | 0.5315   | 0.353      |
| PCA     | n_components=1000    | None             | 0.630125 | 0.326      |

### Logistic Regression test accuracy (using best parameters)

| Feature | Reduction Parameters | Model Parameters | F1 score | Accuracy |
|---------|----------------------|------------------|----------|----------|
| Raw     | None                 | None             | 0.3308   | 0.3308   |
| LDA     | n_components=9       | None             | 0.2374   | 0.2374   |
| PCA     | n_components=100     | None             | 0.3818   | 0.3818   |


### SVM Linear parameter selection

| Feature | Reduction Parameters | Model Parameters | Training | Validation |
|---------|----------------------|------------------|----------|------------|
| Raw     | None                 | C=0.0001         | 0.3065   | 0.2735     |
| Raw     | None                 | C=0.01           | 0.515125 | 0.37       |
| Raw     | None                 | C=0.1            | 0.6925   | 0.3315     |
| Raw     | None                 | C=1              | 0.939375 | 0.289      |
| Raw     | None                 | C=10             | 0.999875 | 0.2875     |
| LDA     | n_components=5       | C=0.0001         | 0.1125   | 0.105      |
| LDA     | n_components=5       | C=0.01           | 0.6415   | 0.207      |
| LDA     | n_components=5       | C=0.1            | 0.6495   | 0.212      |
| LDA     | n_components=5       | C=1              | 0.66     | 0.218      |
| LDA     | n_components=5       | C=10             | 0.648    | 0.2065     |
| LDA     | n_components=9       | C=0.0001         | 0.111    | 0.102      |
| LDA     | n_components=9       | C=0.01           | 0.871    | 0.221      |
| LDA     | n_components=9       | C=0.1            | 0.8775   | 0.216      |
| LDA     | n_components=9       | C=1              | 0.8535   | 0.2305     |
| LDA     | n_components=9       | C=10             | 0.867    | 0.2195     |
| PCA     | n_components=50      | C=0.0001         | 0.2125   | 0.191      |
| PCA     | n_components=50      | C=0.01           | 0.4465   | 0.329      |
| PCA     | n_components=50      | C=0.1            | 0.4955   | 0.304      |
| PCA     | n_components=50      | C=1              | 0.5125   | 0.314      |
| PCA     | n_components=50      | C=10             | 0.479    | 0.2895     |
| PCA     | n_components=100     | C=0.0001         | 0.2215   | 0.181      |
| PCA     | n_components=100     | C=0.01           | 0.49     | 0.3315     |
| PCA     | n_components=100     | C=0.1            | 0.6255   | 0.3135     |
| PCA     | n_components=100     | C=1              | 0.7125   | 0.2945     |
| PCA     | n_components=100     | C=10             | 0.736    | 0.278      |
| PCA     | n_components=250     | C=0.0001         | 0.184    | 0.171      |
| PCA     | n_components=250     | C=0.01           | 0.5745   | 0.3295     |
| PCA     | n_components=250     | C=0.1            | 0.7935   | 0.298      |
| PCA     | n_components=250     | C=1              | 0.977    | 0.2835     |
| PCA     | n_components=250     | C=10             | 1.0      | 0.2665     |
| PCA     | n_components=500     | C=0.0001         | 0.1995   | 0.164      |
| PCA     | n_components=500     | C=0.01           | 0.603    | 0.334      |
| PCA     | n_components=500     | C=0.1            | 0.9135   | 0.316      |
| PCA     | n_components=500     | C=1              | 0.9995   | 0.2785     |
| PCA     | n_components=500     | C=10             | 1.0      | 0.272      |
| PCA     | n_components=1000    | C=0.0001         | 0.204    | 0.1595     |
| PCA     | n_components=1000    | C=0.01           | 0.5895   | 0.3275     |
| PCA     | n_components=1000    | C=0.1            | 0.925    | 0.285      |
| PCA     | n_components=1000    | C=1              | 1.0      | 0.298      |
| PCA     | n_components=1000    | C=10             | 1.0      | 0.291      |

### SVM Linear test accuracy (using best parameters)

| Feature | Reduction Parameters | Model Parameters | F1 score | Accuracy |
|---------|----------------------|------------------|----------|----------|
| Raw     | None                 | C=0.0001         | 0.2123   | 0.2123   |
| LDA     | n_components=5       | C=0.0001         | 0.2378   | 0.2378   |
| PCA     | n_components=100     | C=0.01           | 0.3494   | 0.3494   |

### SVM RBF parameter selection

| Feature | Reduction Parameters | Model Parameters     | Training | Validation |
|---------|----------------------|----------------------|----------|------------|
| Raw     | None                 | C=0.01; gamma=0.0003 | 0.1095   | 0.1015     |
| Raw     | None                 | C=0.1; gamma=0.0003  | 0.161    | 0.1295     |
| Raw     | None                 | C=1; gamma=0.0003    | 0.3635   | 0.29       |
| Raw     | None                 | C=0.01; gamma=0.1    | 0.1095   | 0.091      |
| Raw     | None                 | C=0.01; gamma=1      | 0.1075   | 0.0985     |
| Raw     | None                 | C=0.01; gamma=10     | 0.223    | 0.105      |
| Raw     | None                 | C=0.1; gamma=0.1     | 0.113    | 0.105      |
| Raw     | None                 | C=0.1; gamma=1       | 0.213    | 0.167      |
| Raw     | None                 | C=0.1; gamma=10      | 0.118    | 0.1015     |
| Raw     | None                 | C=1; gamma=0.1       | 1.0      | 0.102      |
| Raw     | None                 | C=1; gamma=1         | 1.0      | 0.0925     |
| Raw     | None                 | C=1; gamma=10        | 1.0      | 0.1015     |
| LDA     | n_components=9       | C=0.01; gamma=0.0003 | 0.112    | 0.104      |
| LDA     | n_components=9       | C=0.01; gamma=0.1    | 0.171    | 0.102      |
| LDA     | n_components=9       | C=0.01; gamma=1      | 0.118    | 0.0925     |
| LDA     | n_components=9       | C=0.1; gamma=0.0003  | 0.1185   | 0.1015     |
| LDA     | n_components=9       | C=0.1; gamma=0.1     | 0.87     | 0.2195     |
| LDA     | n_components=9       | C=0.1; gamma=1       | 0.1065   | 0.1105     |
| LDA     | n_components=9       | C=1; gamma=0.0003    | 0.8475   | 0.2165     |
| LDA     | n_components=9       | C=1; gamma=0.1       | 0.9145   | 0.211      |
| LDA     | n_components=9       | C=1; gamma=1         | 1.0      | 0.127      |
| PCA     | n_components=50      | C=0.01; gamma=0.0003 | 0.1075   | 0.0985     |
| PCA     | n_components=50      | C=0.01; gamma=0.1    | 0.1165   | 0.0925     |
| PCA     | n_components=50      | C=0.01; gamma=1      | 0.115    | 0.0925     |
| PCA     | n_components=50      | C=0.1; gamma=0.0003  | 0.177    | 0.143      |
| PCA     | n_components=50      | C=0.1; gamma=0.1     | 0.111    | 0.1015     |
| PCA     | n_components=50      | C=0.1; gamma=1       | 0.106    | 0.1015     |
| PCA     | n_components=50      | C=1; gamma=0.0003    | 0.3525   | 0.297      |
| PCA     | n_components=50      | C=1; gamma=0.1       | 1.0      | 0.1525     |
| PCA     | n_components=50      | C=1; gamma=1         | 1.0      | 0.1        |
| PCA     | n_components=100     | C=0.01; gamma=0.0003 | 0.11     | 0.091      |
| PCA     | n_components=100     | C=0.01; gamma=0.1    | 0.115    | 0.0985     |
| PCA     | n_components=100     | C=0.01; gamma=1      | 0.1095   | 0.091      |
| PCA     | n_components=100     | C=0.1; gamma=0.0003  | 0.182    | 0.1515     |
| PCA     | n_components=100     | C=0.1; gamma=0.1     | 0.1155   | 0.1015     |
| PCA     | n_components=100     | C=0.1; gamma=1       | 0.111    | 0.091      |
| PCA     | n_components=100     | C=1; gamma=0.0003    | 0.348    | 0.296      |
| PCA     | n_components=100     | C=1; gamma=0.1       | 1.0      | 0.1375     |
| PCA     | n_components=100     | C=1; gamma=1         | 1.0      | 0.111      |
| PCA     | n_components=250     | C=0.01; gamma=0.0003 | 0.1085   | 0.1105     |
| PCA     | n_components=250     | C=0.01; gamma=0.1    | 0.223    | 0.0995     |
| PCA     | n_components=250     | C=0.01; gamma=1      | 0.111    | 0.0925     |
| PCA     | n_components=250     | C=0.1; gamma=0.0003  | 0.1545   | 0.146      |
| PCA     | n_components=250     | C=0.1; gamma=0.1     | 0.1165   | 0.1015     |
| PCA     | n_components=250     | C=0.1; gamma=1       | 0.114    | 0.0925     |
| PCA     | n_components=250     | C=1; gamma=0.0003    | 0.3555   | 0.2895     |
| PCA     | n_components=250     | C=1; gamma=0.1       | 1.0      | 0.119      |
| PCA     | n_components=250     | C=1; gamma=1         | 1.0      | 0.1015     |
| PCA     | n_components=500     | C=0.01; gamma=0.0003 | 0.111    | 0.0925     |
| PCA     | n_components=500     | C=0.01; gamma=0.1    | 0.22     | 0.0985     |
| PCA     | n_components=500     | C=0.01; gamma=1      | 0.1115   | 0.0985     |
| PCA     | n_components=500     | C=0.1; gamma=0.0003  | 0.1465   | 0.1195     |
| PCA     | n_components=500     | C=0.1; gamma=0.1     | 0.109    | 0.104      |
| PCA     | n_components=500     | C=0.1; gamma=1       | 0.11     | 0.105      |
| PCA     | n_components=500     | C=1; gamma=0.0003    | 0.3745   | 0.286      |
| PCA     | n_components=500     | C=1; gamma=0.1       | 1.0      | 0.103      |
| PCA     | n_components=500     | C=1; gamma=1         | 1.0      | 0.1015     |

### SVM RBF test accuracy (using best parameters)

| Feature | Reduction Parameters | Model Parameters    | F1 score | Accuracy |
|---------|----------------------|---------------------|----------|----------|
| Raw     | None                 | C=1; gamma=0.0003   | 0.308    | 0.308    |
| LDA     | n_components=9       | C=0.1; gamma=0.0003 | 0.1413   | 0.1413   |
| PCA     | n_components=50      | C=1; gamma=0.0003   | 0.3077   | 0.3077   |


### DTree parameter selection

| Feature | Reduction Parameters | Model Parameters | Training | Validation |
|---------|----------------------|------------------|----------|------------|
| Raw     | None                 | max_depth=2      | 0.20025  | 0.181      |
| Raw     | None                 | max_depth=5      | 0.300375 | 0.2445     |
| Raw     | None                 | max_depth=10     | 0.587375 | 0.2475     |
| Raw     | None                 | max_depth=50     | 1.0      | 0.236      |
| Raw     | None                 | max_depth=100    | 1.0      | 0.237      |
| LDA     | n_components=5       | max_depth=2      | 0.337375 | 0.15       |
| LDA     | n_components=5       | max_depth=5      | 0.575875 | 0.195      |
| LDA     | n_components=5       | max_depth=10     | 0.714125 | 0.2035     |
| LDA     | n_components=5       | max_depth=50     | 1.0      | 0.214      |
| LDA     | n_components=5       | max_depth=100    | 1.0      | 0.2105     |
| LDA     | n_components=9       | max_depth=2      | 0.337375 | 0.15       |
| LDA     | n_components=9       | max_depth=5      | 0.7425   | 0.2        |
| LDA     | n_components=9       | max_depth=10     | 0.863625 | 0.191      |
| LDA     | n_components=9       | max_depth=50     | 1.0      | 0.189      |
| LDA     | n_components=9       | max_depth=100    | 1.0      | 0.186      |
| PCA     | n_components=50      | max_depth=2      | 0.19525  | 0.1835     |
| PCA     | n_components=50      | max_depth=5      | 0.289    | 0.242      |
| PCA     | n_components=50      | max_depth=10     | 0.550125 | 0.2655     |
| PCA     | n_components=50      | max_depth=50     | 1.0      | 0.237      |
| PCA     | n_components=50      | max_depth=100    | 1.0      | 0.231      |
| PCA     | n_components=100     | max_depth=2      | 0.19525  | 0.1835     |
| PCA     | n_components=100     | max_depth=5      | 0.289375 | 0.2425     |
| PCA     | n_components=100     | max_depth=10     | 0.5605   | 0.253      |
| PCA     | n_components=100     | max_depth=50     | 1.0      | 0.226      |
| PCA     | n_components=100     | max_depth=100    | 1.0      | 0.2315     |
| PCA     | n_components=250     | max_depth=2      | 0.19525  | 0.1835     |
| PCA     | n_components=250     | max_depth=5      | 0.289125 | 0.2405     |
| PCA     | n_components=250     | max_depth=10     | 0.56175  | 0.259      |
| PCA     | n_components=250     | max_depth=50     | 1.0      | 0.2225     |
| PCA     | n_components=250     | max_depth=100    | 1.0      | 0.218      |
| PCA     | n_components=500     | max_depth=2      | 0.19525  | 0.1835     |
| PCA     | n_components=500     | max_depth=5      | 0.289125 | 0.2405     |
| PCA     | n_components=500     | max_depth=10     | 0.571625 | 0.248      |
| PCA     | n_components=500     | max_depth=50     | 1.0      | 0.2155     |
| PCA     | n_components=500     | max_depth=100    | 1.0      | 0.2095     |

### DTree test accuracy (using best parameters)

| Feature | Reduction Parameters | Model Parameters | F1 score | Accuracy |
|---------|----------------------|------------------|----------|----------|
| Raw     | None                 | max_depth=5      | 0.2678   | 0.2678   |
| LDA     | n_components=9       | max_depth=2      | 0.2032   | 0.2032   |
| PCA     | n_components=500     | max_depth=5      | 0.2586   | 0.2586   |
