# Assignment 2
Group 7:
- Guillem Capellera
- Anna Oliveras
- Johnny Núñez

## Table of contents
* [1. Import libraries](#import) 
* [2. Read and visualize the train and test files](#read)
* [3. Data distribution](#distribution)
* [4. Descriptors](#descriptors)
* [5. Hyperparameters](#hyper)
* [6. Bag of Visual Words](#BoVW)
* [7. KNN Classifier](#classifier)
* [8. Dimensionality reduction](#dim)
* [9. Experiments](#exp)
* [10. Test data evaluation](#test)

In [None]:

import cv2
import numpy as np
import pickle

from sklearn.cluster import MiniBatchKMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, confusion_matrix

import matplotlib.pyplot as plt
from functools import partial
import time
import pandas as pd
from tqdm.notebook import trange, tqdm
import optuna
from optuna.visualization.matplotlib import plot_contour, plot_edf, plot_intermediate_values, plot_optimization_history, plot_parallel_coordinate, plot_param_importances, plot_slice, plot_pareto_front
import os
from optuna.samplers import TPESampler
import concurrent.futures
import gc
import seaborn as sns

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline


# 2. Read and visualize the train and test files <a name="read"></a>

In [None]:
# Load the pickled train/test splits (image paths and class labels).
# Every file is opened in a ``with`` block so the handle is closed right after
# it has been read.
# NOTE: unpickling can execute arbitrary code; only load trusted .dat files.
with open('MIT_split/train_images_filenames_unix.dat', 'rb') as _fh:
    train_images_filenames = pickle.load(_fh)
with open('MIT_split/test_images_filenames_unix.dat', 'rb') as _fh:
    test_images_filenames = pickle.load(_fh)
# train_images_filenames = ['..' + n[15:] for n in train_images_filenames] original
# test_images_filenames  = ['..' + n[15:] for n in test_images_filenames]  original
# Drop the first 16 characters of every stored path.
train_images_filenames = [n[16:] for n in train_images_filenames]
test_images_filenames = [n[16:] for n in test_images_filenames]
with open('MIT_split/train_labels_unix.dat', 'rb') as _fh:
    train_labels = pickle.load(_fh)
with open('MIT_split/test_labels_unix.dat', 'rb') as _fh:
    test_labels = pickle.load(_fh)


In [None]:
# Sanity check: display the path stored at index 12 of the training filenames
train_images_filenames[12]

In [None]:
# Function to visualize images of each class of the dataset
def visualize(images_filenames, labels, num_images=5):
    """Plot a grid with up to ``num_images`` sample images per class.

    Each class gets one column (titled with the class name) and each row
    holds one sample of that class.

    Params:
    - images_filenames: sequence of image paths readable by ``cv2.imread``.
    - labels: sequence of class labels aligned with ``images_filenames``.
    - num_images (int): maximum number of images shown per class.

    Raises:
    - FileNotFoundError: if one of the selected images cannot be read.
    """
    print(f'Number of samples: {len(images_filenames)}')
    # convert the labels once so every per-class comparison is vectorised
    labels_arr = np.array(labels)
    # get unique classes
    classes = np.unique(labels_arr)
    num_classes = len(classes)
    # set size for plot
    plt.figure(figsize=(15, 8))
    # loop over classes (one column per class)
    for col, c in enumerate(classes):
        # indices of the first ``num_images`` samples of this class
        idx = np.where(labels_arr == c)[0][:num_images]
        # loop over the images and plot them (one row per sample)
        for row, index in enumerate(idx):
            image = cv2.imread(images_filenames[index])
            if image is None:
                # cv2.imread signals an unreadable file by returning None
                raise FileNotFoundError(
                    f'Could not read image: {images_filenames[index]}')
            # subplot indices are 1-based and laid out row by row
            plt.subplot(num_images, num_classes, row * num_classes + col + 1)
            plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            plt.axis('off')
            if row == 0:
                plt.title(c)

In [None]:
# visualize the train dataset (sample images of every class, default 5 per class)
visualize(train_images_filenames, train_labels)

In [None]:
# visualize the test dataset (sample images of every class, default 5 per class)
visualize(test_images_filenames, test_labels)

# 3. Data distribution <a name="distribution"></a>

In [None]:
# Share of the training samples that belongs to each class
unique_classes, counts = np.unique(train_labels, return_counts=True)
total_count = counts.sum()
train_class_proportions = counts / total_count

# Same computation for the test split
unique_classes, counts = np.unique(test_labels, return_counts=True)
total_count = counts.sum()
test_class_proportions = counts / total_count

# Report both proportion vectors (entries follow the sorted class names)
print("Train set class proportions:", train_class_proportions)
print("Test set class proportions:", test_class_proportions)


In [None]:
# Sorted class names that appear in either split
unique_labels = np.unique(train_labels + test_labels)
# Number of samples of every class, separately for each split
train_count = [(np.asarray(train_labels) == lab).sum() for lab in unique_labels]
test_count = [(np.asarray(test_labels) == lab).sum() for lab in unique_labels]


# distribution of the training and test set
def plot_distribution(train_count, test_count, unique_labels):
    """Draw the per-class sample counts of both splits as bars on one axis.

    The test bars are drawn after (on top of) the training bars at the same
    x positions.
    """
    plt.figure(figsize=(8, 5))
    plt.title("Distribution of the training and test set")
    series = ((train_count, "Training Set"), (test_count, "Test Set"))
    for counts, legend_name in series:
        plt.bar(unique_labels, counts, label=legend_name)
    plt.legend()
    plt.show()


# Show the class distribution of both splits
plot_distribution(train_count, test_count, unique_labels)


### Answer:
To check if a dataset is unbalanced, we can calculate the proportion of each class in the dataset and compare the proportions. If the proportions of the classes are significantly different, then the dataset is likely to be unbalanced.

Based on the class proportions we have calculated, it looks like the train set and test set are both slightly **imbalanced**, but not **heavily imbalanced**.
In a perfectly balanced dataset, the class proportions would be the same for all classes and would be approximately equal to 1/number of classes. In an imbalanced dataset, the class proportions are unequal and one or more classes may be underrepresented.

There are a few ways to quantify the degree of imbalance in a dataset. One commonly used metric is the Gini index, which can be calculated using the following formula:

$Gini = 1 - \sum_i p_i^2$

where $p_i$ is the proportion of the i-th class in the dataset.

A dataset is considered imbalanced if the Gini index is greater than 0.5. Using this metric, we can calculate the Gini index for the train set and test set as follows:

Train set Gini index: 0.936
Test set Gini index: 0.912

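Both of these values are greater than 0.5, indicating that the train set and test set are both imbalanced. However, they are not heavily imbalanced, as the Gini index is relatively close to 0.5. Note, however, that with the formula above a perfectly balanced dataset with $K$ classes attains the maximum value $1 - 1/K$ (0.875 for the $K = 8$ classes used here), so a high Gini value on its own indicates class diversity rather than imbalance; the 0.5 threshold and the reported values should be re-checked against this bound.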

# First we execute all pipelines without any optimization

# 4. Descriptors <a name="descriptors"></a>

In [None]:
class KAZE:
    """ KAZE feature extractor """

    def __init__(self, threshold=0.001):
        # ``threshold`` is forwarded unchanged to the OpenCV factory
        self.extractor = cv2.KAZE_create(threshold=threshold)

    def extract_features(self, image):
        """ Detect keypoints on ``image`` (no mask) and return only the descriptors. """
        detection = self.extractor.detectAndCompute(image, None)
        return detection[1]


class AKAZE:
    """ AKAZE feature extractor """

    def __init__(self, threshold=0.001):
        # ``threshold`` is forwarded unchanged to the OpenCV factory
        self.extractor = cv2.AKAZE_create(threshold=threshold)

    def extract_features(self, image):
        """ Run detection and description in one call; the keypoints are discarded. """
        _, found_descriptors = self.extractor.detectAndCompute(image, None)
        return found_descriptors


class SIFT:
    """ SIFT feature extractor """

    def __init__(self, n_features=300):
        # ``n_features`` is passed to OpenCV as ``nfeatures``
        self.extractor = cv2.SIFT_create(nfeatures=n_features)

    def extract_features(self, image):
        """ Return the descriptors computed for the keypoints detected on ``image``. """
        result = self.extractor.detectAndCompute(image, None)
        descriptors = result[1]
        return descriptors


class DenseSIFT:
    """ SIFT descriptors computed on regular keypoint grids at several scales. """

    def __init__(self, n_features=300, step_size=10, patch_size=10):
        self.extractor = cv2.SIFT_create(nfeatures=n_features)
        # ``step_size`` divides the image side to obtain the base grid spacing
        self.step_div_size = step_size
        # ``patch_size`` is the number of grid scales that are generated
        self.num_sizes = patch_size

    def extract_features(self, image):
        """ Build the multi-scale keypoint grids and describe them with SIFT.

        The base spacing along each axis is ``side // step_div_size`` with a
        lower bound of 16 pixels; scale ``i`` uses ``i`` times that spacing,
        and the keypoint size is the mean of both spacings.
        """
        height, width = image.shape[0], image.shape[1]
        base_step_x = max(width // self.step_div_size, 16)
        base_step_y = max(height // self.step_div_size, 16)

        grid = []
        for scale in range(1, self.num_sizes + 1):
            step_x = base_step_x * scale
            step_y = base_step_y * scale
            size = (step_x + step_y) // 2
            # rows first, then columns
            for y in range(0, height, step_y):
                for x in range(0, width, step_x):
                    grid.append(cv2.KeyPoint(x, y, size))

        # compute() returns (keypoints, descriptors); only the latter is kept
        return self.extractor.compute(image, grid)[1]


class ORB:
    """ ORB feature extractor """

    def __init__(self, n_features=100):
        # ``n_features`` is passed to OpenCV as ``nfeatures``
        self.extractor = cv2.ORB_create(nfeatures=n_features)

    def extract_features(self, image):
        """ Detect and describe in one pass; only the descriptors are returned. """
        return self.extractor.detectAndCompute(image, None)[1]


class BRISK:
    """ BRISK feature extractor limited to the strongest keypoints.

    ``cv2.BRISK_create`` takes no ``nfeatures`` argument, so the cap given by
    ``n_features`` is applied here: after detection only the ``n_features``
    keypoints with the highest response are described.
    """

    def __init__(self, n_features=100):
        """
        Params:
        - n_features (int or None): maximum number of keypoints described per
          image; ``None`` keeps every detected keypoint.
        """
        self.extractor = cv2.BRISK_create()
        self.n_features = n_features

    def extract_features(self, image):
        """ Return the descriptors of (at most ``n_features``) keypoints of ``image``. """
        keypoints = self.extractor.detect(image, None)
        if self.n_features is not None and len(keypoints) > self.n_features:
            # keep the strongest detections first
            keypoints = sorted(
                keypoints, key=lambda kp: kp.response, reverse=True
            )[:self.n_features]
        # compute() returns (keypoints, descriptors); only the latter is kept
        _, descriptors = self.extractor.compute(image, keypoints)
        return descriptors


In [None]:
# Registry of the available descriptor classes, keyed by their class name
feature_extractors = {
    extractor_cls.__name__: extractor_cls
    for extractor_cls in (SIFT, DenseSIFT, KAZE, AKAZE, ORB, BRISK)
}


In [None]:
def extract_features(filenames, labels, descriptor_extractor, extract_features=True):
    """Read every image as grayscale and optionally compute its local descriptors.

    Params:
    - filenames: iterable of image paths readable by ``cv2.imread``.
    - labels: iterable of labels aligned with ``filenames``.
    - descriptor_extractor: object exposing ``extract_features(gray_image)``;
      only used when ``extract_features`` is true.
    - extract_features (bool): if true, return one descriptor set per image;
      otherwise return the grayscale images themselves.

    Returns:
    - ``(descriptors, labels)`` or ``(images, labels)``, both as lists with
      one entry per input file.

    Raises:
    - FileNotFoundError: if an image cannot be read.
    """
    descriptors = []
    images = []
    label_per_descriptor = []

    # ``label`` (singular) keeps the ``labels`` argument from being rebound
    for filename, label in zip(filenames, labels):
        ima = cv2.imread(filename)
        if ima is None:
            # cv2.imread signals an unreadable file by returning None
            raise FileNotFoundError(f"Could not read image: {filename}")
        gray = cv2.cvtColor(ima, cv2.COLOR_BGR2GRAY)

        if extract_features:
            # NOTE(review): an extractor may yield no descriptors for an
            # image; confirm that downstream stacking copes with that case.
            descriptors.append(descriptor_extractor.extract_features(gray))
        else:
            images.append(gray)

        label_per_descriptor.append(label)

    if extract_features:
        return descriptors, label_per_descriptor
    return images, label_per_descriptor

In [None]:
def cluster_local_features(features, n_clusters):
    """Fit and return a MiniBatchKMeans codebook with ``n_clusters`` centers on ``features``."""
    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        n_init='auto',
        batch_size=20 * n_clusters,  # mini-batch size scales with the codebook size
        reassignment_ratio=1e-4,
        compute_labels=False,
        verbose=False,
        random_state=42,  # fixed seed
    )
    kmeans.fit(features)
    return kmeans


In [None]:
def compute_histogram(assigned_clusters, num_clusters):
    """Build one L2-normalised visual-word histogram per image.

    Params:
    - assigned_clusters: sequence with one array of cluster indices per image.
    - num_clusters (int): codebook size, i.e. the number of histogram bins.

    Returns:
    - float32 array of shape ``(len(assigned_clusters), num_clusters)``.
    """
    # normalize() rejects an array without rows, so handle "no images" here
    if len(assigned_clusters) == 0:
        return np.zeros((0, num_clusters), dtype=np.float32)

    # one row of raw word counts per image
    counts = np.stack([
        np.histogram(words, bins=num_clusters, range=(0, num_clusters))[0]
        for words in assigned_clusters
    ])
    # a single normalisation over all rows instead of one call per image
    return normalize(counts, norm='l2').astype(np.float32)


def obtain_histogram_visual_words(features, tr_lengths=None, codebook=None):
    """Assign descriptors to visual words and return one histogram per image.

    Params:
    - features: either a list with one descriptor array per image (when
      ``tr_lengths`` is None) or the already stacked descriptors.
    - tr_lengths: number of descriptors of each image, in stacking order;
      computed from ``features`` when omitted.
    - codebook: fitted clustering model exposing ``predict`` and
      ``cluster_centers_``. It is required despite the ``None`` default.

    Raises:
    - ValueError: if no codebook is given.
    """
    if codebook is None:
        raise ValueError(
            "a fitted codebook is required to compute visual-word histograms")
    if tr_lengths is None:
        tr_lengths = [len(feature) for feature in features]
        features = np.vstack(features)

    assigned_labels = codebook.predict(features)
    # offsets[i]:offsets[i+1] is the slice of stacked rows belonging to image i
    offsets = np.cumsum([0, *tr_lengths])
    splitted_labels = [assigned_labels[start:stop]
                       for start, stop in zip(offsets[:-1], offsets[1:])]
    return compute_histogram(splitted_labels, codebook.cluster_centers_.shape[0])

In [None]:
# Define cross-validation functions
# Maps a short strategy name to the scikit-learn splitter class (not an instance)
cv_strategies = {
    "kfold": KFold,
    "stratified": StratifiedKFold,
    "repeats": RepeatedStratifiedKFold
}

# Maps a metric name to the scikit-learn function that computes it
metrics = {
    "balanced_accuracy": balanced_accuracy_score,
    "accuracy": accuracy_score,
    "f1-score": f1_score,
    "confusion-matrix": confusion_matrix
}


class BoVWClassifier(BaseEstimator, ClassifierMixin):
    """ Image classifier using Bag of Visual Words.

    Pipeline: per-image local descriptors -> codebook (clustering) ->
    visual-word histograms -> dimensionality reduction -> classifier.

    Params:
    - clustering_method (callable): receives the stacked descriptors and
      returns a fitted codebook exposing ``predict`` and ``cluster_centers_``.
    - classifier: object with ``fit``, ``predict`` and ``predict_proba``.
    - reduction_method: object with ``fit_transform(X, y)`` and ``transform(X)``.
    """

    def __init__(self, clustering_method, classifier, reduction_method):
        self.clustering_method = clustering_method
        self.classifier = classifier
        self.reduction_method = reduction_method
        # set by fit(); None means "not fitted yet"
        self.codebook = None

    def _reduced_histograms(self, features):
        """ Encode per-image descriptors as reduced visual-word histograms. """
        if self.codebook is None:
            raise RuntimeError("BoVWClassifier must be fitted before predicting")
        lengths = [len(feature) for feature in features]
        histograms = obtain_histogram_visual_words(
            np.vstack(features), lengths, self.codebook)
        return self.reduction_method.transform(histograms)

    def fit(self, features, labels, sample_weight=None):
        """ Learn the codebook, the reduction and the classifier.

        ``features`` holds one descriptor array per image. ``sample_weight``
        is accepted but not used. Returns ``self``.
        """
        tr_lengths = [len(feature) for feature in features]
        features = np.vstack(features)
        self.codebook = self.clustering_method(features)
        tr_hist = obtain_histogram_visual_words(
            features, tr_lengths, self.codebook)
        tr_hist_reduced = self.reduction_method.fit_transform(tr_hist, labels)
        self.classifier.fit(tr_hist_reduced, labels)
        return self

    def fit_transform(self, features, labels):
        """ Fit on ``features`` and return the predictions for those same samples. """
        self.fit(features, labels)
        return self.predict(features)

    def predict_proba(self, features):
        """ Return the classifier's ``predict_proba`` output for each image. """
        return self.classifier.predict_proba(self._reduced_histograms(features))

    def predict(self, features):
        """ Return the predicted label of each image. """
        return self.classifier.predict(self._reduced_histograms(features))

    def score(self, X, y=None):
        """ Return the NUMBER of samples in ``X`` whose prediction equals ``y``.

        This is a count, not a ratio; ``score_accuracy`` turns it into a
        percentage.

        Raises:
        - ValueError: if ``y`` is not given.
        """
        if y is None:
            raise ValueError("score() needs the ground-truth labels `y`")
        return int(np.sum(np.asarray(self.predict(X)) == np.asarray(y)))

    def score_accuracy(self, X, y):
        """ Return the percentage (0-100) of correctly predicted samples. """
        return 100*self.score(X, y)/len(y)


class FastCrossValidator:
    """ Cross-validates a trainer with a fixed splitter and metric. """

    def __init__(self, cv_method, metric_name, trainer, labels):
        """
        Params:
        - cv_method: forwarded as ``cv`` to ``cross_val_score``.
        - metric_name: forwarded as ``scoring`` to ``cross_val_score``.
        - trainer: estimator that gets cross-validated.
        - labels: stored as a NumPy array in ``self.labels``.
        """
        self.trainer = trainer
        self.metric_name = metric_name
        self.cv_method = cv_method
        self.labels = np.array(labels)

    def cross_validate(self, feature_list, labels, n_jobs=-1):
        """ Return the ``cross_val_score`` result for ``feature_list`` / ``labels``. """
        scores = cross_val_score(
            self.trainer,
            feature_list,
            y=labels,
            cv=self.cv_method,
            scoring=self.metric_name,
            n_jobs=n_jobs,
        )
        return scores


class Dummy():
    """ No-op dimensionality reduction: every feature is passed through untouched. """

    def transform(self, features):
        """ Return ``features`` as received. """
        return features

    def fit_transform(self, features, labels):
        """ Nothing is learned; ``labels`` is ignored and ``features`` is returned as received. """
        return self.transform(features)


# Classifier classes selectable by name
classifiers = {"KNN": KNeighborsClassifier}

# Dimensionality-reduction classes selectable by name
# ("None" maps to Dummy, which keeps the features unchanged)
dim_reduction = {
    "None": Dummy,
    "PCA": PCA,
    "LDA": LinearDiscriminantAnalysis,
}

In [None]:
# Best descriptor
# Instantiate DenseSIFT through the registry with the chosen hyperparameters
DESCRIPTOR = feature_extractors["DenseSIFT"](n_features=251, patch_size=3, step_size=75)
# One descriptor set and one label per image, for each split
train_descriptors, train_labels_descrip = extract_features(train_images_filenames, train_labels, DESCRIPTOR)
test_descriptors, test_labels_descrip = extract_features(test_images_filenames, test_labels, DESCRIPTOR)

In [None]:
# Best model
# Codebook builder with the number of clusters fixed to 1024
clustering = partial(cluster_local_features, n_clusters=1024)
# PCA keeping 46 components
dim_reduction_type = dim_reduction["PCA"](n_components=46)
# KNN with 18 neighbours and Euclidean distance, using 8 parallel jobs
classifier = classifiers["KNN"](n_neighbors=18, n_jobs=8, metric='euclidean')

In [None]:
# Train the model and compute the time
start = time.time()  # timestamp before training
ex_trainer = BoVWClassifier(clustering, classifier, dim_reduction_type)
ex_trainer.fit(train_descriptors, train_labels_descrip)
end = time.time()  # timestamp after training
print("Training time: ", end - start)  # elapsed seconds

In [None]:
# Test the model and compute the time
start = time.time()  # timestamp before prediction
predictions = ex_trainer.predict(test_descriptors)
end = time.time()  # timestamp after prediction
print("Testing time: ", end - start)  # elapsed seconds

In [None]:
# Define a function to compute the f1-score and accuracy for each class
def compute_metrics(truth, preds):
    """Return a DataFrame with one-vs-rest accuracy and F1 per class.

    A final "OVERALL" row holds the balanced accuracy and the weighted F1
    computed on the full multi-class labels.
    """
    class_names = np.unique(truth)
    truth, preds = np.array(truth), np.array(preds)

    # binary (this class vs. the rest) scores for every class
    rows = [
        (
            name,
            metrics["accuracy"](truth == name, preds == name),
            metrics["f1-score"](truth == name, preds == name),
        )
        for name in class_names
    ]

    # summary row over all classes
    rows.append((
        "OVERALL",
        metrics["balanced_accuracy"](truth, preds),
        metrics["f1-score"](truth, preds, average="weighted"),
    ))
    return pd.DataFrame(data=rows, columns=["label", "accuracy", "f1_score"])

In [None]:
# Evaluate the model using the f1-score, accuracy and confusion matrix
# (this call returns the per-class and overall accuracy / f1-score table)
compute_metrics(test_labels_descrip, predictions)

In [None]:
# Plot confusion matrix
# Row/column order of the matrix follows this list
labels = ["Opencountry", "coast", "forest", "mountain", "highway", "tallbuilding", "street", "inside_city"]
confusion = confusion_matrix(test_labels_descrip, predictions, labels=labels)

fig = plt.figure(figsize=(8, 6))
# fmt="d": the cells are integer counts; the default annotation format
# would render values of 100 or more in scientific notation (e.g. 1e+02)
g = sns.heatmap(confusion, cbar=True, annot=True, fmt="d", cmap="Blues")
g.set_title('Confusion matrix')

g.set_ylabel('Truth')
g.set_xlabel('Predicted')
g.set_yticklabels(labels, rotation=0)
g.set_xticklabels(labels, rotation=60)
# put the predicted-class ticks above the matrix
g.xaxis.tick_top()

plt.show()

In [None]:
# Plot fscore matrix
# Every cell divided by its column total (samples predicted as that class)
precision = confusion / confusion.sum(axis=0)
# Every cell divided by its row total (samples that truly belong to that class)
recall = confusion / confusion.sum(axis=1, keepdims=True)
# Cell-wise harmonic mean; NaN results (e.g. from 0/0) are replaced by 0
f_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

fig = plt.figure(figsize=(8, 6))
g = sns.heatmap(
    f_score,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    xticklabels=labels,
    yticklabels=labels,
)
g.set_title('F1-score matrix')

g.set_ylabel('Truth')
g.set_xlabel('Predicted')
g.set_yticklabels(labels, rotation=0)
g.set_xticklabels(labels, rotation=60)
g.xaxis.tick_top()

plt.show()

In [None]:
# Function to get the false positives inside a class
def get_false_positives(truth, preds, clas):
    """Return the indices of samples predicted as ``clas`` whose true label differs.

    The result is a one-element tuple holding the index array.
    """
    truth_arr = np.array(truth)
    pred_arr = np.array(preds)
    belongs_elsewhere = truth_arr != clas
    predicted_as_class = pred_arr == clas
    return np.nonzero(belongs_elsewhere & predicted_as_class)


In [None]:
# Function to plot all the false positives images of a class
def plot_false_positives(truth, preds, idxs, test_images_filenames):
    """Show every image listed in ``idxs[0]`` in a grid with 5 columns.

    Each title shows the sample index and its true label. ``preds`` is not
    used. Grid cells without an image are removed from the figure.
    """
    sample_ids = idxs[0]
    total = len(sample_ids)
    COLS = 5
    # at least two rows
    rows = max(2, total // COLS + 1)
    fig, axs = plt.subplots(rows, COLS, figsize=(30, 3 * rows))

    # axs.flat walks the grid row by row
    for i, ax in enumerate(axs.flat):
        if i >= total:
            fig.delaxes(ax)
            continue
        sample = sample_ids[i]
        img = cv2.imread(test_images_filenames[sample])
        ax.set_title(f"{sample}\n{truth[sample]}")
        ax.axis("off")
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

In [None]:
# For every class, show the test images that were wrongly predicted as that class
for lab in labels:
    print(f"> Wrongly predicted as {lab}\n")
    plot_false_positives(test_labels_descrip, predictions, get_false_positives(test_labels_descrip, predictions, lab), test_images_filenames)