In [2]:
# Point 5: Deeper analysis of the data

import pandas as pd
import numpy as np

top_2_array = set()
top_5_array = set()
top_10_array = set()

bottom_2_array = set()
bottom_5_array = set()
bottom_10_array = set()

data = pd.read_csv('data/x_train_gr_smpl.csv')
for i in range(10):
    data = data.reindex(np.arange(data.shape[0]))
    labels = pd.read_csv(f'data/y_train_smpl_{i}.csv')
    data['label'] = labels
    data = data.sample(frac=1)
    
    corr_label = data.drop("label", axis=1).apply(lambda x: x.corr(data.label))
    corr_label = [(index, abs(corr_val), i) for index, corr_val in enumerate(corr_label)]
    corr_label = sorted(corr_label, key=lambda tup: tup[1], reverse=True)  # Order by correlation value
            
    for i, tup in enumerate(corr_label[:10]):
        if i < 2:
            top_2_array.add(tup[0])
        if i < 5:
            top_5_array.add(tup[0])
        if i < 10:
            top_10_array.add(tup[0]) #, tup[2]))
            
    for i, tup in enumerate(corr_label[-10:]):
        if i < 2:
            bottom_2_array.add(tup[0])
        if i < 5:
            bottom_5_array.add(tup[0])
        if i < 10:
            bottom_10_array.add(tup[0])

In [3]:
# a = sorted([(int(round(x/48)), x % 48) for x in bottom_2_array])
# a = a.filter(lambda (x, y): x >10 or x < 37)
# print(a)
# b = sorted([x % 48 for x in bottom_5_array])
# print(b)
# c = sorted([(int(round(x[0]/48)), x[0] % 48, x[1]) for x in top_10_array])
# print(c)

In [4]:
# Point 6: Try to improve the classification

data = pd.read_csv('data/x_train_gr_smpl.csv')
labels = pd.read_csv('data/y_train_smpl.csv')

data_top_2 = data[data.columns[list(top_2_array)]].copy(deep=True)
data_top_2['label'] = labels
# data_top_2 = data_top_2.sample(frac=1)

data_top_5 = data[data.columns[list(top_5_array)]].copy(deep=True)
data_top_5['label'] = labels
# data_top_5 = data_top_5.sample(frac=1)

data_top_10 = data[data.columns[list(top_10_array)]].copy(deep=True)
data_top_10['label'] = labels
# data_top_10 = data_top_10.sample(frac=1)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

X = data_top_2.iloc[:, :-1]  # All columns but the label
y = data_top_2['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.20      0.33      0.25       456
           1       0.30      0.20      0.24       617
           2       0.09      0.78      0.17       131
           3       0.88      0.55      0.68       425
           4       0.55      0.40      0.46       703
           5       0.90      0.51      0.65       706
           6       0.49      0.64      0.56       276
           7       0.27      0.38      0.32        86
           8       0.37      0.06      0.10       676
           9       0.33      0.43      0.38       102

    accuracy                           0.37      4178
   macro avg       0.44      0.43      0.38      4178
weighted avg       0.51      0.37      0.39      4178



In [6]:
X = data_top_5.iloc[:, :-1]
y = data_top_5['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.26      0.37      0.31       488
           1       0.29      0.21      0.24       629
           2       0.11      0.93      0.19       131
           3       0.90      0.60      0.72       430
           4       0.58      0.33      0.42       694
           5       0.95      0.46      0.62       740
           6       0.53      0.68      0.59       244
           7       0.16      0.40      0.23        83
           8       0.62      0.09      0.16       634
           9       0.21      0.51      0.30       105

    accuracy                           0.38      4178
   macro avg       0.46      0.46      0.38      4178
weighted avg       0.57      0.38      0.40      4178



In [7]:
X = data_top_10.iloc[:, :-1]
y = data_top_10['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.38      0.29       465
           1       0.30      0.18      0.23       597
           2       0.12      0.89      0.21       151
           3       0.96      0.60      0.74       437
           4       0.61      0.36      0.45       700
           5       0.88      0.50      0.64       692
           6       0.53      0.69      0.60       255
           7       0.15      0.41      0.21        86
           8       0.68      0.11      0.19       683
           9       0.27      0.49      0.35       112

    accuracy                           0.39      4178
   macro avg       0.47      0.46      0.39      4178
weighted avg       0.58      0.39      0.41      4178



In [8]:
confusion_matrix(y_test, y_pred)

array([[175,  42, 186,   1,  15,   1,  19,  20,   3,   3],
       [152, 110, 243,   0,  17,   1,  13,  33,   0,  28],
       [  2,   1, 135,   0,   0,   0,   0,  13,   0,   0],
       [ 39,  39,  58, 264,  23,   2,   3,   0,   2,   7],
       [104,  36, 159,   3, 250,   1,  17,  97,  14,  19],
       [109,  43,  66,   4,   2, 345,  36,   0,  12,  75],
       [ 27,  13,  11,   0,   5,  14, 175,   3,   4,   3],
       [  8,  15,  11,   0,  13,   0,   4,  35,   0,   0],
       [103,  45, 243,   3,  84,  25,  51,  39,  75,  15],
       [  9,  23,  14,   0,   0,   1,  10,   0,   0,  55]])

# 7. Conclusions (Draft)

### Which streets signs are harder to recognise?
Based on the recall measure labels 1 and 8 are the harsdest to recognize.
### Which street signs are most easily confused?
Based on the confusion matrix the most confused sign is the label 2 and, in order, it's mistaken for labels 1, 8, 0 and 4. 
### Which attributes (fields) are more reliable and which are less reliable in classification of street signs?
By looking at top_2_array we can identify the most reliable features (pixels). By doing some operations like the module we can see those pixels are closer to the centre of the image (or at least don't belong to the borders).
Regarding the less reliable pixels we consider the ones with the lowest correlation value with respect to the label.
### What was the purpose of Tasks 5 and 6?
Prepare the data in a way that we keep the most relevant attributes.
### What would happen if the data sets you used in Tasks 4, 5 and 6 were not randomised?
It doens't happen anything, the results are the same.
### What would happen if there is cross-correlation between the non-class attributes?
There is actually cross-correlation between pixels that are close to each other. As a consecuence of that, if two pixels are correlated (adjacent) to each other, and one of them is included in the top 10 of the most correlated pixels with the label, the other is likely to be in the top 10 as well.

In [33]:
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score

estimator = KMeans(n_clusters=10, random_state=1, n_jobs=-1).fit(data_top_10)

In [34]:
print(homogeneity_score(y, estimator.labels_))

0.20017461793830968
