In [1]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, random_split, ConcatDataset
from torchvision import datasets, transforms
import tensorflow as tf
from skimage import io, color, transform
import sklearn
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from PIL import Image
from preprocessing import *
from machine_learning import *
import cv2 as cv

### Load data

In [2]:
# Specify the base folder where all your image classes are stored
folder = 'data'

### Load images in the right format for scikit-learn.

In [3]:
images, labels, class_names = load_images(folder, image_size=(300,300))
images = images.astype(np.float32)

## Preprocessing the images

### Image normalization

In [4]:
# Image normalization
images /= 255.0

### Remove noise from images using Gaussian Blur.

In [5]:
for i in range(len(images)):
    images[i] = cv.GaussianBlur(images[i], (5, 5), 0)

## **Classification using color histograms**

### Compute color histograms and flatten images

In [6]:
feature_vectors = []
for img in images:  # 'images' contiene tutte le immagini
    feature_vector = calculate_histograms(img, norm=True)  # Calcola l'istogramma per l'immagine
    feature_vector /= feature_vector.sum()  # Normalizza l'istogramma al totale dei pixel
    feature_vectors.append(feature_vector)  # Aggiungi il vettore alla lista
features = np.array(feature_vectors)  # Converti in array numpy

### Train Test Split.

In [7]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

### SVM

In [None]:
svm = SVC()
grid_svm = {'kernel':['poly', 'rbf'],
            'degree':[2, 3],
            'gamma': ['scale', 'auto']}

grid = GridSearchCV(svm, grid_svm, cv=5, scoring='accuracy', verbose=2)
if grid is not None:
    grid.fit(x_train, y_train)

    print(f'Best score: {grid.best_score_}')
    for hp, value in grid.best_params_.items():
        print(f'{hp}:{value}\n')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................degree=2, gamma=scale, kernel=poly; total time= 2.2min
[CV] END .................degree=2, gamma=scale, kernel=poly; total time= 6.2min


In [None]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1, gamma='scale')  # Usa kernel RBF per catturare relazioni non lineari
svm.fit(x_train, y_train)


### Training error for SVM

In [None]:
y_pred_train = svm.predict(x_train)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.58      0.58      0.58      1229
           1       0.59      0.44      0.50      1209
           2       0.61      0.45      0.52      1421
           3       0.51      0.95      0.67      4253
           4       0.47      0.50      0.49      2402
           5       0.55      0.21      0.30      1210
           6       0.44      0.32      0.37      1317
           7       0.61      0.24      0.34      1526
           8       0.50      0.21      0.30      1581
           9       0.66      0.55      0.60      1223

    accuracy                           0.53     17371
   macro avg       0.55      0.44      0.47     17371
weighted avg       0.54      0.53      0.50     17371



### Test error for SVM

In [None]:
y_pred = svm.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.49      0.47       271
           1       0.57      0.41      0.47       291
           2       0.56      0.32      0.41       389
           3       0.51      0.96      0.67      1070
           4       0.45      0.51      0.48       637
           5       0.37      0.13      0.20       290
           6       0.41      0.29      0.34       333
           7       0.54      0.22      0.31       389
           8       0.43      0.16      0.23       396
           9       0.44      0.41      0.42       277

    accuracy                           0.49      4343
   macro avg       0.47      0.39      0.40      4343
weighted avg       0.48      0.49      0.45      4343



### K-NN

In [26]:
knn = KNeighborsClassifier()
grid_knn = {'n_neighbors':[i for i in range(50, 350, 50)],
            'weights':['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']}

grid = GridSearchCV(knn, grid_knn, cv=5, scoring='accuracy', verbose=2)
if grid is not None:
    grid.fit(x_train, y_train)

    print(f'Best score: {grid.best_score_}')
    for hp, value in grid.best_params_.items():
        print(f'{hp}:{value}\n')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..metric=euclidean, n_neighbors=50, weights=uniform; total time=   0.9s
[CV] END ..metric=euclidean, n_neighbors=50, weights=uniform; total time=   0.8s
[CV] END ..metric=euclidean, n_neighbors=50, weights=uniform; total time=   0.8s
[CV] END ..metric=euclidean, n_neighbors=50, weights=uniform; total time=   0.9s
[CV] END ..metric=euclidean, n_neighbors=50, weights=uniform; total time=   1.0s
[CV] END .metric=euclidean, n_neighbors=50, weights=distance; total time=   0.8s
[CV] END .metric=euclidean, n_neighbors=50, weights=distance; total time=   0.9s
[CV] END .metric=euclidean, n_neighbors=50, weights=distance; total time=   1.0s
[CV] END .metric=euclidean, n_neighbors=50, weights=distance; total time=   0.9s
[CV] END .metric=euclidean, n_neighbors=50, weights=distance; total time=   0.8s
[CV] END .metric=euclidean, n_neighbors=100, weights=uniform; total time=   1.0s
[CV] END .metric=euclidean, n_neighbors=100, we

In [30]:
knn = KNeighborsClassifier(n_neighbors=50, weights='distance', metric='manhattan', p=1)

knn.fit(x_train, y_train)

### Training error for KNN

In [31]:
y_pred_train = knn.predict(x_train)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1229
           1       1.00      1.00      1.00      1209
           2       1.00      1.00      1.00      1421
           3       1.00      1.00      1.00      4253
           4       1.00      1.00      1.00      2402
           5       1.00      1.00      1.00      1210
           6       1.00      1.00      1.00      1317
           7       1.00      1.00      1.00      1526
           8       1.00      1.00      1.00      1581
           9       1.00      1.00      1.00      1223

    accuracy                           1.00     17371
   macro avg       1.00      1.00      1.00     17371
weighted avg       1.00      1.00      1.00     17371



### Test error for KNN

In [32]:
y_pred = knn.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.39      0.46       271
           1       0.41      0.67      0.51       291
           2       0.61      0.30      0.41       389
           3       0.57      0.90      0.70      1070
           4       0.51      0.42      0.46       637
           5       0.40      0.17      0.24       290
           6       0.33      0.47      0.39       333
           7       0.62      0.23      0.33       389
           8       0.38      0.33      0.35       396
           9       0.56      0.36      0.44       277

    accuracy                           0.50      4343
   macro avg       0.50      0.42      0.43      4343
weighted avg       0.51      0.50      0.47      4343



### Decision Tree

In [46]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

grid_tree = {'criterion':['gini', 'entropy', 'log_loss'],
             'splitter' : ['best', 'random'],
             'max_depth': [5, 10, 15]}

grid = GridSearchCV(tree, grid_tree, cv=5, scoring='accuracy', verbose=2)
if grid is not None:
    grid.fit(x_train, y_train)

    print(f'Best score: {grid.best_score_}')
    for hp, value in grid.best_params_.items():
        print(f'{hp}:{value}\n')

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END .........criterion=gini, max_depth=5, splitter=best; total time=   3.9s
[CV] END .........criterion=gini, max_depth=5, splitter=best; total time=   3.8s
[CV] END .........criterion=gini, max_depth=5, splitter=best; total time=   4.1s
[CV] END .........criterion=gini, max_depth=5, splitter=best; total time=   3.8s
[CV] END .........criterion=gini, max_depth=5, splitter=best; total time=   3.9s
[CV] END .......criterion=gini, max_depth=5, splitter=random; total time=   0.4s
[CV] END .......criterion=gini, max_depth=5, splitter=random; total time=   0.4s
[CV] END .......criterion=gini, max_depth=5, splitter=random; total time=   0.3s
[CV] END .......criterion=gini, max_depth=5, splitter=random; total time=   0.5s
[CV] END .......criterion=gini, max_depth=5, splitter=random; total time=   0.4s
[CV] END ........criterion=gini, max_depth=10, splitter=best; total time=   7.7s
[CV] END ........criterion=gini, max_depth=10, s

In [47]:
tree = DecisionTreeClassifier(criterion='log_loss', max_depth=10, splitter='best')
tree.fit(x_train, y_train)

### Train error

In [48]:
y_pred_train = tree.predict(x_train)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.65      0.60      0.63      1229
           1       0.66      0.66      0.66      1209
           2       0.63      0.55      0.59      1421
           3       0.62      0.86      0.72      4253
           4       0.56      0.62      0.59      2402
           5       0.52      0.32      0.40      1210
           6       0.48      0.50      0.49      1317
           7       0.66      0.50      0.57      1526
           8       0.60      0.39      0.47      1581
           9       0.84      0.66      0.74      1223

    accuracy                           0.62     17371
   macro avg       0.62      0.57      0.59     17371
weighted avg       0.62      0.62      0.61     17371



### Test error

In [49]:
y_pred = tree.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      0.35      0.35       271
           1       0.47      0.48      0.47       291
           2       0.40      0.28      0.33       389
           3       0.57      0.77      0.65      1070
           4       0.36      0.40      0.38       637
           5       0.24      0.16      0.20       290
           6       0.28      0.28      0.28       333
           7       0.36      0.30      0.33       389
           8       0.29      0.20      0.23       396
           9       0.53      0.38      0.44       277

    accuracy                           0.43      4343
   macro avg       0.38      0.36      0.37      4343
weighted avg       0.41      0.43      0.41      4343



### RANDOM FOREST

In [41]:
from sklearn.ensemble import RandomForestClassifier

rnd = RandomForestClassifier()
grid_rnd = {'n_estimators':[i for i in range(10, 60, 10)],
            'max_depth':[j for j in range(5, 20, 5)],
            'max_leaf_nodes': [5, 10, 15]}

grid = GridSearchCV(rnd, grid_rnd, cv=5, scoring='accuracy', verbose=2)
if grid is not None:
    grid.fit(x_train, y_train)

    print(f'Best score: {grid.best_score_}')
    for hp, value in grid.best_params_.items():
        print(f'{hp}:{value}\n')

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=10; total time=   0.6s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=10; total time=   0.6s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=10; total time=   0.5s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=10; total time=   0.6s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=10; total time=   0.6s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=20; total time=   1.2s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=20; total time=   1.2s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=20; total time=   1.2s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=20; total time=   1.2s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=20; total time=   1.2s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_estimators=30; total time=   1.9s
[CV] END .....max_depth=5, max_leaf_nodes=5, n_

In [42]:
rnd = RandomForestClassifier(n_estimators=50, max_depth=15, max_leaf_nodes=15, random_state=42)
rnd.fit(x_train, y_train)

### Training error

In [43]:
y_pred_train = rnd.predict(x_train)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.39      0.44      0.41      1229
           1       0.41      0.50      0.45      1209
           2       0.32      0.36      0.34      1421
           3       0.44      0.95      0.60      4253
           4       0.32      0.24      0.27      2402
           5       0.00      0.00      0.00      1210
           6       0.80      0.01      0.01      1317
           7       0.50      0.07      0.13      1526
           8       0.43      0.01      0.02      1581
           9       0.43      0.57      0.49      1223

    accuracy                           0.41     17371
   macro avg       0.40      0.32      0.27     17371
weighted avg       0.41      0.41      0.32     17371



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test error

In [44]:
y_pred = rnd.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.35      0.43      0.38       271
           1       0.44      0.55      0.49       291
           2       0.30      0.30      0.30       389
           3       0.44      0.96      0.60      1070
           4       0.41      0.30      0.34       637
           5       0.00      0.00      0.00       290
           6       1.00      0.01      0.02       333
           7       0.67      0.12      0.20       389
           8       0.23      0.01      0.01       396
           9       0.43      0.57      0.49       277

    accuracy                           0.42      4343
   macro avg       0.43      0.32      0.28      4343
weighted avg       0.43      0.42      0.33      4343



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## **Classification with images as matrices** (size 300x300)

### Train, test split

In [6]:
x_train, x_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)
x_train = x_train.reshape(x_train.shape[0], -1)

### Random Forest

In [7]:
rndf = RandomForestClassifier(n_estimators=50, max_depth=15, max_leaf_nodes=15, random_state=42)
rndf.fit(x_train, y_train)

### Train error

In [8]:
y_pred_train = rndf.predict(x_train)
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.51      0.41      0.45      1229
           1       0.30      0.29      0.29      1209
           2       0.46      0.35      0.40      1421
           3       0.48      0.97      0.65      4253
           4       0.28      0.44      0.34      2402
           5       0.50      0.00      0.00      1210
           6       0.00      0.00      0.00      1317
           7       0.00      0.00      0.00      1526
           8       0.53      0.01      0.01      1581
           9       0.44      0.62      0.52      1223

    accuracy                           0.42     17371
   macro avg       0.35      0.31      0.27     17371
weighted avg       0.36      0.42      0.33     17371



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test error

In [9]:
x_test = x_test.reshape(x_test.shape[0], -1)
y_pred = rndf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.39      0.42       271
           1       0.25      0.22      0.23       291
           2       0.43      0.28      0.34       389
           3       0.47      0.97      0.63      1070
           4       0.29      0.44      0.35       637
           5       0.00      0.00      0.00       290
           6       0.00      0.00      0.00       333
           7       0.00      0.00      0.00       389
           8       0.80      0.01      0.02       396
           9       0.39      0.57      0.46       277

    accuracy                           0.40      4343
   macro avg       0.31      0.29      0.25      4343
weighted avg       0.34      0.40      0.31      4343



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Logistic Regression

In [None]:
lr = LogisticRegression(multi_class='multinomial', solver = 'saga', penalty = 'elasticnet', l1_ratio=0.5, random_state=42)
lr.fit(x_train, y_train)



### Train error

In [None]:
y_pred_train = lr.predict(x_train)
print(classification_report(y_train, y_pred_train))

### Test error

In [None]:
y_pred = lr.predict(x_test)
print(classification_report(y_test, y_pred))