In [3]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torchvision import datasets, transforms
import tensorflow as tf
from skimage import io, color, transform
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, top_k_accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from PIL import Image
from preprocessing import check_dimensions, check_channels, check_max_min_dimensions, count_small_images, delete_invalid_images
import cv2 as cv

### Load data

In [4]:
# Root directory that holds one sub-folder per image class.
folder = 'data'
# Build the dataset from that directory; every image is converted to a tensor when it is loaded.
data = datasets.ImageFolder(
    root=folder,
    transform=transforms.ToTensor(),
)

In [4]:
# Lookup table from integer class index to class name, taken from the dataset's class list.
class_map = dict(enumerate(data.classes))
print(class_map)

{0: 'battery', 1: 'biological', 2: 'cardboard', 3: 'clothes', 4: 'glass', 5: 'metal', 6: 'paper', 7: 'plastic', 8: 'shoes', 9: 'trash'}


### Check if all images have the same dimensions and the same number of channels.

Takes about 2.20 minutes to run.

In [5]:
# Report whether every image in the dataset shares one shape and one channel count.
# Both helpers are imported from the project-local `preprocessing` module.
check_dimensions(data)
check_channels(data)

Not all images have the same shape.
All images have 3 channels.


### Check max and min dimensions and channels of the images.

Takes about 2.09 minutes to run.

In [6]:
# Report the smallest and largest image width and height found in the dataset.
check_max_min_dimensions(data)

Min Width: 71, Max Width: 6283
Min Height: 51, Max Height: 7786


### Count how many images are smaller than a threshold based on height and width and based on the number of pixels.

In [4]:
# Count, per class, the images that have at least one side below the 224-pixel threshold.
# NOTE(review): whether size_threshold is (width, height) or (height, width) is not visible here — confirm in `preprocessing`.
count_small_images(data, size_threshold=(224, 224))

Class battery: 30 images have at least one dimension smaller than 224 or 224.
Class biological: 8 images have at least one dimension smaller than 224 or 224.
Class cardboard: 34 images have at least one dimension smaller than 224 or 224.
Class clothes: 0 images have at least one dimension smaller than 224 or 224.
Class glass: 39 images have at least one dimension smaller than 224 or 224.
Class metal: 11 images have at least one dimension smaller than 224 or 224.
Class paper: 31 images have at least one dimension smaller than 224 or 224.
Class plastic: 18 images have at least one dimension smaller than 224 or 224.
Class shoes: 34 images have at least one dimension smaller than 224 or 224.
Class trash: 9 images have at least one dimension smaller than 224 or 224.
Total invalid images: 214


### Since only a few images are smaller than (224, 224), we decided to drop those.

### TODO: this cell still has to be run.

In [5]:
# Drop the images that fall below the (224, 224) size threshold and keep the result as `clean_data`.
# NOTE(review): the helper's name suggests it may delete files on disk rather than only filter
# the dataset — confirm in `preprocessing` before re-running.
clean_data = delete_invalid_images(data, size_threshold=(224, 224))

KeyboardInterrupt: 

### Save cleaned dataset

In [8]:
# Compare dataset sizes before and after cleaning: the first is printed, the second is the cell's displayed value.
# NOTE(review): the recorded counts (19407 vs 14173) differ by 5234 images, far more than the
# 214 reported as too small — confirm what the cleaning step actually removed.
print(len(data))
len(clean_data)

19407


14173

In [None]:
# Persist the cleaned dataset object to `clean_data.pt` (path is relative to the working directory).
# NOTE(review): torch.save pickles the object, so loading it back presumably requires the same
# dataset class to be importable — confirm this is the intended storage format.
torch.save(clean_data, 'clean_data.pt')

### Since not all images have the same dimensions, we have to set a standard size. We choose to resize images to (224, 224) and keep the RGB format, i.e. 3 channels (the channel check above reported 3 channels for every image).