In [1]:
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate
from pathlib import Path
from glob2 import glob
from sklearn.metrics import confusion_matrix
import torch

import pandas as pd
import numpy as np
import os
import zipfile as zf
import shutil
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
import boto3
import io
from tqdm import tqdm
from data_processing import sort_images, binary_search


## 1. Extract data

In [14]:
def get_category_and_subcategory(filenames,key):
    """Return the (category, sub-category) recorded in the spreadsheet for an S3 key.

    Only the last path component of `key` is looked up. The row is read from
    the module-level `excel_data` DataFrame, not from `filenames`.

    NOTE(review): `binary_search` is imported from data_processing; it is
    presumably returning the position of the file name inside `filenames`,
    and `filenames` is presumably sorted and aligned with `excel_data` rows —
    confirm against the helper.
    """
    # The file name is whatever follows the last '/' of the key.
    basename = key.split('/')[-1]

    position = binary_search(filenames, basename)

    # One-row positional slice; the two cells are then read back by label `position`.
    match = excel_data[position:position+1]
    return match['WASTE_TYPE'][position], match['WASTE_SUB_TYPE'][position]

In [26]:
from data_processing import sort_images, binary_search

# S3 client plus the bucket / key names handed to the sorting step.
s3 = boto3.client('s3')
bucket_name = 'trashback-data'
excel_file_key = 'waste_pics.xlsx'
image_folder_key = 'trashback-images/'
target_folder_name = 'sorted_images_test/'

# sort_images comes from the local data_processing module. According to the
# output captured below, this run handled 6411 images and took about an hour.
# NOTE(review): presumably it places each image under target_folder_name
# according to the spreadsheet labels — confirm in data_processing.
sort_images(bucket_name,image_folder_key,target_folder_name,excel_file_key)

Sorting 6411 images...


Progress: 100%|██████████| 6411/6411 [59:46<00:00,  1.79images/s]  


Images sorted successfully.


In [6]:
def create_folder_names_from_excel(excel_data):

    '''
    Build the list of distinct "category/sub-category/" folder names.

    input : excel_data, a DataFrame with 'WASTE_TYPE' and 'WASTE_SUB_TYPE' columns

    output : folder_names, a list of strings such as
             'Plastique/Autre déchet plastique/', in order of first appearance

    Rows where either label is missing are ignored. The input DataFrame is
    not modified.
    '''

    # dropna() returns a new frame, so its result has to be kept; only the two
    # label columns decide whether a row is usable.
    labelled = excel_data.dropna(subset=['WASTE_TYPE', 'WASTE_SUB_TYPE'])

    folder_names = []
    seen = set()  # constant-time membership test instead of scanning the list

    # Iterate over every labelled row rather than a hard-coded row count.
    for category, sub_cat in zip(labelled['WASTE_TYPE'], labelled['WASTE_SUB_TYPE']):
        folder_name = str(category) + '/' + str(sub_cat) + '/'
        if folder_name not in seen:
            seen.add(folder_name)
            folder_names.append(folder_name)

    return folder_names


In [25]:
def _put_s3_key_if_missing(key):
    '''Create an empty object at `key` in `bucket_name` unless one already exists.'''
    try:
        s3.head_object(Bucket=bucket_name, Key=key)
    except s3.exceptions.ClientError as e:
        # Only "not found" means the placeholder has to be created; anything
        # else (access denied, throttling, ...) must not be hidden.
        if e.response['Error']['Code'] != '404':
            raise
        s3.put_object(Bucket=bucket_name, Key=key, Body='')


def create_folders_if_needed(folder_name,folder_names):
    '''
    Create the category / sub-category placeholder objects in the S3 bucket.

    input : folder_name, the key prefix under which the folders are created
            folder_names, a list of 'category/sub-category/' strings,
            e.g. 'Plastique/Autre déchet plastique/'

    output : nothing; missing placeholder objects are created in the
             module-level `bucket_name` through the module-level `s3` client

    raises : s3.exceptions.ClientError when the existence check fails for any
             reason other than a 404
    '''

    checked = set()  # keys already handled, to avoid repeated S3 round trips

    for string in folder_names:
        parts = string.split('/')
        category, sub_category = parts[0], parts[1]

        # Category placeholder first, then the sub-category one.
        # NOTE(review): the sub-category key has no trailing '/', unlike the
        # category key — kept as in the original; confirm this is intended.
        for key in (f'{folder_name}/{category}/',
                    f'{folder_name}/{category}/{sub_category}'):
            if key not in checked:
                checked.add(key)
                _put_s3_key_if_missing(key)

    print("Folders created successfully.")
#create_folders_if_needed(folder_names)

## 2. Creating the model

In [4]:
import os, shutil, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

2024-02-20 10:50:11.801194: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Bucket and key names for the training / test image folders and the label spreadsheet.
bucket_name = 'trashback-data'
test_folder_key = 'sorted_images_test/'
train_folder_key = 'sorted_images/'
excel_file_key = 'waste_pics.xlsx'


s3 = boto3.client('s3')

# Download the spreadsheet into memory and parse it with pandas.
excel_obj = s3.get_object(Bucket=bucket_name, Key=excel_file_key)
excel_data = pd.read_excel(io.BytesIO(excel_obj['Body'].read()))


# `categories` holds the distinct 'category/sub-category/' folder names; num_classes is their count.
categories = create_folder_names_from_excel(excel_data)  # slow way of extracting the categories
num_classes = len(categories)


Model creation

In [None]:
# Build a dictionary mapping each 'category/sub-category/' folder name to the
# index of the first spreadsheet row that carries that pair.

categories_dict = {}
# Stored under its own name: rebinding `categories` here would replace the
# folder-name list that the later train/test filtering relies on.
waste_types = excel_data['WASTE_TYPE'].unique()

# Walk every row of the spreadsheet instead of a hard-coded row count.
# NOTE(review): assumes excel_data has the default 0..n-1 index, as produced by pd.read_excel.
for i, (category, sub_cat) in enumerate(zip(excel_data['WASTE_TYPE'], excel_data['WASTE_SUB_TYPE'])):
    folder_name = str(category) + '/' + str(sub_cat) + '/'
    if folder_name not in categories_dict:
        categories_dict[folder_name] = i

In [12]:
# we are going to create a dictionary with the keys of images as well as their category
from data_processing import list_keys

def create_dict_of_images(bucket_name,folder_key,excel_data):
    '''
    Map every image key found under `folder_key` to its 'category/sub-category' label.

    input : bucket_name and folder_key, forwarded to list_keys
            excel_data, forwarded to get_category_and_subcategory

    output : dict_of_images, {key: 'category/sub-category'}
    '''

    # The listing must not be overwritten before the loop, otherwise the
    # function always returns an empty dictionary.
    keys = list_keys(bucket_name,folder_key)

    dict_of_images = {}

    for key in keys:
        # NOTE(review): get_category_and_subcategory names its first parameter
        # `filenames` and hands it to binary_search; passing the whole
        # DataFrame here looks suspicious — confirm what the helper expects.
        category,sub_category = get_category_and_subcategory(excel_data,key)
        dict_of_images[key] = category + '/' + sub_category

    return dict_of_images


In [20]:
from data_processing import list_keys

# Keys under the training folder, via list_keys from the local data_processing module.
# NOTE(review): presumably a list of S3 object keys (strings) — confirm in data_processing.
keys = list_keys(bucket_name,train_folder_key)

In [75]:
dict_of_images = {}
dict_filenames = {}

# Each key is read as ".../<category>/<sub_category>/<filename>": the label is
# taken from the two path components that precede the file name.
for key in keys:
    parts = key.split('/')
    filename = parts[-1]
    category, sub_category = parts[-3], parts[-2]
    label = category + '/' + sub_category
    dict_of_images[key] = label        # full key  -> label
    dict_filenames[filename] = label   # file name -> label



In [76]:
# Convert both dictionaries into DataFrames, one row per image.

df = pd.DataFrame(list(dict_of_images.items()),columns = ['path','category'])
df2 = pd.DataFrame(list(dict_filenames.items()),columns = ['filename','category'])

# Shuffle the rows. The seed is fixed so that the train/test split, which is
# taken from the row order, is identical on every run of the notebook.
SHUFFLE_SEED = 42

df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df2 = df2.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [77]:
def split_data(data,ratio):
    """Cut `data` in two consecutive pieces.

    The first piece holds the leading int(len(data) * ratio) items, the second
    piece holds the rest. No shuffling is done here.
    """
    cut = int(ratio * len(data))
    head = data[:cut]
    tail = data[cut:]
    return head, tail

In [78]:
# Local directory holding the downloaded images.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
base_path = '/Users/macbook/Desktop/Trashback/'

# Reduce each entry to its bare file name before prefixing, so that re-running
# this cell does not prepend base_path a second time.
df2['filename'] = base_path + df2['filename'].map(os.path.basename)

In [80]:
# First 80% of the shuffled rows become the training set, the remaining 20% the test set.
train,test=split_data(df2,.8)


In [56]:
# Persist both splits to the current working directory, without the index column.
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)

In [81]:
# Remove the rows whose label is not in the category list.
# A trailing '/' is appended because the folder names built from the
# spreadsheet have the form 'category/sub-category/'.
# NOTE(review): assumes `categories` still holds that folder-name list — confirm no cell has rebound it.

train = train[(train['category']+'/').isin(categories)]
test = test[(test['category']+'/').isin(categories)]


In [82]:
# Number of training rows per label, most frequent first.
train["category"].value_counts()

category
Métaux/Canette                                    3445
Plastique/Bouteille en plastique                  2568
Plastique/Autre déchet plastique                  2261
Plastique/Emballage plastique                     2245
Métaux/Autre déchet métaux                        1142
Verre/Bouteille en verre                          1141
Mégots/Mégots                                      813
Papier ou Carton/Carton                            681
Plastique/Sac en plastique                         545
Plastique/Bouchon en plastique                     394
Papier ou Carton/Récipient liquide                 333
Papier ou Carton/Emballage bonbon                  310
Plastique/Polystyrène                              298
Verre/< 10 morceaux de verre                       267
Papier ou Carton/Ticket                            258
Papier ou Carton/Sac en papier                     239
Mégots/< 10 mégots                                 238
Plastique/Gobelet en plastique                     221
T

In [83]:
# Number of test rows per label, most frequent first.
test["category"].value_counts()

category
Métaux/Canette                                899
Plastique/Bouteille en plastique              663
Plastique/Autre déchet plastique              542
Plastique/Emballage plastique                 533
Métaux/Autre déchet métaux                    312
Verre/Bouteille en verre                      272
Mégots/Mégots                                 201
Papier ou Carton/Carton                       174
Plastique/Sac en plastique                    125
Plastique/Bouchon en plastique                110
Papier ou Carton/Emballage bonbon              83
Papier ou Carton/Récipient liquide             83
Papier ou Carton/Ticket                        66
Verre/< 10 morceaux de verre                   66
Mégots/< 10 mégots                             62
Plastique/Polystyrène                          62
Plastique/Gobelet en plastique                 62
Textile/Autre déchet textile                   55
Papier ou Carton/Sac en papier                 51
Mégots/~ 25 mégots                       

In [84]:
# Keep the first 90% of the training rows for training and the last 10% for validation.
# NOTE: this rebinds `train`, so running the cell again shrinks the training set once more.
train,valid=split_data(train,.9)


In [85]:
# Display the training DataFrame.
train


Unnamed: 0,filename,category
0,/Users/macbook/Desktop/Trashback/1690890272647-image1690890265729.jpg,Métaux/Canette
1,/Users/macbook/Desktop/Trashback/1697531309273-user.jpg,Verre/< 10 morceaux de verre
2,/Users/macbook/Desktop/Trashback/1689932864492-user.jpg,Plastique/Bouteille en plastique
3,/Users/macbook/Desktop/Trashback/1694002474287-image1694002399218.jpg,Mégots/Mégots
4,/Users/macbook/Desktop/Trashback/1689870910900-image1689870876332.jpg,Mégots/Mégots
...,...,...
18886,/Users/macbook/Desktop/Trashback/1690797034541-image1690797012818.jpg,Plastique/Sac en plastique
18887,/Users/macbook/Desktop/Trashback/1696784502253-image1696784507143.jpg,Papier ou Carton/Ticket
18888,/Users/macbook/Desktop/Trashback/1689865671335-image1689865691067.jpg,Métaux/Canette
18889,/Users/macbook/Desktop/Trashback/1698375594892-image1698375620046.jpg,Plastique/Emballage plastique


In [66]:
#we are going to convert paths into s3 urls

def convert_to_s3_url(bucket_name,df,column='path'):
    """Turn the S3 keys stored in one column of `df` into 's3://bucket/key' URLs.

    Parameters:
        bucket_name: name of the bucket to put in the URL.
        df: DataFrame holding the keys; it is modified in place.
        column: name of the column to rewrite (defaults to 'path').

    Returns:
        The same DataFrame object, for convenience.

    Values that already start with 's3://<bucket_name>/' are left untouched,
    so applying the function twice does not stack the prefix.
    """
    prefix = 's3://' + bucket_name + '/'
    # na=False: missing values count as "not yet prefixed"; they stay missing
    # after the concatenation below.
    already_url = df[column].str.startswith(prefix, na=False)
    df[column] = df[column].where(already_url, prefix + df[column])
    return df

# NOTE(review): convert_to_s3_url rewrites the 'path' column by default, but
# train / valid / test are cut from df2, whose columns are 'filename' and
# 'category'. Run top to bottom, this cell would therefore fail on the missing
# column — confirm whether it is still needed.
train = convert_to_s3_url(bucket_name,train)
valid = convert_to_s3_url(bucket_name,valid)
test = convert_to_s3_url(bucket_name,test)

In [86]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


# Training hyper-parameters.
batch_size = 16
size=224   # images are resized to size x size
epoch=50

# Pixel values are rescaled from [0, 255] to [0, 1].
# class_mode="input" makes every batch return the images themselves as the
# target, which is what an autoencoder is trained against.
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train,
    x_col='filename',
    y_col='category',
    target_size=(size, size),
    batch_size=batch_size,
    class_mode="input"
    )

# The validation generator is built from its own data generator, so that a
# later change to the training pre-processing (e.g. augmentation) does not
# leak into validation.
valid_datagen = ImageDataGenerator(rescale=1./255)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=valid,
    x_col='filename',
    y_col='category',
    target_size=(size, size),
    batch_size=batch_size,
    class_mode="input"
    )

Found 17359 validated image filenames.
Found 1929 validated image filenames.




In [87]:
# Preview the first rows of the training DataFrame.
train.head()

Unnamed: 0,filename,category
0,/Users/macbook/Desktop/Trashback/1690890272647-image1690890265729.jpg,Métaux/Canette
1,/Users/macbook/Desktop/Trashback/1697531309273-user.jpg,Verre/< 10 morceaux de verre
2,/Users/macbook/Desktop/Trashback/1689932864492-user.jpg,Plastique/Bouteille en plastique
3,/Users/macbook/Desktop/Trashback/1694002474287-image1694002399218.jpg,Mégots/Mégots
4,/Users/macbook/Desktop/Trashback/1689870910900-image1689870876332.jpg,Mégots/Mégots


In [88]:
# Same preview as the previous cell (duplicate).
train.head()

Unnamed: 0,filename,category
0,/Users/macbook/Desktop/Trashback/1690890272647-image1690890265729.jpg,Métaux/Canette
1,/Users/macbook/Desktop/Trashback/1697531309273-user.jpg,Verre/< 10 morceaux de verre
2,/Users/macbook/Desktop/Trashback/1689932864492-user.jpg,Plastique/Bouteille en plastique
3,/Users/macbook/Desktop/Trashback/1694002474287-image1694002399218.jpg,Mégots/Mégots
4,/Users/macbook/Desktop/Trashback/1689870910900-image1689870876332.jpg,Mégots/Mégots


In [44]:
import cv2
from PIL import Image
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Input, Lambda, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization,MaxPooling2D,BatchNormalization,\
                        Permute, TimeDistributed, GlobalAveragePooling2D, SeparableConv2D,\
ZeroPadding2D, Convolution2D, ZeroPadding2D, Conv2DTranspose,ReLU, UpSampling2D, Concatenate, Conv2DTranspose

In [46]:
def se_block_enc(inputs,alpha):
    """Squeeze-and-excitation style channel re-weighting of a feature map.

    The feature map is averaged over its spatial dimensions, passed through a
    Dense layer of `alpha` ReLU units and a Dense sigmoid layer with one unit
    per input channel; the resulting per-channel gates multiply `inputs`.

    inputs: 4-D feature map whose last axis is the channel axis.
    alpha: number of units of the intermediate Dense layer.
    Returns a tensor with the same shape as `inputs`.
    """
    n_channels = inputs.shape[-1]

    # Squeeze: one average value per channel.
    squeezed = tf.keras.layers.GlobalAveragePooling2D()(inputs)

    # Excite: small bottleneck, then one gate in (0, 1) per channel.
    bottleneck = tf.keras.layers.Dense(units=alpha, activation="relu")(squeezed)
    gates = tf.keras.layers.Dense(units=n_channels, activation="sigmoid")(bottleneck)

    # Reshape to (batch, 1, 1, channels) so the gates broadcast over height and width.
    gates = tf.reshape(gates, [-1, 1, 1, n_channels])
    return inputs * gates

In [47]:

# ENCODER
# Three Conv2D -> channel re-weighting -> 2x2 max-pool stages (each pool halves
# height and width), followed by a 1x1 convolution down to 32 channels.
input_img = Input(shape=(size,size, 3))  
x = Conv2D(48, (3, 3), activation='relu', padding='same')(input_img)
x=se_block_enc(x,20)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(96, (3, 3), activation='relu', padding='same')(x)
x=se_block_enc(x,30)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(192, (3, 3), activation='relu', padding='same')(x)
x=se_block_enc(x,50)
x = MaxPooling2D((2, 2), padding='same')(x)
encoded = Conv2D(32, (1, 1), activation='relu', padding='same')(x)

# Bottleneck
# Taken from the encoder output (batch axis dropped) instead of being typed in
# by hand, so it stays correct when `size` changes; it is (28, 28, 32) for size=224.
latentSize = tuple(encoded.shape[1:])

# DECODER
# Three 2x up-samplings undo the three poolings, so the reconstruction has the
# same height and width as the input when `size` is a multiple of 8.
direct_input = Input(shape=latentSize)
x = Conv2D(192, (1, 1), activation='relu', padding='same')(direct_input)
x = UpSampling2D((2, 2))(x)
x = Conv2D(192, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(96, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(48, (3, 3), activation='relu', padding='same')(x)
x = Conv2D(192, (3, 3), activation='relu', padding='same')(x)

# Sigmoid keeps the reconstructed pixels in [0, 1].
decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)

# ASSEMBLE THE MODELS
# The decoder is a standalone model and is chained after the encoder output to
# form the end-to-end autoencoder.
encoder = Model(input_img, encoded)
decoder = Model(direct_input, decoded)
autoencoder = Model(input_img, decoder(encoded))


In [48]:
# Print the layer-by-layer summary of the assembled autoencoder.
autoencoder.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv2d_1 (Conv2D)           (None, 224, 224, 48)         1344      ['input_2[0][0]']             
                                                                                                  
 global_average_pooling2d (  (None, 48)                   0         ['conv2d_1[0][0]']            
 GlobalAveragePooling2D)                                                                          
                                                                                                  
 dense (Dense)               (None, 20)                   980       ['global_average_pooling

In [5]:
# Pixel-wise binary cross-entropy between the input image and its reconstruction.
autoencoder.compile(optimizer=tf.keras.optimizers.legacy.Adam(), loss='binary_crossentropy')

# Model.fit accepts generators directly; fit_generator is deprecated.
# This cell needs the model-definition cells above to have been run in the
# same kernel session.
history = autoencoder.fit(train_generator,
                          validation_data=valid_generator,
                          epochs=epoch, verbose=2)

NameError: name 'autoencoder' is not defined