In [1]:
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate
from pathlib import Path
from glob2 import glob
from sklearn.metrics import confusion_matrix
import torch

import pandas as pd
import numpy as np
import os
import zipfile as zf
import shutil
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
import boto3
import io
from tqdm import tqdm

## 1. Extract data

In [3]:
def list_keys(bucket_name,image_folder_key):
    '''

    input : bucket name & folder in which we want to list keys

    output : a list of keys without the name of the folder

    '''

    keys=[]

    s3=boto3.resource('s3')
    bucket=s3.Bucket(bucket_name)
    for i in bucket.objects.filter(Prefix=image_folder_key):
        keys.append(i.key)
    keys.pop(0) # remove the first element which is the folder name

    return keys

In [2]:

def binary_search(arr, target):
    low = 0
    high = len(arr) - 1

    while low <= high:
        mid = (low + high) // 2
        mid_value = arr[mid]

        if mid_value == target:
            return mid  # File found at index mid
        elif mid_value < target:
            low = mid + 1
        else:
            high = mid - 1

    return -1  # File not found


In [48]:
def get_category_and_subcategory(filenames,key):
    
    filename = key.split('/')[-1]

    # we are going to do a binary search to find the filename in the excel file

    index = binary_search(filenames, filename)
    
    i = index

    row = excel_data[index:index+1]
    category = row['WASTE_TYPE'][i]
    sub_category = row['WASTE_SUB_TYPE'][i]

    return category,sub_category

In [4]:
s3 = boto3.client('s3')
bucket_name = 'trashback-data'
excel_file_key = 'waste_pics.xlsx'
image_folder_key = 'trashback-images/'

def sort_images(bucket_name,image_folder_key,target_folder_name,excel_file_key):
    keys = list_keys(bucket_name,image_folder_key)

    s3 = boto3.client('s3')


    excel_obj = s3.get_object(Bucket=bucket_name, Key=excel_file_key)
    excel_data = pd.read_excel(io.BytesIO(excel_obj['Body'].read()))

    # Get the total number of images to process
    total_images = len(keys)

    print(f"Sorting {total_images} images...")

    filenames = excel_data['PIC_NAME']
    filenames.dropna(inplace=True)
    
    # Initialize tqdm progress bar
    progress_bar = tqdm(total=total_images, desc='Progress', unit='images')

    # Iterate through each row in the Excel file
    for key in keys:

        filename = key.split('/')[-1]

        # we are going to do a binary search to find the filename in the excel file

        index = binary_search(filenames, filename)
        
        i = index

        row = excel_data[index:index+1]
        category = row['WASTE_TYPE'][i]
        
        sub_category = row['WASTE_SUB_TYPE'][i]
        image_filename = row['PIC_NAME'][i] 
        image_key = image_folder_key + image_filename
        
        # Copy image from S3 to a new location with appropriate folder structure
        new_folder_key = f'sorted_images/{category}/{sub_category}/'
        new_image_key = f'{new_folder_key}{image_filename}'

        #we check if the key exists in the bucket

        '''try:
            s3.head_object(Bucket=bucket_name, Key=image_key)
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '404':
                print(f"The image {image_key} does not exist in the bucket. Skipping...")
                progress_bar.update(1)
                continue'''

        '''# Create "category" folder if it does not exist
        try:
            s3.head_object(Bucket=bucket_name, Key=f'sorted_images/{category}/')
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '404':
                s3.put_object(Bucket=bucket_name, Key=f'sorted_images/{category}/', Body='')

        # Create "sub-category" folder if it does not exist

        try:
            s3.head_object(Bucket=bucket_name, Key=new_folder_key)
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '404':
                s3.put_object(Bucket=bucket_name, Key=new_folder_key, Body='')'''    

        # Copy image to new location
        s3.copy_object(
            Bucket=bucket_name,
            Key=new_image_key,
            CopySource={'Bucket': bucket_name, 'Key': image_key}
        )
        # Delete original image
        s3.delete_object(Bucket=bucket_name, Key=image_key)
            
        # Update progress bar
        progress_bar.update(1)

    # Close the progress bar
    progress_bar.close()

    print("Images sorted successfully.")

In [21]:
s3 = boto3.client('s3')
bucket_name = 'trashback-data'
excel_file_key = 'waste_pics.xlsx'
image_folder_key = 'trashback-images/'
target_folder_name = 'sorted_images'

sort_images(bucket_name,image_folder_key,target_folder_name,excel_file_key)

Progress: 100%|██████████| 18079/18079 [2:11:50<00:00,  2.29images/s]


Images sorted successfully.


In [None]:
s3 = boto3.client('s3')
bucket_name = 'trashback-data'
excel_file_key = 'waste_pics.xlsx'
image_folder_key = 'trashback-images/'
target_folder_name = 'sorted_images_test/'

sort_images(bucket_name,image_folder_key,target_folder_name,excel_file_key)

In [27]:
bucket_name = 'trashback-data'
image_folder_key = 'sorted_images/'
folders = list_keys(bucket_name,image_folder_key)

['sorted_images/Matières Organiques/Autre déchet organique/', 'sorted_images/Matières Organiques/Autre déchet organique/1689781579607-image1689781534306.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1689954925080-image1689954931709.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1689957730533-user.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690105842615-image1690105846455.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690359570197-user.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690460137723-image1690460053578.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690467250987-image1690467267983.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690523275960-image1690523276792.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690652663129-image1690652626304.jpg', 'sorted_images/Matières Organiques/Autre déchet organique/1690707638091-image1690707593308.jp

In [30]:
keys = []
for folder in folders:
    keys.append(folder.split('/')[-1])

In [31]:
print(keys)

['', '1689781579607-image1689781534306.jpg', '1689954925080-image1689954931709.jpg', '1689957730533-user.jpg', '1690105842615-image1690105846455.jpg', '1690359570197-user.jpg', '1690460137723-image1690460053578.jpg', '1690467250987-image1690467267983.jpg', '1690523275960-image1690523276792.jpg', '1690652663129-image1690652626304.jpg', '1690707638091-image1690707593308.jpg', '1690732363506-image1690732380849.jpg', '1690782603573-image1690782607299.jpg', '1690791311352-image1690791310147.jpg', '1691038108327-image1691038111610.jpg', '1691039644954-image1691039614755.jpg', '1691214608387-image1691214587682.jpg', '1691475668373-user.jpg', '1691738727951-image1691738590081.jpg', '1691817927998-image1691817954030.jpg', '1691818956418-image1691818948496.jpg', '1692024140640-image1692024143353.jpg', '1692109032872-image1692109055775.jpg', '1692253736676-image1692204176793.jpg', '1692457563212-image1692441208525.jpg', '1692560626143-image1692560645458.jpg', '1692561636644-image1692561665524.jpg

In [32]:
path = '/Users/macbook/Desktop/Trashback/'
local_keys = os.listdir(path)



['1704370104873-image1704370141963.jpg', '1702474982574-image1702474977945.jpg', '1702994359063-image1702994384503.jpg', '1692430441347-image1692430446975.jpg', '1698764621479-image1698764638986.jpg', '1701604172415-user.jpg', '1694419991101-image1694419945973.jpg', '1691866020295-user.jpg', '1704090776945-user.jpg', '1701507920403-image1701507924125.jpg', '1694241870019-image1694241775795.jpg', '1704131690365-image1704131644673.jpg', '1689967037353-user.jpg', '1692686901405-user.jpg', '1699180964304-image1699180964373.jpg', '1702648373501-image1702648361768.jpg', '1692699437378-image1692699476236.jpg', '1694356633727-image1694356599724.jpg', '1693068998981-image1693069001114.jpg', '1700210940700-image1700210929330.jpg', '1702742490345-image1702742511367.jpg', '1704116302419-image1704116334238.jpg', '1692431725936-image1692431741220.jpg', '1700220048003-image1700220056305.jpg', '1691276855993-user.jpg', '1701962892807-image1701962911617.jpg', '1704556734099-user.jpg', '1704389591654-us

In [37]:
key_not_in_cloud = []
for key in local_keys:
    if key not in keys:
        key_not_in_cloud.append(key)

    

In [43]:
keys_to_upload = key_not_in_cloud[:6544]

In [45]:
keys_to_upload[:5]

['1704370104873-image1704370141963.jpg',
 '1702474982574-image1702474977945.jpg',
 '1702994359063-image1702994384503.jpg',
 '1698764621479-image1698764638986.jpg',
 '1701604172415-user.jpg']

In [49]:
filenames = excel_data['PIC_NAME']
filenames.dropna(inplace=True)
filename = keys_to_upload[0]

index = binary_search(filenames, filename)
        
i = index

row = excel_data[index:index+1]
category = row['WASTE_TYPE'][i]

sub_category = row['WASTE_SUB_TYPE'][i]
image_filename = row['PIC_NAME'][i] 
image_key = image_folder_key + image_filename

# Copy image from S3 to a new location with appropriate folder structure
new_folder_key = f'sorted_images/{category}/{sub_category}/'
new_image_key = f'{new_folder_key}{image_filename}'

In [50]:
print(new_image_key)


sorted_images/Plastique/Emballage plastique/1704370104873-image1704370141963.jpg


In [129]:
# we are going to create a dictionary with the keys as the waste types and the values as a list of the filenames in each waste type

waste_types = excel_data['WASTE_TYPE'].unique()
waste_sub_types = excel_data['WASTE_SUB_TYPE'].unique()

progress_bar = tqdm(total=total_images, desc='Progress', unit='lines')

waste_type_dict = {}
for waste_type in waste_types:
    waste_type_dict[waste_type] = []

if not is_sorted(filenames):

    return 'the filenames are not sorted'

for key in keys:

    filename = key.split('/')[-1]

    # we are going to do a binary search to find the filename in the excel file (we suppose it is sorted)
    if is_sorted(filenames)

    index = binary_search(filenames, filename)
    
    i = index

    row = excel_data[index:index+1]
    waste_type = row['WASTE_TYPE'][i]
    waste_type_dict[waste_type].append(key.split('/')[-1])
    progress_bar.update(1)

progress_bar.close()

# we are going to do the same thing qith the subcategories :

progress_bar = tqdm(total=total_images, desc='Progress', unit='lines')

waste_sub_type_dict = {}
for waste_sub_type in waste_sub_types:
    waste_sub_type_dict[waste_sub_type] = []

for key in keys:
    
        filename = key.split('/')[-1]
    
        # we are going to do a binary search to find the filename in the excel file
    
        index = binary_search(filenames, filename)
        
        row = excel_data[index:index+1]

        i = index
        waste_sub_type = row['WASTE_SUB_TYPE'][i]
        waste_sub_type_dict[waste_sub_type].append(key.split('/')[-1])
        progress_bar.update(1)

progress_bar.close()


Progress:   0%|          | 0/26143 [00:13<?, ?lines/s]
Progress: 100%|██████████| 26143/26143 [00:10<00:00, 2486.25lines/s]
Progress: 100%|██████████| 26143/26143 [00:07<00:00, 3551.13lines/s]


In [35]:
# Test binary search

target_filename = '1689869880436-user.jpg'
index = binary_search(filenames, target_filename)

if index != -1:
    # File found
    print(f"Filename '{target_filename}' found at index {index}.")
    # Retrieve the corresponding row if needed
    corresponding_row = excel_data.iloc[index]
    print("Corresponding row:", corresponding_row)
else:
    print(f"Filename '{target_filename}' not found.")

Filename '1689869880436-user.jpg' found at index 1000.
Corresponding row: CREATED                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                2023-07-20 14:18:00
PIC_NAME                                                                                                                                                                                                                                                                                                                                                                           

In [34]:
def is_sorted(arr):
    arr.dropna(inplace=True)
    for i in range(1, len(arr)):
        if arr[i] < arr[i-1]:
            return False
    return True

filenames = excel_data['PIC_NAME']

if is_sorted(filenames):
    print("Filenames are sorted.")
else:
    print("Filenames are not sorted.")

Filenames are sorted.


In [23]:
def create_folder_names_from_excel(excel_data):

    '''

    input : excel_data with 'WASTE_TYPE' and 'WASTE_SUB_TYPE' in the keys

    output : folder_names 

    '''

    folder_names = []
    excel_data.dropna()

    for i in range(45080):
        category = excel_data['WASTE_TYPE'][i]
        sub_cat = excel_data['WASTE_SUB_TYPE'][i]

        folder_name = str(category) + '/' + str(sub_cat) + '/'
        if folder_name not in folder_names:
            folder_names.append(folder_name)

    return folder_names

#untested

Plastique/Bouteille en plastique/
Mégots/Mégots/
Papier / Carton/Récipient liquide/
Mégots/Mégots/
Plastique/Polystyrène/
Plastique/Autre déchet plastique/
Plastique/Autre déchet plastique/
Plastique/Sac en plastique/
Plastique/Bouteille en plastique/
Papier / Carton/Autre déchet papier/carton/
Papier / Carton/Carton/
Textile/Masque/
Papier / Carton/Autre déchet papier/carton/
Mégots/Mégots/
Plastique/Autre déchet plastique/
Plastique/Autre déchet plastique/
Plastique/Emballage plastique/
Métaux/Canette/
Plastique/Polystyrène/
Plastique/Emballage plastique/
Plastique/Emballage plastique/
Verre/Autre déchet verre/
Plastique/Gobelet en plastique/
Plastique/Autre déchet plastique/
Métaux/Autre déchet métaux/
Papier / Carton/Autre déchet papier/carton/
Métaux/Canette/
Mégots/Mégots/
Plastique/Autre déchet plastique/
Plastique/Bouteille en plastique/
Papier / Carton/Carton/
Papier / Carton/Autre déchet papier/carton/
Plastique/Sac en plastique/
Plastique/Emballage plastique/
Métaux/Canette/

In [29]:
for string in folder_names:
    category,sub_category = string.split('/')[0],string.split('/')[1]

# Create "category" folder if it does not exist

try:
    s3.head_object(Bucket=bucket_name, Key=f'sorted_images/{category}/')
except s3.exceptions.ClientError as e:
    if e.response['Error']['Code'] == '404':
        s3.put_object(Bucket=bucket_name, Key=f'sorted_images/{category}/', Body='')

# Create "sub-category" folder if it does not exist

try:
    s3.head_object(Bucket=bucket_name, Key=f'sorted_images/{category}/{sub_category}')
except s3.exceptions.ClientError as e:
    if e.response['Error']['Code'] == '404':
        s3.put_object(Bucket=bucket_name, Key=f'sorted_images/{category}/{sub_category}', Body='')


In [None]:
def create_folders_if_needed(folder_names):
    '''

    input : a list of folder names, with category & subcategory : 'Plastique/Autre déchet plastique/

    output : nothing but created the folders if not in the s3 bucket (within sorted images)

    ''' 

    for string in folder_names:
        category,sub_category = string.split('/')[0],string.split('/')[1]

    try:
        s3.head_object(Bucket=bucket_name, Key=f'sorted_images/{category}/')
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            s3.put_object(Bucket=bucket_name, Key=f'sorted_images/{category}/', Body='')

    # Create "sub-category" folder if it does not exist

    try:
        s3.head_object(Bucket=bucket_name, Key=f'sorted_images/{category}/{sub_category}')
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            s3.put_object(Bucket=bucket_name, Key=f'sorted_images/{category}/{sub_category}', Body='')

#untested
create_folders_if_needed(folder_names)

In [28]:
for string in folder_names:
    category,sub_category = string.split('/')[0],string.split('/')[1]
    print(category)

Plastique
Mégots
Papier 
Plastique
Plastique
Plastique
Papier 
Papier 
Textile
Plastique
Métaux
Verre
Plastique
Métaux
Plastique
Textile
Métaux
Papier 
Verre
Textile
Métaux
Textile
Papier 
Papier 
Plastique
Matières Organiques
Papier 
Verre
Matières Organiques
Textile
Verre
Plastique
Métaux
Textile
Plastique
Métaux
Mégots
Mégots
Mégots
Mégots
Mégots
Verre
Mégots
Verre
Verre
Métaux
Mégots
Mégots
Matières Organiques
Plastique
Métaux
