# P6 : Classifiez automatiquement des biens de consommation

# Classification

In [1]:
import os
import warnings
from glob import glob
from os import listdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.applications.resnet import ResNet152
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Rescaling
from tensorflow.keras.layers import RandomFlip
from tensorflow.keras.layers import RandomRotation
from tensorflow.keras.layers import RandomZoom
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.utils import to_categorical

# Warnings
warnings.filterwarnings('ignore')


2023-05-08 16:50:20.296668: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


- ### Données de base

In [92]:
import numpy as np
import os
import cv2
from PIL import Image, ImageOps

# Directory containing the raw input images.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
input_path = '/Users/danongohou/Desktop/P6/Images/'

# Directory receiving the preprocessed images.
output_path = '/Users/danongohou/Desktop/P6/Processed_Img_Class/'

# Create the output directory if needed (no error when it already exists).
os.makedirs(output_path, exist_ok=True)

# The CLAHE operator does not depend on the image, so build it once.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(3,3))

# Process every image file found in the input directory.
for filename in os.listdir(input_path):
    # Only handle JPEG/PNG files; the check is case-insensitive so that
    # names such as 'photo.JPG' are not silently ignored.
    if not filename.lower().endswith(('.jpg', '.png')):
        continue

    # Load the image with OpenCV (BGR channel order).
    image = cv2.imread(os.path.join(input_path, filename))
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded.
        print(f'Skipping unreadable image: {filename}')
        continue

    # Exposure correction with PIL (expects RGB), then back to OpenCV's BGR.
    image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    image = ImageOps.autocontrast(image)
    image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # Convert to grayscale.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Global histogram equalization.
    image = cv2.equalizeHist(image)

    # Local contrast correction (CLAHE).
    image = clahe.apply(image)

    # Noise reduction.
    image = cv2.fastNlMeansDenoising(image, h=10)

    # Gaussian blur.
    image = cv2.GaussianBlur(image, (5, 5), 0)

    # Resize to 224x224.
    image = cv2.resize(image, (224, 224))

    # Save the result as '<original stem>.jpg' in the output directory.
    output_filename = os.path.splitext(filename)[0] + '.jpg'
    output_file_path = os.path.join(output_path, output_filename)
    if not cv2.imwrite(output_file_path, image):
        # cv2.imwrite reports failure through its return value.
        print(f'Could not write image: {output_file_path}')


In [14]:
# Folder holding the preprocessed images; prepended to every file name.
image_path = './Processed_Img_Class/'

# Load the product catalogue.
df = pd.read_csv('flipkart_com-ecommerce_sample_1050.csv')

# Full path of each product image.
df['image_path'] = df['image'].map(lambda name: image_path + name)

# Label = text before the first '>>' of the category tree, minus its first
# two characters, with surrounding whitespace removed.
df['label_name'] = df['product_category_tree'].map(
    lambda tree: tree.partition('>>')[0][2:].strip()
)

# Keep only the two columns needed for classification.
df = df.loc[:, ['image_path', 'label_name']]
df.head()

Unnamed: 0,image_path,label_name
0,./Processed_Img_Class/55b85ea15a1536d46b7190ad...,Home Furnishing
1,./Processed_Img_Class/7b72c92c2f6c40268628ec5f...,Baby Care
2,./Processed_Img_Class/64d5d4a258243731dc7bbb1e...,Baby Care
3,./Processed_Img_Class/d4684dcdc759dd9cdf415046...,Home Furnishing
4,./Processed_Img_Class/6325b6870c54cd47be6ebfbf...,Home Furnishing


In [18]:
# Persist the (image_path, label_name) table. index=False avoids writing the
# row index, which would otherwise come back as an 'Unnamed: 0' column
# every time the file is re-read.
df.to_csv('df.csv', index=False)

- ### Division du dataset en train et test

In [30]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Make sure the output folders exist (no error when they already do).
os.makedirs('train_data', exist_ok=True)
os.makedirs('test_data', exist_ok=True)

# Load the full labelled table.
data = pd.read_csv('df.csv')

# 80/20 split. A fixed random_state makes the split reproducible across
# kernel restarts, and stratifying on the label keeps the class proportions
# the same in both parts.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['label_name'],
)

# Save both parts as CSV files.
train_data.to_csv('train_data/train.csv', index=False)
test_data.to_csv('test_data/test.csv', index=False)

# Reload both files, dropping the 'Unnamed: 0' column if it is present.
train_df = pd.read_csv('train_data/train.csv', usecols=lambda col: col != 'Unnamed: 0')
test_df = pd.read_csv('test_data/test.csv', usecols=lambda col: col != 'Unnamed: 0')

# Report the shape of each part.
print('Dimensions du DataFrame train_df :', train_df.shape)
print('Dimensions du DataFrame test_df :', test_df.shape)


Dimensions du DataFrame train_df : (840, 2)
Dimensions du DataFrame test_df : (210, 2)


In [31]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,image_path,label_name
0,./Processed_Img_Class/3d8222014ec36292c1c143c5...,Kitchen & Dining
1,./Processed_Img_Class/46e6853da6b0c796b7a0d820...,Home Decor & Festive Needs
2,./Processed_Img_Class/f1484a63c0dc79e96c98b2a2...,Kitchen & Dining
3,./Processed_Img_Class/dbb55b8da8c30cb67014d814...,Beauty and Personal Care
4,./Processed_Img_Class/3ee79b1e51c9f9262e8373ce...,Beauty and Personal Care


In [32]:
# Preview the first rows of the test split.
test_df.head() 

Unnamed: 0,image_path,label_name
0,./Processed_Img_Class/2e3586dc60df258c5478446d...,Home Decor & Festive Needs
1,./Processed_Img_Class/b454f9f449c9dff58b90113b...,Baby Care
2,./Processed_Img_Class/7956d9586de3e25ff586bca5...,Kitchen & Dining
3,./Processed_Img_Class/2cbad7ead8eb8dd92823b9f5...,Watches
4,./Processed_Img_Class/e15195f2a5ebaa2168ccd653...,Watches


- ### Processing train_data

In [40]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Reload the training split, ignoring the 'Unnamed: 0' column if present.
train_df = pd.read_csv('train_data/train.csv', usecols=lambda col: col != 'Unnamed: 0')

# Target on one side, every other column on the other.
# NOTE(review): the only remaining feature column appears to be 'image_path',
# so the one-hot step below encodes file paths — confirm this is intended.
y = train_df['label_name']
X = train_df.drop(columns='label_name')

# Column groups by dtype.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns are standardized, categorical columns are one-hot encoded.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessor and transform the features in one step.
X_processed = preprocessor.fit_transform(X)

# Inspect the result.
print(X_processed)
print(y)



  (0, 179)	1.0
  (1, 214)	1.0
  (2, 783)	1.0
  (3, 714)	1.0
  (4, 183)	1.0
  (5, 516)	1.0
  (6, 73)	1.0
  (7, 646)	1.0
  (8, 0)	1.0
  (9, 75)	1.0
  (10, 711)	1.0
  (11, 423)	1.0
  (12, 386)	1.0
  (13, 297)	1.0
  (14, 458)	1.0
  (15, 789)	1.0
  (16, 801)	1.0
  (17, 164)	1.0
  (18, 459)	1.0
  (19, 127)	1.0
  (20, 436)	1.0
  (21, 762)	1.0
  (22, 682)	1.0
  (23, 775)	1.0
  (24, 549)	1.0
  :	:
  (815, 744)	1.0
  (816, 482)	1.0
  (817, 330)	1.0
  (818, 147)	1.0
  (819, 428)	1.0
  (820, 780)	1.0
  (821, 741)	1.0
  (822, 821)	1.0
  (823, 345)	1.0
  (824, 413)	1.0
  (825, 314)	1.0
  (826, 165)	1.0
  (827, 610)	1.0
  (828, 339)	1.0
  (829, 148)	1.0
  (830, 185)	1.0
  (831, 504)	1.0
  (832, 611)	1.0
  (833, 302)	1.0
  (834, 693)	1.0
  (835, 550)	1.0
  (836, 687)	1.0
  (837, 566)	1.0
  (838, 808)	1.0
  (839, 396)	1.0
0                Kitchen & Dining
1      Home Decor & Festive Needs
2                Kitchen & Dining
3        Beauty and Personal Care
4        Beauty and Personal Care
             

# Classification supervisée 

## Création du modèle pré-entraîné ResNet152

In [None]:
def create_model_fct(num_classes=7):
    """Build and compile a transfer-learning classifier on top of ResNet152.

    The ImageNet-pretrained ResNet152 (without its classification head) is
    frozen and followed by global average pooling, a 256-unit ReLU layer,
    dropout (0.5) and a softmax output layer.

    Args:
        num_classes: Number of units of the softmax output layer. It must equal
            the number of distinct labels fed to the model. Defaults to 7,
            presumably the number of distinct `label_name` values in the
            dataset — TODO confirm with `df['label_name'].nunique()`.

    Returns:
        The compiled Keras `Model` (categorical cross-entropy loss, RMSprop
        optimizer, accuracy metric), taking 224x224x3 inputs.
    """
    # Pretrained convolutional base, without the ImageNet classification head.
    base_model = ResNet152(include_top=False, weights="imagenet", input_shape=(224, 224, 3))

    # Freeze the whole base so that only the new head is trained.
    base_model.trainable = False

    # New classification head on top of the base output.
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    # Assemble and compile the full model.
    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(loss="categorical_crossentropy", optimizer='rmsprop', metrics=["accuracy"])

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    return model


## Entraînement et évaluation du modèle. 

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

# Load the train/test tables. The files are written as 'train.csv' and
# 'test.csv', and any leftover 'Unnamed: 0' index column is dropped as in the
# other cells that read them.
train_data = pd.read_csv('train_data/train.csv', usecols=lambda col: col != 'Unnamed: 0')
test_data = pd.read_csv('test_data/test.csv', usecols=lambda col: col != 'Unnamed: 0')

# Mini-batch size used by the image generators, and number of CV folds.
batch_size = 32
k = 5

### Création des générateurs de données
def data_flow_fct(data, datagen, data_type=None) :
    """Create a batch iterator over the images listed in a dataframe.

    Args:
        data: Dataframe with an 'image_path' column (image file paths) and a
            'label_name' column (class labels).
        datagen: Object exposing `flow_from_dataframe` (an `ImageDataGenerator`).
        data_type: Forwarded as `subset` ('training', 'validation' or None).

    Returns:
        Whatever `datagen.flow_from_dataframe` returns: an iterator yielding
        shuffled batches of `batch_size` images (module-level variable) resized
        to 224x224 with one-hot ('categorical') labels.
    """
    data_flow = datagen.flow_from_dataframe(
        data,
        directory='',
        x_col='image_path',
        # The label column is named 'label_name' in the dataframes built above.
        y_col='label_name',
        weight_col=None,
        # Must match the model's input_shape=(224, 224, 3).
        target_size=(224, 224),
        classes=None,
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=True,
        seed=42,
        subset=data_type,
    )
    return data_flow

# Training images are augmented. Validation and test images only receive the
# network's input preprocessing, so scores are measured on unaltered images.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    preprocessing_function=preprocess_input)
eval_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

### k-fold cross-validation
kf = KFold(n_splits=k, shuffle=True, random_state=42)
train_val_indices = list(kf.split(train_data))

val_accs = []
test_accs = []

for fold, (train_indices, val_indices) in enumerate(train_val_indices):
    print(f'Fold {fold+1}/{k}')

    train_data_fold = train_data.iloc[train_indices]
    val_data_fold = train_data.iloc[val_indices]

    # KFold already separates training rows from validation rows, so no
    # generator-level subset is requested: every row of each fold is used.
    train_flow = data_flow_fct(train_data_fold, datagen, data_type=None)
    val_flow = data_flow_fct(val_data_fold, eval_datagen, data_type=None)
    test_flow = data_flow_fct(test_data, eval_datagen, data_type=None)

    # Each flow infers its own label -> index mapping; if a class were missing
    # from one subset the one-hot targets would be misaligned, so fail early.
    if not (train_flow.class_indices == val_flow.class_indices == test_flow.class_indices):
        raise ValueError(
            f'Class mappings differ between train/validation/test in fold {fold+1}'
        )

    # Build a fresh model for this fold.
    with tf.device('/gpu:0'): 
        model = create_model_fct()

    # Callbacks: keep the best model (lowest val_loss) and stop early when
    # val_loss has not improved for 5 epochs.
    model_save_path = f"./model_fold{fold+1}_best_weights.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]

    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

    with tf.device('/gpu:0'): 
        # The batch size is defined by the generators, so it is not passed to fit().
        history = model.fit(train_flow,
                            validation_data=val_flow,
                            epochs=50, 
                            callbacks=callbacks_list, 
                            verbose=1)

    # Keep the training curves (overwritten at each fold; after the loop these
    # hold the curves of the last fold).
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    ### Model evaluation
    # Scores with the weights of the last epoch.
    loss, accuracy = model.evaluate(train_flow, verbose=True)
    print("Training Accuracy   : {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(val_flow, verbose=True)
    print("Validation Accuracy :  {:.4f}".format(accuracy))

    # Scores with the weights of the best epoch (reloaded from the checkpoint).
    model.load_weights(model_save_path)
    loss, val_accuracy = model.evaluate(val_flow, verbose=False)
    print("Validation Accuracy :  {:.4f}".format(val_accuracy))
    loss, test_accuracy = model.evaluate(test_flow, verbose=False)
    print("Test Accuracy       :  {:.4f}".format(test_accuracy))

    val_accs.append(val_accuracy)
    test_accs.append(test_accuracy)

print(f'Average validation accuracy over {k} folds: {np.mean(val_accs)}')
print(f'Average test accuracy over {k} folds: {np.mean(test_accs)}')

# Plot the loss and accuracy curves held in train_loss / val_loss /
# train_acc / val_acc (one figure per quantity).
import matplotlib.pyplot as plt

curve_specs = (
    (train_loss, val_loss, 'Model loss', 'Loss'),
    (train_acc, val_acc, 'Model accuracy', 'Accuracy'),
)

for train_curve, val_curve, figure_title, y_label in curve_specs:
    # Training curve first, validation curve second, to match the legend order.
    plt.plot(train_curve)
    plt.plot(val_curve)
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()