# Title: Classification d'images de légumes par apprentissage automatique (RF, KNN)
## Author: Ibrahim Bougarroua

## Source de données

  Nous avons utilisé un ensemble de données qui a été préalablement défini pour classer les images de légumes dans notre projet. Si vous utilisez cet ensemble de données, veuillez citer la source : https://www.researchgate.net/publication/352846889_DCNN-Based_Vegetable_Image_Classification_Using_Transfer_Learning_A_Comparative_Study.

  L'expérience initiale a été menée avec 15 types de légumes courants que l'on trouve partout dans le monde, tels que des haricots, de la courge amère, des aubergines, des brocolis, du chou, des poivrons, des carottes, du chou-fleur, des concombres, des pommes de terre, des citrouilles, des radis et des tomates. Au total, 21 000 images réparties en 15 classes ont été utilisées, chaque classe contenant 1400 images de taille 224×224 au format *.jpg. L'ensemble de données a été divisé en environ 70 % pour l'entraînement, 15 % pour la validation et 15 % pour les tests.

  L'ensemble de données contient trois dossiers : entraînement (15 000 images), test (3000 images) et validation (3000 images), chacun contenant des sous-dossiers pour différents légumes où les images correspondantes sont présentes.

  Pour télécharger cet ensemble de données, vous pouvez cliquer sur le lien suivant : https://www.kaggle.com/datasets/misrakahmed/vegetable-image-dataset/download?datasetVersionNumber=1. 

  Les images de cet ensemble de données ont été collectées par les auteurs du projet à partir de fermes et de marchés de légumes.

  Nous n'avons pas utilisé le dossier de validation de l'ensemble de données pour notre projet de classification d'images de légumes, car nous avons utilisé l'apprentissage automatique (machine learning) classique plutôt que l'apprentissage en profondeur (deep learning). En général, un jeu de validation sert à évaluer le modèle pendant sa mise au point (choix des hyperparamètres, arrêt de l'entraînement) sur des données qu'il n'a pas encore vues, ce qui est systématique en apprentissage en profondeur. Dans notre cas, les modèles (Random Forest et KNN) sont entraînés une seule fois avec des hyperparamètres fixes, sans réglage : une validation séparée n'est donc pas nécessaire et les performances sont mesurées directement sur le dossier de test.

In [None]:
import os ,pathlib
import random
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import shutil
import numpy as np
import os
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from joblib import dump, load


In [None]:
def copy_random_images(input_dir, output_dir, pct=0.2):
    """
    Copy a reproducible random sample of ``.jpg`` files from each subdirectory
    of the input directory to the matching subdirectory of the output directory.

    Args:
        input_dir: Directory whose immediate subdirectories hold the images.
        output_dir: Destination root; one subdirectory per input subdirectory
            is created if missing.
        pct: Fraction (0 to 1) of each subdirectory's images to copy
            (default: 0.2, i.e. 20%).

    Raises:
        ValueError: If ``pct`` is outside the range [0, 1].
    """
    if not 0 <= pct <= 1:
        raise ValueError("pct must be between 0 and 1, got " + repr(pct))
    for subdir in sorted(os.listdir(input_dir)):
        subdir_path = os.path.join(input_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        output_subdir_path = os.path.join(output_dir, subdir)
        os.makedirs(output_subdir_path, exist_ok=True)
        # os.listdir order is arbitrary; sort so the seeded sample is the same on every machine.
        img_files = sorted(f for f in os.listdir(subdir_path) if f.endswith(".jpg"))
        num_images_to_copy = int(len(img_files) * pct)
        # Dedicated generator: reproducible sampling without reseeding the global `random` state.
        rng = random.Random(42)
        for img_file in rng.sample(img_files, num_images_to_copy):
            shutil.copy(
                os.path.join(subdir_path, img_file),
                os.path.join(output_subdir_path, img_file),
            )

In [None]:
# Original dataset folders, used as the source for sampling.
train_path_or = "Vegetable_Images/train"
test_path_or = "Vegetable_Images/test"

# Working folders that receive the sampled subset used by the rest of the notebook.
train_path = "veggie_data/train/"
test_path = "veggie_data/test/"

Seulement 30% du dataset a été utilisé dans cet exemple.

In [None]:
# Copy 30% of every class folder from the original train/test sets into the working folders.
copy_random_images(train_path_or,train_path,0.3)
copy_random_images(test_path_or,test_path,0.3)


In [None]:
def count_files(rootdir):
    """Print how many regular files each immediate subdirectory of *rootdir* contains.

    Entries of *rootdir* that are not directories are ignored, and only files
    directly inside each subdirectory are counted (no recursion).
    """
    for entry in pathlib.Path(rootdir).iterdir():
        if not entry.is_dir():
            continue
        n_files = sum(
            1 for name in os.listdir(entry) if os.path.isfile(os.path.join(entry, name))
        )
        print("There are " + str(n_files) + " files in " + str(entry.name))

In [None]:
# Report the number of files per class folder, first for the test subset, then for the train subset.
print("Les fichiers de Test \n")
count_files(test_path)
print("Les fichiers de Train \n" )
count_files(train_path)

In [None]:
def get_class(rootdir):
    """Return the names of the immediate subdirectories of *rootdir*, sorted.

    Each subdirectory is treated as one class. The list is sorted because
    directory iteration order is arbitrary; sorting makes the class order
    deterministic across runs and machines.

    Args:
        rootdir: Directory containing one subdirectory per class.

    Returns:
        A list of subdirectory names (str) in ascending order.
    """
    return sorted(
        entry.name for entry in pathlib.Path(rootdir).iterdir() if entry.is_dir()
    )

In [None]:
# Show the class names found under the training folder.
print(get_class(train_path))

In [None]:
def show_rd_imgs(dir_path,list_cat):
    """Display one randomly chosen image for each category.

    Args:
        dir_path: Directory containing one subdirectory per category.
        list_cat: Iterable of category (subdirectory) names to display.

    A category whose folder contains no files is reported and skipped
    instead of aborting the whole loop.
    """
    for category in list_cat:
        category_dir = os.path.join(dir_path, category)
        candidates = [
            f for f in os.listdir(category_dir)
            if os.path.isfile(os.path.join(category_dir, f))
        ]
        if not candidates:
            # random.choice raises IndexError on an empty sequence.
            print("No files found in " + category_dir + ", skipping")
            continue
        chosen = random.choice(candidates)
        # The context manager closes the file handle; np.array() forces the
        # pixel data to be read before the file is closed.
        with Image.open(os.path.join(category_dir, chosen)) as img:
            img_array = np.array(img)
        plt.imshow(img_array)
        plt.title(category)
        plt.show()

In [None]:
# Display one random training image per class.
show_rd_imgs(train_path,get_class(train_path))

In [None]:
# Define the list of classes (one per subdirectory of the training folder)
classes = get_class(train_path)

# Target size every image is resized to before being flattened into a feature vector.
# NOTE(review): the dataset description above states 224x224 images, so 256x256
# upsamples them and enlarges the feature vectors - confirm this is intended.
img_size = (256, 256)


In [None]:
# Load the training images and labels
X_train = []
y_train = []
for class_name in classes:
    class_dir = os.path.join(train_path, class_name)
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None (it does not raise) when the file is
            # missing or cannot be decoded; cv2.resize would then fail.
            print("Skipping unreadable image: " + img_path)
            continue
        X_train.append(cv2.resize(img, img_size))
        y_train.append(class_name)


In [None]:
# Convert the training data to NumPy arrays (one stacked image array, one label array)
X_train = np.array(X_train)
y_train = np.array(y_train)


In [None]:
# Number of training samples that were loaded.
print(len(y_train))

In [None]:
from sklearn import preprocessing

# Fit the label encoder on the training class names and encode them as integers in one step.
le = preprocessing.LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

In [None]:
# From here on y_train holds the integer-encoded labels.
y_train = y_train_encoded

In [None]:
# Convert pixels to float32 and scale them by 1/255.
X_train = X_train.astype('float32') / 255.0

In [None]:
# Flatten each image into a single feature vector: one row per sample.
# Note the lowercase name: x_train is the flattened matrix, X_train keeps the image shape.
nsamples, nx, ny, nrgb = X_train.shape
x_train = X_train.reshape((nsamples,nx*ny*nrgb))

In [None]:
# Load the testing images and labels
X_test = []
y_test = []
for class_name in classes:
    class_dir = os.path.join(test_path, class_name)
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None (it does not raise) when the file is
            # missing or cannot be decoded; cv2.resize would then fail.
            print("Skipping unreadable image: " + img_path)
            continue
        X_test.append(cv2.resize(img, img_size))
        y_test.append(class_name)


In [None]:
# Convert the testing data to NumPy arrays (one stacked image array, one label array)
X_test = np.array(X_test)
y_test = np.array(y_test)


In [None]:
# Same preprocessing as the training images: float32 pixels scaled by 1/255.
X_test= X_test.astype('float32') /255.0

In [None]:
# Encode the test class names with the encoder fitted on the training labels.
y_test_encoded= le.transform(y_test)

In [None]:
# From here on y_test holds the integer-encoded labels.
y_test=y_test_encoded

In [None]:
# Flatten each test image into a single feature vector: one row per sample.
nsamples, nx, ny, nrgb = X_test.shape
x_test = X_test.reshape((nsamples,nx*ny*nrgb))

### Random Forest


In [None]:
# No separate feature-extraction step is applied: the classifier is trained
# directly on the flattened pixel vectors.

# Train a Random Forest classifier (100 trees, fixed seed) on the training data
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(x_train, y_train)

In [None]:
# Mean accuracy of the Random Forest on the test set, printed as a percentage.
score = rfc.score(x_test, y_test)
print("Accuracy: {:.2f}%".format(score * 100))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, labels=None, title='Confusion Matrix'):
    """Plot the confusion matrix of *y_pred* against *y_true* as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length and label type as ``y_true``.
        labels: Labels defining the row/column order and the tick labels.
            When None, the sorted unique labels present in ``y_true`` and
            ``y_pred`` are used, so the axes always show real label names.
        title: Title drawn above the heatmap.
    """
    if labels is None:
        # Local import: only this fallback needs it.
        from sklearn.utils.multiclass import unique_labels
        # Same label set confusion_matrix derives itself when labels is None,
        # made explicit so it can also be passed to the heatmap as tick labels.
        labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()


In [None]:
# Predict the classes for the training data
y_pred1 = rfc.predict(x_train)


In [None]:
# Notebook display of the number of training labels.
len(y_train)

In [None]:
# Decode the training labels and the RF training predictions back to class names.
# y_train itself stays integer-encoded; the decoded copy is y_trainX.
y_trainX = le.inverse_transform(y_train)
y_pred1 = le.inverse_transform(y_pred1)

In [None]:
# Confusion matrix of the Random Forest on the training data.
plot_confusion_matrix(y_trainX, y_pred1, classes)

In [None]:
# Predict the classes for the test data
y_pred = rfc.predict(x_test)



In [None]:
# Decode the test labels and the RF test predictions back to class names.
# NOTE: after this cell y_test holds class-name strings, not integer codes.
y_test = le.inverse_transform(y_test)
y_pred = le.inverse_transform(y_pred)

In [None]:
# Confusion matrix of the Random Forest on the test data.
plot_confusion_matrix(y_test, y_pred, classes)

### KNN

In [None]:
# k-nearest-neighbours classifier using the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train the classifier on the training data (y_train is integer-encoded at this point)
knn.fit(x_train, y_train)

In [None]:
# Predict integer-encoded classes for the test features.
y_pred = knn.predict(x_test)

In [None]:
# Calculate the accuracy of the predictions.
# Compare against y_test_encoded: the KNN predictions are integer codes, whereas
# y_test was overwritten with class-name strings earlier in the notebook.
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
# Predict the classes for the training data (the input here is x_train, not the test set)
y_pred2 = knn.predict(x_train)

In [None]:
# Drop the encoded training labels; they are rebuilt from disk in the next cells.
del y_train

In [None]:
# Load the training images and labels
X_train = []
y_train = []
for class_name in classes:
    class_dir = os.path.join(train_path, class_name)
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None (it does not raise) when the file is
            # missing or cannot be decoded; cv2.resize would then fail.
            print("Skipping unreadable image: " + img_path)
            continue
        X_train.append(cv2.resize(img, img_size))
        y_train.append(class_name)


In [None]:
from sklearn import preprocessing
# Re-create the label encoder from the reloaded training class names and encode them.
le= preprocessing.LabelEncoder()
le.fit(y_train)
y_train_encoded= le.transform(y_train)

In [None]:
# Convert the reloaded training data to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)


In [None]:
# Replace the class-name labels with their integer codes.
y_train = y_train_encoded

In [None]:
# Decode the integer codes back to class names; y_train now holds strings again.
y_train = le.inverse_transform(y_train)


In [None]:
# Confusion matrix of KNN on the training data.
# y_train holds class names while y_pred2 holds integer codes, so decode the
# predictions first; otherwise no prediction matches any label.
plot_confusion_matrix(y_train, le.inverse_transform(y_pred2), classes)

In [None]:
# Drop the current test labels; they are rebuilt in the next cells.
del y_test

In [None]:
# Load the testing images and labels
X_test = []
y_test = []
for class_name in classes:
    class_dir = os.path.join(test_path, class_name)
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None (it does not raise) when the file is
            # missing or cannot be decoded; cv2.resize would then fail.
            print("Skipping unreadable image: " + img_path)
            continue
        X_test.append(cv2.resize(img, img_size))
        y_test.append(class_name)

In [None]:
# Convert the reloaded testing data to NumPy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

In [None]:
y_test=y_train_encoded

In [None]:
# Prédire les classes pour les données de test
y_test = le.inverse_transform(y_test)


In [None]:
y_pred2 = knn.predict(x_train)

In [None]:
plot_confusion_matrix(y_test, y_pred2, classes)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred2, target_names=classes))


In [None]:
def predictShow(path,t):
    """Display the image stored at *path*, titled with the first element of *t*.

    Args:
        path: Path of the image file to display.
        t: Indexable whose first element is shown (via ``str``) as the title.
    """
    bgr_pixels = cv2.imread(path)
    # Convert from BGR to RGB channel order before handing the array to matplotlib.
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_pixels)
    caption = str(t[0])
    plt.title(caption)
    plt.show()

In [None]:
# Classify a single image with the trained KNN model.
i = cv2.imread('test2.jpg')
if i is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image 'test2.jpg'")
img = cv2.resize(i, img_size)
# Same preprocessing as the training data: float32 pixels scaled by 1/255.
X = np.array(img).astype('float32') / 255.0
nx, ny, nrgb = X.shape
# A single row: the classifier expects one flattened feature vector per sample.
x = X.reshape((1,nx*ny*nrgb))
y_pred = knn.predict(x)
y_decoded = le.inverse_transform(y_pred)


In [None]:
# Show the image that was actually classified ('test2.jpg') with its predicted class as title.
predictShow("test2.jpg",y_decoded)

In [None]:
from joblib import dump, load
# Save the trained Random Forest to disk
dump(rfc, 'rfc_model.joblib')
# Load the model from disk
# NOTE(review): joblib.load unpickles the file - only load model files from a trusted source.
rfc = load('rfc_model.joblib')