# Crop Plant Image Classification
In this example, we use the AutoImageClassification library to create a model that classifies images of crop plants.

### Importing necessary modules

In [7]:
# AutoImageClassification library modules
import AutoImageClassification.Bing_Image_Download as BID 
import AutoImageClassification.ImageEmbedder as IE 
import AutoImageClassification.Anomaly as AN 
import AutoImageClassification.ImageClassification as IC

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Constant values
Names of folders and files for storing models and datasets.

In [8]:
# --- Datasets ----------------------------------------------------------------
# Path to the main folder of the labeled dataset.
Labeled_folder = "Agricultural-crops"
# Path where the scraped images are stored.
Downloaded_Images_folder = 'downloaded_images'
# Maximum number of scraped images per category.
Max_num_downloads = 150

# --- Embedding features ------------------------------------------------------
# Path under which all image embedding features are stored.
Emb_path = 'Embeddings features'
# Subfolder for the embedding features of the downloaded images.
Emb_downloaded = 'downloaded'
# Subfolder for the embedding features of the labeled images.
Emb_labeled = 'labeled'
# Subfolder for the embedding features of the downloaded images that remain
# after the irrelevant images have been filtered out.
Emb_downloaded_clean = 'downloaded_clean'
# Subfolder for the embedding features of the final dataset (train and test).
Emb_final_dataset = 'final_dataset'

# --- Models ------------------------------------------------------------------
# Path under which all trained models are stored.
Model_path = 'Models'
# Filename of the classification model used for pseudo-labeling.
Pseudo_Labeling_Model_Name = 'Pseudo_Labeling_Model.pth'
# Filename of the final classification model.
Final_Model_Name = 'ImageClassification_Model.pth'

## Image Scraping

In [None]:
# Every sub-directory of the labeled dataset folder is treated as one category.
# os.listdir returns names in arbitrary order, so the list is sorted to keep
# the category (and keyword) order identical from one run to the next.
Categories = sorted(
    entry for entry in os.listdir(Labeled_folder)
    if os.path.isdir(os.path.join(Labeled_folder, entry))
)
# One search keyword per category, in the same order as Categories.
Keywords = [f'{category} plant' for category in Categories]

# Class to download Bing images
crops_image = BID.Bing_Image_Download(Max_num_downloads, Downloaded_Images_folder, Categories, Keywords)
crops_image.Downloading()  # Start downloading

## Embedding 
Extract embedding features of the downloaded and labeled images using a pretrained EfficientNetB7 model and save them into .npz files, one file per category.

In [None]:
# Destination folder for the embedding features of the downloaded images.
Emb_path_downloaded = os.path.join(Emb_path, Emb_downloaded)

# Embedder instance (batch size 4); it is reused for the labeled images as well.
ImageEmbedder = IE.ImageEmbedder(batch_size=4)
ImageEmbedder.Embedding(
    Downloaded_Images_folder,
    Emb_path_downloaded,
)

In [None]:
# Destination folder for the embedding features of the labeled images.
Emb_path_labled = os.path.join(Emb_path, Emb_labeled)
# augment=True: perform image augmentation before creating the embedding features.
ImageEmbedder.Embedding(
    Labeled_folder,
    Emb_path_labled,
    augment=True,
)

In [9]:
# Load the stored embedding features into pandas DataFrames.
# The two paths are rebuilt here, so this cell does not rely on the embedding
# cells having been run in the current session.
Emb_path_downloaded = os.path.join(Emb_path, Emb_downloaded)
df_Downloaded = IE.ImageEmbedder.Loading_Embeddings(Emb_path_downloaded)

Emb_path_labled = os.path.join(Emb_path, Emb_labeled)
df_Labeled = IE.ImageEmbedder.Loading_Embeddings(Emb_path_labled)

# Anomaly detection uses all the images (downloaded first, then labeled).
df_all = pd.concat(
    [df_Downloaded, df_Labeled],
    ignore_index=True,
)

## Anomaly detection
Find anomalies in the downloaded images using the autoencoder reconstruction error.

In [None]:
# Stack the per-image embeddings of df_all into one matrix of shape (N, D).
X = np.vstack(df_all['Embedding'])
# Embedding size D = number of columns of the stacked matrix.
input_dimention = X.shape[1]

# Build the anomaly model for that input size and train its autoencoder
# on every embedding for 100 epochs.
model_anomaly = AN.Anomaly(input_dimention)
model_anomaly.train_autoencoder(
    X,
    model_output_path=Model_path,
    epochs=100,
)

In [12]:
# Add the anomaly flag and the reconstruction error to the downloaded DataFrame.
X_Downloaded = np.vstack(df_Downloaded['Embedding'])  # shape (N, D)
anomaly_df = model_anomaly.anomaly_detecting(X_Downloaded)
df_Downloaded['Anomaly'] = anomaly_df[0]  # 0 for normals, 1 for anomalies
df_Downloaded['Anomaly_error'] = anomaly_df[1]

# Min-max normalize the anomaly error.
min_val = df_Downloaded['Anomaly_error'].min()
max_val = df_Downloaded['Anomaly_error'].max()
range_val = max_val - min_val
if range_val > 0:
    df_Downloaded['Anomaly_error_normalized'] = (df_Downloaded['Anomaly_error'] - min_val) / range_val
else:
    # Every error is identical (or there is nothing to scale): dividing by a
    # zero range would fill the column with NaN and, through it, the later
    # confidence score. Use 0.0 so the anomaly term applies no penalty.
    df_Downloaded['Anomaly_error_normalized'] = 0.0

## Pseudo Labeling
Train a model on all labeled data (train and test) to pseudo-label the downloaded images.

In [None]:
# Hyperparameter grid used for tuning the model.
param_grid = dict(
    lr=[1e-3, 3e-4],
    dropout=[0.3, 0.5],
    hidden_size=[128, 256],
)

Pseudo_Labeling_model = IC.ImageClassification()
# Train on the labeled data and save the resulting model into a .pth file.
Pseudo_Labeling_model.Training_Tuning(
    param_grid,
    df_Labeled,
    model_output_path=Model_path,
    model_output_name=Pseudo_Labeling_Model_Name,
)

In [14]:
# Restore the pseudo-labeling model from its saved .pth file into a fresh
# classifier object.
Pseudo_Labeling_model_loaded = IC.ImageClassification()
Pseudo_model_path = os.path.join(Model_path, Pseudo_Labeling_Model_Name)
Pseudo_Labeling_model_loaded.load_model(Pseudo_model_path)

In [17]:
# Use the pseudo-labeling model to attach, to every downloaded image, its
# class probabilities, the predicted class and that class's probability.
df_Downloaded['predictions'] = Pseudo_Labeling_model_loaded.predict_probabilities(df_Downloaded)

class_probabilities = df_Downloaded['predictions']
# Predicted class = position of the largest probability.
df_Downloaded['predicted_class'] = class_probabilities.apply(
    lambda probs: probs.index(max(probs))
)
# Probability of the predicted class = the largest probability itself.
df_Downloaded['predicted_class_probability'] = class_probabilities.apply(
    lambda probs: max(probs)
)

## Clean downloaded images
Combine predicted_class_probability and Anomaly_error_normalized into a confidence score for each downloaded image.
An image is kept if its predicted label matches its original category (search keyword) or if its confidence score is high enough; otherwise it is discarded.

In [18]:
# Scale the predicted-class probability down by up to 40 %, in proportion to
# the normalized anomaly error of the image.
anomaly_penalty = 1 - 0.4 * df_Downloaded['Anomaly_error_normalized']
df_Downloaded['confidence_score'] = df_Downloaded['predicted_class_probability'] * anomaly_penalty

In [None]:
# Plot confidence score distribution to choose a suitable cut-off.
import seaborn as sns
import matplotlib.pyplot as plt
def plot_confidence_distribution(df, column='confidence_score', bins=50):
    """Display a histogram, with a KDE curve, of one column of ``df``.

    Parameters
    ----------
    df : DataFrame holding the values to plot.
    column : name of the column to plot (default ``'confidence_score'``).
    bins : number of histogram bins (default 50).

    The title and axis labels are fixed to the confidence-score wording
    whichever column is plotted. Nothing is returned; the figure is shown.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[column], bins=bins, kde=True, color='skyblue', ax=ax)
    ax.set_title('Distribution of Confidence Score')
    ax.set_xlabel('Confidence Score')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# Inspect the score distribution of the downloaded images to choose a cut-off.
plot_confidence_distribution(df_Downloaded)

In [None]:
# Cut-off on the confidence score; images scoring above it are kept even when
# their predicted class differs from their category.
Confidence_threshold = 0.7

# Only retain images whose predicted class matches their category index
# and/or whose confidence score is above Confidence_threshold.
# Using 'or' makes the condition less strict.
keep_mask = (
    (df_Downloaded['predicted_class'] == df_Downloaded['Cat_Index'])
    | (df_Downloaded['confidence_score'] > Confidence_threshold)
)
df_Downloaded_Clean = df_Downloaded[keep_mask]
# Drop the unnecessary columns: only the first three columns are kept.
df_Downloaded_Clean = df_Downloaded_Clean.iloc[:, :3]

total_images = df_Downloaded.shape[0]
kept_images = df_Downloaded_Clean.shape[0]
# An empty df_Downloaded would otherwise raise ZeroDivisionError here.
if total_images:
    discarded_images_percent = 100 * (total_images - kept_images) / total_images
else:
    discarded_images_percent = 0.0
print(f'The percentage of images that are discarded:{discarded_images_percent}')

In [21]:
# Save the embeddings of the cleaned downloaded images into a single .npz file.
CleanDownloaded_filename = "CleanDownloadedEmbedding.npz"

# Make sure the target subfolder exists before writing into it.
Emb_path_downloaded_clean = os.path.join(Emb_path, Emb_downloaded_clean)
os.makedirs(Emb_path_downloaded_clean, exist_ok=True)

embedding_file = os.path.join(Emb_path_downloaded_clean, CleanDownloaded_filename)
# The whole DataFrame is stored under the key 'embeddings'.
np.savez(embedding_file, embeddings=df_Downloaded_Clean)

## Image Classification Model
Use both the labeled dataset and the cleaned downloaded images to train the image classification model.

In [22]:
# Reload the cleaned downloaded dataset from its .npz file.
Emb_path_Downloaded_Clean = os.path.join(
    Emb_path,
    Emb_downloaded_clean,
    CleanDownloaded_filename,
)
df_Downloaded_Clean = IE.ImageEmbedder.Load_one_embedding(Emb_path_Downloaded_Clean)

In [23]:
# Final dataset: cleaned downloaded images followed by the labeled images.
df_final = pd.concat([df_Downloaded_Clean, df_Labeled], ignore_index=True)

# 80/20 split, stratified on the category index, with a fixed random seed.
# NOTE(review): the labeled embeddings are produced with augment=True; if
# augmented variants of one image can land on both sides of this split, the
# test score may be optimistic — confirm how augmentation is stored.
train_df, test_df = train_test_split(
    df_final,
    test_size=0.2,
    stratify=df_final['Cat_Index'],
    random_state=42,
)

# Store both splits as .npz files in the final-dataset subfolder.
Emb_path_finaldataset = os.path.join(Emb_path, Emb_final_dataset)
os.makedirs(Emb_path_finaldataset, exist_ok=True)
train_file = os.path.join(Emb_path_finaldataset, "Train.npz")
test_file = os.path.join(Emb_path_finaldataset, "Test.npz")
np.savez(train_file, embeddings=train_df)
np.savez(test_file, embeddings=test_df)

In [None]:
# Hyperparameter grid used for tuning the final model.
param_grid = dict(
    lr=[1e-3, 3e-4],
    dropout=[0.3, 0.5],
    hidden_size=[128, 256],
)

# Train on the training split only and save the model under Final_Model_Name.
ImageClassification_model = IC.ImageClassification()
ImageClassification_model.Training_Tuning(
    param_grid,
    train_df,
    model_output_path=Model_path,
    model_output_name=Final_Model_Name,
)

## Model Evaluation

In [31]:
# Restore the final classification model from its saved file into a fresh
# classifier object.
Model_loaded = IC.ImageClassification()
Final_Model_path = os.path.join(Model_path, Final_Model_Name)
Model_loaded.load_model(Final_Model_path)

In [32]:
# Read the test split back from the file written when the final dataset was built.
Emb_path_test = os.path.join(
    Emb_path,
    Emb_final_dataset,
    'Test.npz',
)
df_test = IE.ImageEmbedder.Load_one_embedding(Emb_path_test)

In [None]:
# Evaluate the loaded final model on the test dataset read from Test.npz.
# This call is the last expression of the cell, so any value it returns is
# shown as the cell output.
Model_loaded.validate(df_test)