In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
image_files = []
for file in os.listdir("./retail-products-classification/train"):
    if file.endswith(".jpg"):
        image_files.append(file.replace(".jpg", ""))
print(len(image_files))

42000


In [3]:
# Define constants and hyperparameters
batch_size = 32
num_epochs = 30
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define early stopping parameters
early_stopping_patience = 5  # Number of epochs to wait for improvement
best_val_loss = float('inf')  # Initialize the best validation loss
no_improvement_count = 0  # Initialize the count for epochs with no improvement

# Data Preparation

# Organize your dataset into folders as described earlier
# Create a CSV file train.csv with columns ImgId and categories

# Define the path to your dataset folder
dataset_path = "retail-products-classification"

# Read the CSV file and extract labels/classes
csv_file = os.path.join(dataset_path, "train.csv")
df = pd.read_csv(csv_file)
classes = df["categories"].unique()  # Extract unique classes/categories
print(len(df))

# Create a dictionary to map class names to class indices
class_to_idx = {cls: idx for idx, cls in enumerate(classes)}
num_classes = len(classes)
print(classes)


46229
['Arts, Crafts & Sewing' 'Cell Phones & Accessories'
 'Clothing, Shoes & Jewelry' 'Tools & Home Improvement'
 'Health & Personal Care' 'Baby Products' 'Baby' 'Patio, Lawn & Garden'
 'Beauty' 'Sports & Outdoors' 'Electronics' 'All Electronics' 'Automotive'
 'Toys & Games' 'All Beauty' 'Office Products' 'Appliances'
 'Musical Instruments' 'Industrial & Scientific' 'Grocery & Gourmet Food'
 'Pet Supplies']


In [4]:
# Update the transform for data preprocessing
transform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class CustomDataset(Dataset):
    def __init__(self, csv_file, root_dir, class_to_idx, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.class_to_idx = class_to_idx
        self.transform = transform

        #print(self.data.head())

        self.data = self.data[self.data['ImgId'].isin(image_files)]

    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.data.iloc[idx, 0])
        image = Image.open("{}.jpg".format(img_name)).convert('RGB')
        label = self.class_to_idx[self.data.iloc[idx, 3]]  # Map class name to class index

        if self.transform:
            image = self.transform(image)

        return image, label


In [5]:
# Data Loading

# Split the dataset into train, validation, and test sets
train_ratio = 0.6  # Percentage of data for training
val_ratio = 0.2   # Percentage of data for validation
test_ratio = 0.2  # Percentage of data for testing

train_data, temp_data = train_test_split(df, test_size=1 - train_ratio, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

# Create DataLoaders for training, validation, and test sets
train_dataloader = DataLoader(
    CustomDataset(csv_file=csv_file, root_dir=os.path.join(dataset_path, "train"), class_to_idx=class_to_idx, transform=transform),
    batch_size=batch_size,
    shuffle=True
)
val_dataloader = DataLoader(
    CustomDataset(csv_file=csv_file, root_dir=os.path.join(dataset_path, "train"), class_to_idx=class_to_idx, transform=transform),
    batch_size=batch_size,
    shuffle=False
)
test_dataloader = DataLoader(
    CustomDataset(csv_file=csv_file, root_dir=os.path.join(dataset_path, "train"), class_to_idx=class_to_idx, transform=transform),
    batch_size=batch_size,
    shuffle=False
)


In [6]:
# Model Definition

# Load pre-trained MobileNetV2 model
mobilenet_v2 = models.mobilenet_v2(weights="MobileNet_V2_Weights.DEFAULT")

# Modify the output layer for your specific number of classes
mobilenet_v2.classifier[1] = nn.Linear(mobilenet_v2.classifier[1].in_features, num_classes)

# Move the model to GPU if available
mobilenet_v2 = mobilenet_v2.to(device)

# Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mobilenet_v2.parameters(), lr=learning_rate)


In [7]:
# Training Loop with Early Stopping
for epoch in range(num_epochs):
    mobilenet_v2.train()
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = mobilenet_v2(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation
    mobilenet_v2.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = mobilenet_v2(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    val_loss /= len(val_dataloader)

    # Check if validation loss has improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improvement_count = 0
        # Save the model checkpoint if validation loss improved
        torch.save(mobilenet_v2.state_dict(), 'best_model.pth')
    else:
        no_improvement_count += 1

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}, Validation Loss: {val_loss}')

    # Check for early stopping
    if no_improvement_count >= early_stopping_patience:
        print(f'Early stopping triggered after {early_stopping_patience} epochs without improvement.')
        break


Epoch [1/30], Loss: 2.027583599090576, Validation Loss: 1.9107921517539914
Epoch [2/30], Loss: 2.0178229808807373, Validation Loss: 1.7223138462380425
Epoch [3/30], Loss: 1.4634932279586792, Validation Loss: 1.5301806280291543
Epoch [4/30], Loss: 1.4913073778152466, Validation Loss: 1.4096607604920184
Epoch [5/30], Loss: 1.0657446384429932, Validation Loss: 1.3031420273265126
Epoch [6/30], Loss: 1.3714121580123901, Validation Loss: 1.1540862173636453
Epoch [7/30], Loss: 1.212152123451233, Validation Loss: 1.0180759560671455
Epoch [8/30], Loss: 0.8762522339820862, Validation Loss: 0.9363038296338173
Epoch [9/30], Loss: 0.6563171744346619, Validation Loss: 0.7866326612274972
Epoch [10/30], Loss: 0.7052903771400452, Validation Loss: 0.7141301410623117
Epoch [11/30], Loss: 0.592488169670105, Validation Loss: 0.638092515333318
Epoch [12/30], Loss: 0.700343668460846, Validation Loss: 0.5341538782348546
Epoch [13/30], Loss: 0.48541373014450073, Validation Loss: 0.49418130629360446
Epoch [14/3

In [8]:
# Evaluate on the test set
mobilenet_v2.load_state_dict(torch.load('best_model.pth'))  # Load the best model checkpoint
mobilenet_v2.eval()
all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = mobilenet_v2(inputs)
        _, predicted = torch.max(outputs.data, 1)

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
report = classification_report(all_labels, all_predictions, target_names=classes)
cm = confusion_matrix(all_labels, all_predictions)

print(f'Test Accuracy: {accuracy*100}%')
print(report)
print("Confusion Matrix:")
print(cm)

Test Accuracy: 95.70714285714286%
                           precision    recall  f1-score   support

    Arts, Crafts & Sewing       0.96      0.95      0.96      2000
Cell Phones & Accessories       0.94      0.97      0.96      2000
Clothing, Shoes & Jewelry       0.97      0.97      0.97      2000
 Tools & Home Improvement       0.88      0.98      0.93      2000
   Health & Personal Care       0.95      0.97      0.96      2000
            Baby Products       0.97      0.96      0.96      2000
                     Baby       0.98      0.96      0.97      2000
     Patio, Lawn & Garden       0.97      0.94      0.95      2000
                   Beauty       0.96      0.93      0.95      2000
        Sports & Outdoors       0.95      0.95      0.95      2000
              Electronics       0.94      0.96      0.95      2000
          All Electronics       0.97      0.91      0.94      2000
               Automotive       0.97      0.95      0.96      2000
             Toys & Games  