In [None]:
import pandas as pd
import os
import shutil
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Load the processed tables from ../data/processed.
returns_df = pd.read_csv('../data/processed/returns.csv')
catalog_df = pd.read_csv('../data/processed/catalog.csv')

# Create image labels
# Heuristic: every return whose class is listed in `defect_classes` is treated
# as a "Defective" example. Filenames are de-duplicated and missing values are
# dropped so that, if a file is referenced by more than one row, it cannot end
# up in both the training and the validation split.
defect_classes = ['Dresses', 'Tops', 'Jackets', 'Pants', 'Skirts', 'Jeans']
defective_images_df = (
    returns_df.loc[returns_df['class_name'].isin(defect_classes), ['image_filename']]
    .dropna(subset=['image_filename'])
    .drop_duplicates(subset=['image_filename'])
    .copy()
)
defective_images_df['label'] = 'Defective'

# "Normal" candidates: catalog images that never appear in the defective set.
normal_pool_df = (
    catalog_df.loc[
        ~catalog_df['image_filename'].isin(defective_images_df['image_filename']),
        ['image_filename'],
    ]
    .dropna(subset=['image_filename'])
    .drop_duplicates(subset=['image_filename'])
)
# Balance the two classes, but never request more rows than the pool holds:
# DataFrame.sample raises ValueError when n exceeds the population size.
n_normal = min(len(defective_images_df), len(normal_pool_df))
normal_images_df = normal_pool_df.sample(n=n_normal, random_state=42).copy()
normal_images_df['label'] = 'Normal'
image_df = pd.concat([defective_images_df, normal_images_df], ignore_index=True)

# Organize images into folders for Keras
# Stratify on the label so both splits keep the same class proportions.
train_df, val_df = train_test_split(image_df, test_size=0.2, random_state=42, stratify=image_df['label'])
BASE_DIR = '../data/images_for_model/'      # destination root for the per-split class folders
SOURCE_IMAGE_DIR = '../data/raw/images/'    # where the original image files are read from

def organize_images(df, split_name, base_dir=None, source_dir=None):
    """Copy the images listed in ``df`` into ``<base_dir>/<split_name>/<label>/``.

    Any existing ``<base_dir>/<split_name>`` directory is removed first so the
    split is rebuilt from scratch. Rows whose source file does not exist are
    skipped (best effort); a one-line warning reports how many were skipped so
    an incomplete dataset does not go unnoticed.

    Args:
        df: DataFrame with ``image_filename`` and ``label`` columns.
        split_name: Sub-directory name for this split (e.g. ``'train'``).
        base_dir: Destination root. Defaults to the notebook-level ``BASE_DIR``.
        source_dir: Directory holding the original images. Defaults to the
            notebook-level ``SOURCE_IMAGE_DIR``.
    """
    # Fall back to the notebook constants only when no explicit path is given.
    base_dir = BASE_DIR if base_dir is None else base_dir
    source_dir = SOURCE_IMAGE_DIR if source_dir is None else source_dir

    split_dir = os.path.join(base_dir, split_name)
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

    # Create each class folder once up front instead of once per row.
    for label in df['label'].unique():
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

    missing = 0
    for filename, label in zip(df['image_filename'], df['label']):
        source_path = os.path.join(source_dir, filename)
        if os.path.exists(source_path):
            shutil.copy(source_path, os.path.join(split_dir, label, filename))
        else:
            missing += 1

    if missing:
        print(f"Warning: {missing} of {len(df)} '{split_name}' images were not found in {source_dir} and were skipped.")

# Write both splits to disk, training split first.
for split_frame, split_name in ((train_df, 'train'), (val_df, 'validation')):
    organize_images(split_frame, split_name)

# Create Keras datasets
# Both loaders share the same image size and batch size.
IMG_SIZE = (160, 160)
BATCH_SIZE = 32
loader_options = {'image_size': IMG_SIZE, 'batch_size': BATCH_SIZE}
train_dataset = tf.keras.utils.image_dataset_from_directory(
    os.path.join(BASE_DIR, 'train'),
    **loader_options,
)
validation_dataset = tf.keras.utils.image_dataset_from_directory(
    os.path.join(BASE_DIR, 'validation'),
    **loader_options,
)

# Build and train the model
# Backbone: MobileNetV2 with ImageNet weights and no classification head,
# frozen so that only the layers added below are trained.
base_model = tf.keras.applications.MobileNetV2(
    input_shape=(160, 160, 3),
    include_top=False,
    weights='imagenet',
)
base_model.trainable = False

# Head: preprocess -> frozen backbone -> global pooling -> dropout -> one sigmoid unit.
inputs = keras.Input(shape=(160, 160, 3))
preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(inputs)
features = base_model(preprocessed, training=False)  # backbone is called in inference mode
pooled = layers.GlobalAveragePooling2D()(features)
pooled = layers.Dropout(0.2)(pooled)
outputs = layers.Dense(1, activation='sigmoid')(pooled)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=10,
)

# Save the model
# Make sure the target folder exists first: nothing else in this notebook
# creates '../models', and a failed save here would happen only after the
# whole training run has completed.
os.makedirs('../models', exist_ok=True)
model.save('../models/image_classifier_model.keras')
print("\n--- IMAGE MODEL TRAINED AND SAVED ---")

In [None]:
import os

# 1. Report where the kernel is currently running (relative paths resolve from here)
current_directory = os.getcwd()
print(f"This notebook's current location is: {current_directory}\n")

# 2. The relative path the notebook expects the processed data under
processed_folder_path = '../data/processed'
print(f"Checking for the folder at this relative path: {processed_folder_path}")

# 3. Resolve the path, list the folder, and report what was found
try:
    # Show the absolute form so the resolved location is unambiguous
    abs_path = os.path.abspath(processed_folder_path)
    print(f"This translates to the absolute path: {abs_path}\n")

    files = os.listdir(processed_folder_path)
except FileNotFoundError:
    print("CRITICAL ERROR: The '../data/processed' folder DOES NOT EXIST.")
    print("This is why the error is happening. Please run the '01_data_preparation.ipynb' notebook first.")
else:
    if not files:
        print("ERROR: The 'processed' folder exists, but it is EMPTY.")
    else:
        print("SUCCESS: The 'processed' folder was found and it contains:")
        for file in files:
            print(f"- {file}")

In [None]:
# We'll create our own labels. A simple rule: returns for classes that often have
# physical quality complaints will be our "Defective" examples.
# Filenames are de-duplicated and missing values dropped so that, if a file is
# referenced by more than one row, it cannot land in both the training and the
# validation split later on.
defect_classes = ['Dresses', 'Tops', 'Jackets', 'Pants', 'Skirts', 'Jeans']
defective_images_df = (
    returns_df.loc[returns_df['class_name'].isin(defect_classes), ['image_filename']]
    .dropna(subset=['image_filename'])
    .drop_duplicates(subset=['image_filename'])
    .copy()
)
defective_images_df['label'] = 'Defective'

# For "Normal" images, we'll sample from the main catalog, ensuring they are not in the defective list
normal_pool_df = (
    catalog_df.loc[
        ~catalog_df['image_filename'].isin(defective_images_df['image_filename']),
        ['image_filename'],
    ]
    .dropna(subset=['image_filename'])
    .drop_duplicates(subset=['image_filename'])
)
# Balance the dataset, but never request more rows than the pool holds:
# DataFrame.sample raises ValueError when n exceeds the population size.
n_normal = min(len(defective_images_df), len(normal_pool_df))
normal_images_df = normal_pool_df.sample(n=n_normal, random_state=42).copy()
normal_images_df['label'] = 'Normal'

# Combine them into a single dataframe
image_df = pd.concat([defective_images_df, normal_images_df], ignore_index=True)
print(f"Created a labeled dataset of {len(image_df)} images.")
print(image_df['label'].value_counts())

In [None]:
# Keras works best when images are in folders like: train/Normal, train/Defective, etc.
# This cell will create this structure and copy the images.

# First, split our dataframe (stratified so both splits keep the same class proportions)
train_df, val_df = train_test_split(image_df, test_size=0.2, random_state=42, stratify=image_df['label'])

# Define the new base directory for our sorted images
BASE_DIR = '../data/images_for_model/'
# Directory holding the original images. organize_images reads this name, so
# define it next to BASE_DIR rather than relying on a value left in the kernel
# by a different cell.
SOURCE_IMAGE_DIR = '../data/raw/images/'
# Function to copy images
def organize_images(df, split_name, base_dir=None, source_dir=None):
    """Copy the images listed in ``df`` into ``<base_dir>/<split_name>/<label>/``.

    Any existing ``<base_dir>/<split_name>`` directory is removed first so the
    split starts fresh. Rows whose source file does not exist are skipped
    (best effort); a one-line warning reports how many were skipped so an
    incomplete dataset does not go unnoticed.

    Args:
        df: DataFrame with ``image_filename`` and ``label`` columns.
        split_name: Sub-directory name for this split (e.g. ``'train'``).
        base_dir: Destination root. Defaults to the notebook-level ``BASE_DIR``.
        source_dir: Directory holding the original images. Defaults to the
            notebook-level ``SOURCE_IMAGE_DIR``.
    """
    # Fall back to the notebook constants only when no explicit path is given.
    base_dir = BASE_DIR if base_dir is None else base_dir
    source_dir = SOURCE_IMAGE_DIR if source_dir is None else source_dir

    split_dir = os.path.join(base_dir, split_name)
    # Remove existing directory to start fresh
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)

    # Create each class folder once up front instead of once per row.
    for label in df['label'].unique():
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

    missing = 0
    for filename, label in zip(df['image_filename'], df['label']):
        source_path = os.path.join(source_dir, filename)
        dest_path = os.path.join(split_dir, label, filename)

        if os.path.exists(source_path):
            shutil.copy(source_path, dest_path)
        else:
            missing += 1

    if missing:
        print(f"Warning: {missing} of {len(df)} '{split_name}' images were not found in {source_dir} and were skipped.")

# Build the on-disk folder structure for each split, announcing each step.
split_jobs = (
    ("Organizing training images...", train_df, 'train'),
    ("Organizing validation images...", val_df, 'validation'),
)
for announcement, split_frame, split_name in split_jobs:
    print(announcement)
    organize_images(split_frame, split_name)
print("Image organization complete.")

In [None]:
# Load the organized image folders as batched Keras datasets.
IMG_SIZE = (160, 160)
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

train_dataset = tf.keras.utils.image_dataset_from_directory(
    directory=os.path.join(BASE_DIR, 'train'),
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)
validation_dataset = tf.keras.utils.image_dataset_from_directory(
    directory=os.path.join(BASE_DIR, 'validation'),
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)

# Prefetch batches, letting tf.data choose the buffer size.
train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Transfer learning: reuse a pre-trained MobileNetV2 as a frozen feature
# extractor and train only a small classification head on top of it.

# 1. Random augmentations applied to the input images
data_augmentation = keras.Sequential(
    [
        layers.RandomFlip('horizontal'),
        layers.RandomRotation(0.2),
    ]
)

# 2. Pre-trained backbone (ImageNet weights, no classification head), frozen
base_model = tf.keras.applications.MobileNetV2(
    input_shape=(160, 160, 3),
    include_top=False,
    weights='imagenet',
)
base_model.trainable = False

# 3. Wire everything together:
#    augmentation -> MobileNetV2 preprocessing -> backbone -> pooling -> dropout -> sigmoid
inputs = keras.Input(shape=(160, 160, 3))
augmented = data_augmentation(inputs)
preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(augmented)
features = base_model(preprocessed, training=False)  # backbone is called in inference mode
pooled = layers.GlobalAveragePooling2D()(features)
pooled = layers.Dropout(0.2)(pooled)
outputs = layers.Dense(1, activation='sigmoid')(pooled)  # single sigmoid unit for the binary task

model = keras.Model(inputs, outputs)

# 4. Compile with Adam and binary cross-entropy, tracking accuracy
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train the model, evaluating on the validation set after every epoch.
# The returned History object is kept in `history`.
initial_epochs = 10

history = model.fit(train_dataset, validation_data=validation_dataset, epochs=initial_epochs)

In [None]:
# Save the model for later use in our final application
# Make sure the target folder exists first: nothing else in this notebook
# creates '../models', and a failed save here would happen only after
# training has already finished.
os.makedirs('../models', exist_ok=True)
model.save('../models/image_classifier_model.keras')

print("Image classification model saved successfully in the 'models' folder.")