# Imports

In [2]:
import os
import torch
import torch.nn as nn # Neural Network
import torch.optim as optim # Optimizers (SGD, Adam, etc.)
from torch.utils.data import Dataset, DataLoader # Preprocessing Dataset, Loading Dataset

from PIL import Image # Open Image
import torchvision.transforms as transforms # Image Processing & Augmentation i.e. Resize, Flip, Normalize etc. 

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity # Measures Similarity Between Vectors 
import pandas as pd
import seaborn as sns

In [5]:
df_images = pd.read_csv('/kaggle/input/fashion-product-images-dataset/fashion-dataset/images.csv')

df_styles = pd.read_csv(
    '/kaggle/input/fashion-product-images-dataset/fashion-dataset/styles.csv',
    on_bad_lines='skip',
    encoding='utf-8'
)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/fashion-product-images-dataset/fashion-dataset/images.csv'

In [None]:
df_styles.columns

In [None]:
df_styles['masterCategory']

In [None]:
df_styles['subCategory'].unique()

In [None]:
df_styles['masterCategory']
df_masterCategory_OnlyApparel = df_styles[df_styles['masterCategory']=='Apparel']

In [None]:
df_masterCategory_OnlyApparel

In [None]:
df_masterCategory_OnlyApparel['subCategory'].unique()

In [None]:
# df_masterCategory_OnlyApparel['subCategory'] == df_styles['Bottomwear']
df_subCategory_OnlyApparel_Bottomwear = df_masterCategory_OnlyApparel['subCategory']

In [None]:
df_styles.head()

In [None]:
df_updated_styles = df_styles[df_styles['gender'].isin(['Men', 'Women'])]
df_updated_styles

In [None]:
df_images.head()

In [None]:
df_images.columns

In [None]:
# Quick overview
print("Number of products in styles.csv:", len(df_styles))
print("Number of images in images.csv:", len(df_images))

In [None]:
print("Styles CSV Shape", df_styles.shape)
print("Images CSV Shape", df_images.shape)

# Exploratory Data Analysis

In [None]:
total_products = df_styles['id'].nunique()
print("Total unique products:", total_products)


# Merging Image with Meta Data

In [None]:
# Total images
df = df_styles
print("Total images:", len(df))

# Number of unique categories
print("Master categories:", df['masterCategory'].nunique())
print("Subcategories:", df['subCategory'].nunique())
print("Article types:", df['articleType'].nunique())
print("Colors:", df['baseColour'].nunique())


# Class Distribution (Master Category)

In [None]:
# Count images per masterCategory
category_counts = df['masterCategory'].value_counts()

plt.figure(figsize=(10,5))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title("Number of images per Master Category")
plt.ylabel("Count")
plt.xlabel("Master Category")
plt.xticks(rotation=45)
plt.show()

# Subcategory Distribution

In [None]:
# Top 20 subcategories
sub_counts = df['subCategory'].value_counts().head(20)

plt.figure(figsize=(12,6))
sns.barplot(x=sub_counts.index, y=sub_counts.values)
plt.title("Top 20 Subcategories")
plt.xticks(rotation=90)
plt.show()

# Color distribution

In [None]:
color_counts = df['baseColour'].value_counts().head(15)

plt.figure(figsize=(12,5))
sns.barplot(x=color_counts.index, y=color_counts.values)
plt.title("Top 15 Colors")
plt.xticks(rotation=45)
plt.show()


# Number of Images Per Master Category

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(data=df, x='masterCategory', order=df['masterCategory'].value_counts().index)
plt.title("Number of Images per Master Category")
plt.ylabel("Count")
plt.xlabel("Master Category")
plt.xticks(rotation=45)
plt.show()


# Number of Images per sub category

In [None]:
top_subcategories = df['subCategory'].value_counts().head(20)
plt.figure(figsize=(12,6))
sns.barplot(x=top_subcategories.index, y=top_subcategories.values)
plt.title("Top 20 Subcategories by Image Count")
plt.xticks(rotation=90)
plt.ylabel("Count")
plt.show()

# Color distribution (top 15)

In [None]:
top_colors = df['baseColour'].value_counts().head(15)
plt.figure(figsize=(12,5))
sns.barplot(x=top_colors.index, y=top_colors.values)
plt.title("Top 15 Colors in Dataset")
plt.xticks(rotation=45)
plt.ylabel("Count")
plt.show()


# Images per gender

In [None]:
gender_counts = df['gender'].value_counts()
plt.figure(figsize=(6,4))
sns.barplot(x=gender_counts.index, y=gender_counts.values)
plt.title("Number of Images per Gender")
plt.ylabel("Count")
plt.show()


# Category vs. Gender Heatmap

In [None]:
category_gender = pd.crosstab(df['masterCategory'], df['gender'])
plt.figure(figsize=(10,6))
sns.heatmap(category_gender, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Category vs Gender Distribution")
plt.show()


# Subcategory vs. Color Heatmap (top 20 subcategories)

In [None]:
top_subs = df['subCategory'].value_counts().head(20).index
df_top = df[df['subCategory'].isin(top_subs)]

sub_color = pd.crosstab(df_top['subCategory'], df_top['baseColour'])
plt.figure(figsize=(12,6))
sns.heatmap(sub_color, cmap="coolwarm", annot=False)
plt.title("Top 20 Subcategories vs Color Distribution")
plt.show()


In [None]:
subcategory_counts = df_styles['subCategory'].value_counts()
subcategory_counts.head(10)


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
subcategory_counts[:15].plot(kind='bar')
plt.title("Top 15 SubCategories by Image Count")
plt.ylabel("Number of Images")
plt.xlabel("SubCategory")
plt.show()


# Filter Top Wear

In [None]:
topwear_df = df_styles[df_styles['subCategory'] == 'Topwear'].reset_index(drop=True)

print("Topwear images:", topwear_df.shape[0])


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import os

base_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset'
styles_df = df_styles
# Parameters
IMAGE_FOLDER = f"{base_path}/images"
top_n = 5  # number of images per subcategory
subcategories = ['Topwear', 'Shoes', 'Bags', 'Bottomwear', 'Watches']  # chosen for your project

# One row of axes per subcategory, one column per sampled image.
fig, axes = plt.subplots(len(subcategories), top_n, figsize=(20, 5), squeeze=False)

for row_axes, subcat in zip(axes, subcategories):
    candidates = styles_df[styles_df['subCategory'] == subcat]
    # Cap the sample size so a subcategory with fewer than top_n rows
    # does not make DataFrame.sample raise.
    sub_df = candidates.sample(min(top_n, len(candidates)), random_state=42)

    # Hide the frame of every cell, including ones that end up empty.
    for ax in row_axes:
        ax.axis("off")

    for j, (ax, row) in enumerate(zip(row_axes, sub_df.itertuples())):
        if j == 0:
            # A y-label is hidden once the axis is switched off, so the
            # row is labelled with a title on its first cell instead.
            ax.set_title(subcat, fontsize=14, loc="left")

        img_path = os.path.join(IMAGE_FOLDER, str(row.id) + ".jpg")
        try:
            # The context manager closes the file handle after drawing.
            with Image.open(img_path) as img:
                ax.imshow(img)
        except FileNotFoundError:
            # Metadata row without an image on disk: leave the cell blank.
            continue

fig.suptitle("Sample Images from Each SubCategory", fontsize=25)
plt.show()


In [None]:
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.barplot(x=subcategories, y=[styles_df[styles_df['subCategory']==sc].shape[0] for sc in subcategories])
plt.title("Number of Images per SubCategory")
plt.ylabel("Number of Images")
plt.xlabel("SubCategory")
plt.show()


# Filtering Top Wear Articles 

In [None]:
topwear_df = styles_df[styles_df['subCategory'] == 'Topwear']


In [None]:
topwear_counts = topwear_df['articleType'].value_counts()
topwear_counts.head(10)  # show top 10 for sanity check


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Take top 10 article types
top_10 = topwear_counts[:10]

plt.figure(figsize=(12, 6))
sns.barplot(x=top_10.index, y=top_10.values, palette="viridis")
plt.xticks(rotation=45)
plt.title("Top 10 Topwear Article Types and Their Counts")
plt.xlabel("Article Type")
plt.ylabel("Number of Items")
plt.show()


# Filtering Top Wear Articles with respect to 'Gender'

In [None]:
topwear_gender_counts = topwear_df.groupby(['articleType', 'gender']).size().reset_index(name='count')
topwear_gender_counts.head(10)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.barplot(data=topwear_gender_counts, x='articleType', y='count', hue='gender', palette='Set2')
plt.xticks(rotation=45)
plt.title("Topwear Article Types Distribution by Gender")
plt.xlabel("Article Type")
plt.ylabel("Number of Items")
plt.legend(title='Gender')
plt.show()


# Train Test Validation Split
### Train 70% 
### Validation 15%
### Test 15%

In [None]:
from sklearn.model_selection import train_test_split

apparel_df = styles_df[
    styles_df['subCategory'].isin(['Topwear', 'Bottomwear'])
]

train_df, temp_df = train_test_split(
    apparel_df, test_size=0.3, stratify=apparel_df['subCategory'], random_state=42
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['subCategory'], random_state=42
)


In [None]:
import pandas as pd

base_path = "/kaggle/input/fashion-product-images-dataset/fashion-dataset"

styles_df = pd.read_csv(
    f"{base_path}/styles.csv",
    engine="python",
    on_bad_lines="skip"
)

print(styles_df.shape)
styles_df.head()



In [None]:
styles_df.columns


# Remove Rows with Missing IDs

In [None]:
styles_df.dropna(subset=['id'], inplace=True)
styles_df['id'] = styles_df['id'].astype(int)


# Filter only Topwear & Bottomwear

In [None]:
apparel_df = styles_df[
    styles_df['subCategory'].isin(['Topwear', 'Bottomwear'])
].copy()

label_map = {'Topwear': 0, 'Bottomwear': 1}
apparel_df['label'] = apparel_df['subCategory'].map(label_map)
apparel_df['image'] = apparel_df['id'].astype(str) + ".jpg"

apparel_df.head()


In [None]:
apparel_df['subCategory'].value_counts()


# Train / Validation / Test Split (FINAL & CLEAN)

In [None]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    apparel_df,
    test_size=0.3,
    stratify=apparel_df['label'],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42
)

print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)


# Image Transforms (Basic, No Augmentation Yet)

In [None]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((128, 128)),  # smaller for faster training
    transforms.ToTensor(),
])


# Custom Dataset Class

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os

IMAGE_DIR = f"{base_path}/images"

class FashionDataset(Dataset):
    """Map-style dataset that loads images listed in a DataFrame.

    Each row of ``df`` must provide an ``'image'`` column (file name inside
    ``image_dir``) and a ``'label'`` column (class index).

    Args:
        df: DataFrame with ``'image'`` and ``'label'`` columns.
        image_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, df, image_dir, transform=None):
        # Re-index to 0..n-1 so the integer ``idx`` passed to __getitem__
        # addresses rows directly, whatever index the input frame carried.
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        The image is converted to RGB and passed through ``transform`` when
        one was given; the label is returned as a plain ``int``.
        """
        img_path = os.path.join(self.image_dir, self.df.at[idx, 'image'])

        # ``convert`` returns a new, fully loaded image, so the underlying
        # file can be closed right away instead of waiting for the garbage
        # collector (matters when thousands of files are read per epoch).
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")

        label = int(self.df.at[idx, 'label'])

        if self.transform is not None:
            image = self.transform(image)

        return image, label


# DataLoaders

In [None]:
from torch.utils.data import DataLoader

train_ds = FashionDataset(train_df, IMAGE_DIR, transform)
val_ds   = FashionDataset(val_df, IMAGE_DIR, transform)
test_ds  = FashionDataset(test_df, IMAGE_DIR, transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=2)


# FINAL CHECKPOINT (DO NOT SKIP)

In [None]:
images, labels = next(iter(train_loader))
print(images.shape)
print(labels[:10])


# Build CNN FROM SCRATCH (NO PRETRAINED)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier.

    Each stage is a 3x3 convolution (padding 1, so the spatial size is kept),
    a ReLU, and a 2x2 max-pool that halves height and width. The flattened
    result goes through a 256-unit hidden layer and a final linear layer that
    produces one logit per class.

    Args:
        num_classes: Number of output logits. Defaults to 2.
        image_size: Height/width of the (square) input images. Defaults to
            128, which yields a 16x16 feature map after the three pools.

    Raises:
        ValueError: If ``image_size`` is smaller than 8, because three
            halvings would leave an empty feature map.
    """

    def __init__(self, num_classes=2, image_size=128):
        super().__init__()

        if image_size < 8:
            raise ValueError("image_size must be at least 8")

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)

        self.pool = nn.MaxPool2d(2, 2)

        # Three floor-halvings of the side length equal one floor division
        # by 8, so this matches what the pooling layers actually produce.
        feature_side = image_size // 8
        self.fc1 = nn.Linear(128 * feature_side * feature_side, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.pool(F.relu(self.conv1(x)))  # side / 2
        x = self.pool(F.relu(self.conv2(x)))  # side / 4
        x = self.pool(F.relu(self.conv3(x)))  # side / 8

        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Model Setup

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleCNN().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Train the CNN (CORE STEP)

In [None]:
def train_model(model, train_loader, val_loader, epochs=5):
    """Train ``model`` and print loss/accuracy after every epoch.

    Uses the module-level ``device``, ``criterion`` and ``optimizer``; the
    optimizer must therefore have been built over this model's parameters.

    Args:
        model: Network to train (updated in place).
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        epochs: Number of full passes over ``train_loader``.
    """
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        # An empty loader would otherwise raise ZeroDivisionError below;
        # report zeros for that degenerate case instead of crashing.
        train_acc = 100 * correct / total if total else 0.0
        num_batches = max(len(train_loader), 1)

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()

        val_acc = 100 * val_correct / val_total if val_total else 0.0

        print(f"Epoch [{epoch+1}/{epochs}] "
              f"Loss: {train_loss/num_batches:.4f} "
              f"Train Acc: {train_acc:.2f}% "
              f"Val Acc: {val_acc:.2f}%")


In [None]:
import os

image_dir = f"{base_path}/images"

existing_images = set(os.listdir(image_dir))
print("Total images on disk:", len(existing_images))


In [None]:
apparel_df['image'] = apparel_df['id'].astype(str) + ".jpg"

# Keep only rows where image exists
apparel_df = apparel_df[apparel_df['image'].isin(existing_images)].copy()

print("After cleaning:", apparel_df.shape)


In [None]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    apparel_df,
    test_size=0.3,
    stratify=apparel_df['label'],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42
)


In [None]:
train_ds = FashionDataset(train_df, image_dir, transform)
val_ds   = FashionDataset(val_df, image_dir, transform)
test_ds  = FashionDataset(test_df, image_dir, transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=2)


In [None]:
# Try loading one image
from PIL import Image
img_path = os.path.join(image_dir, train_df.iloc[0]['image'])
Image.open(img_path)


# Starting Training 

In [None]:
train_model(model, train_loader, val_loader, epochs=10)

# Convert CNN -> Feature Extractor

In [None]:
class FeatureExtractor(nn.Module):
    """Wrap a trained CNN so that it outputs its ``fc1`` embedding.

    The convolution, pooling and ``fc1`` modules are taken from
    ``trained_model`` as-is (shared, not copied). No activation is applied
    after ``fc1``.
    """

    def __init__(self, trained_model):
        super().__init__()
        # conv -> ReLU -> pool, repeated for each of the three conv layers.
        stages = []
        for conv in (trained_model.conv1, trained_model.conv2, trained_model.conv3):
            stages.extend([conv, nn.ReLU(), trained_model.pool])
        self.features = nn.Sequential(*stages)
        # The first fully connected layer serves as the embedding layer.
        self.fc = trained_model.fc1

    def forward(self, x):
        """Return one embedding row per input image."""
        feature_maps = self.features(x)
        batch_size = feature_maps.size(0)
        flattened = feature_maps.view(batch_size, -1)
        return self.fc(flattened)


# Initialize Feature Extractor

In [None]:
feature_model = FeatureExtractor(model).to(device)
feature_model.eval()


# Extract Feature Vectors (CORE STEP)

In [None]:
import numpy as np

def extract_features(model, dataloader, device=None):
    """Run ``model`` over every batch and collect its outputs as arrays.

    Rows are emitted in the order the dataloader yields them, so pass a
    non-shuffling loader when row ``i`` must correspond to dataset item ``i``.

    Args:
        model: Module mapping an image batch to a 2-D embedding batch.
        dataloader: Iterable of ``(images, labels)`` tensor batches.
        device: Device the inputs are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model).

    Returns:
        Tuple ``(features, labels)``: the stacked embeddings as a 2-D NumPy
        array and the matching labels as a 1-D NumPy array.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    feature_chunks = []
    label_chunks = []

    with torch.no_grad():
        for imgs, lbls in dataloader:
            emb = model(imgs.to(device))
            feature_chunks.append(emb.cpu().numpy())
            # .cpu() keeps this working if a loader hands back GPU labels.
            label_chunks.append(lbls.cpu().numpy())

    if not feature_chunks:
        raise ValueError("dataloader yielded no batches; cannot build a feature matrix")

    return np.vstack(feature_chunks), np.concatenate(label_chunks)


# Extract Training Embeddings (DATABASE)

In [None]:
# Embed the training set in dataset order. `train_loader` shuffles its
# batches, which would scramble the rows so that row i of `train_features`
# no longer matches `train_df.iloc[i]` when recommendations are looked up.
index_loader = DataLoader(train_ds, batch_size=32, shuffle=False, num_workers=2)
train_features, train_labels = extract_features(feature_model, index_loader)
print(train_features.shape)


# Similarity Computation (RECOMMENDATION ENGINE)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


# Recommend Function

In [None]:
def recommend(query_feature, feature_db, top_k=5):
    """Return the indices of the ``top_k`` most similar database rows.

    Args:
        query_feature: Embedding of the query item; reshaped to one row.
        feature_db: 2-D array with one embedding per database item.
        top_k: Number of indices to return.

    Returns:
        1-D array of at most ``top_k`` row indices into ``feature_db``,
        ordered from most to least cosine-similar.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    sims = cosine_similarity(query_feature.reshape(1, -1), feature_db)[0]
    # Sort ascending, flip to descending, then keep exactly top_k entries
    # (the previous slice kept top_k + 1).
    ranked = sims.argsort()[::-1]
    return ranked[:top_k]


# Test Recommendation (VISUAL OUTPUT)
### 10.1 Pick Query Image

In [None]:
query_img, _ = test_ds[10]
query_tensor = query_img.unsqueeze(0).to(device)

with torch.no_grad():
    query_feature = feature_model(query_tensor).cpu().numpy()


# Get Similar Images 

In [None]:
indices = recommend(query_feature, train_features, top_k=5)
indices

# Display Results

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12,4))

# Query
plt.subplot(1,6,1)
plt.imshow(query_img.permute(1,2,0))
plt.title("Query")
plt.axis('off')

# Recommendations
for i, idx in enumerate(indices[:5]):
    img_path = os.path.join(image_dir, train_df.iloc[idx]['image'])
    img = Image.open(img_path)

    plt.subplot(1,6,i+2)
    plt.imshow(img)
    plt.title(f"Rec {i+1}")
    plt.axis('off')

plt.show()


# UI HTML CSS JS

In [None]:
from IPython.display import display, HTML

# Static markup for the recommendation UI. It is kept in a string because
# raw HTML is not valid Python and would stop the cell with a SyntaxError.
# NOTE: the button refers to showRecommendations(), which this markup does
# not define, so clicking it does nothing useful in this cell.
ui_markup = """
<div class="container">
    <h1>Fashion Product Recommendation</h1>

    <div class="upload-section">
        <input type="file" id="queryImage" accept="image/*" />
        <button onclick="showRecommendations()">Recommend</button>
    </div>

    <div class="query-display">
        <h3>Query Image</h3>
        <img id="queryImg" src="" alt="Query" />
    </div>

    <div class="recommendations">
        <h3>Recommended Products</h3>
        <div id="recGrid" class="grid"></div>
    </div>
</div>
"""

display(HTML(ui_markup))


In [None]:
from IPython.core.display import display, HTML

html_code = """
<div class="container">
    <h1>Fashion Product Recommendation</h1>

    <div class="upload-section">
        <input type="file" id="queryImage" accept="image/*" />
        <button id="recBtn">Recommend</button>
    </div>

    <div class="query-display">
        <h3>Query Image</h3>
        <img id="queryImg" src="" alt="Query" />
    </div>

    <div class="recommendations">
        <h3>Recommended Products</h3>
        <div id="recGrid" class="grid"></div>
    </div>
</div>

<style>
body {
    font-family: 'Arial', sans-serif;
    background-color: #f5f5f5;
    color: #333;
}

.container {
    width: 90%;
    margin: auto;
    text-align: center;
}

.upload-section {
    margin: 20px 0;
}

.query-display img {
    width: 200px;
    height: 200px;
    object-fit: cover;
    border-radius: 10px;
    border: 2px solid #ddd;
}

.grid {
    display: flex;
    justify-content: center;
    gap: 15px;
    flex-wrap: wrap;
    margin-top: 10px;
}

.grid img {
    width: 150px;
    height: 150px;
    object-fit: cover;
    border-radius: 8px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    transition: transform 0.3s, box-shadow 0.3s;
}

.grid img:hover {
    transform: scale(1.05);
    box-shadow: 0 8px 12px rgba(0,0,0,0.2);
}
</style>

<script>
document.getElementById('recBtn').onclick = function() {
    const fileInput = document.getElementById('queryImage');
    const queryImg = document.getElementById('queryImg');
    const recGrid = document.getElementById('recGrid');

    if (fileInput.files.length === 0) {
        alert("Upload an image!");
        return;
    }

    // Show query image
    queryImg.src = URL.createObjectURL(fileInput.files[0]);

    // Clear previous recommendations
    recGrid.innerHTML = "";

    // Demo: show 5 images from dataset (update with real paths)
    for (let i = 1; i <= 5; i++) {
        const img = document.createElement('img');
        img.src = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/" + (39400 + i) + ".jpg"; // Replace with real recommended IDs
        recGrid.appendChild(img);
    }
};
</script>
"""

display(HTML(html_code))
