In [2]:
import pandas as pd
import numpy as np

# Load the raw product catalogue. Malformed rows are skipped silently
# (on_bad_lines="skip"), so the frame may hold fewer rows than the file.
df = pd.read_csv("styles.csv", encoding="utf-8", on_bad_lines="skip")


In [3]:
# Rename the source headers to the snake_case names used from here on
df = df.rename(
    columns=dict(
        id="product_id",
        productDisplayName="title",
        masterCategory="master_category",
        subCategory="sub_category",
        articleType="article_type",
        baseColour="base_colour",
    )
)


In [4]:
# Cast ids to text, but keep missing ids as NaN. A plain astype(str) turns NaN
# into the literal string "nan", which a dropna on product_id can no longer
# detect; masking the originally-missing positions restores them.
df["product_id"] = df["product_id"].astype(str).mask(df["product_id"].isna())

In [None]:
# Keep only the rows where both product_id and title are present

df = df.dropna(axis=0, how="any", subset=["product_id", "title"])

In [6]:
# Inspect the frame after renaming, casting ids and dropping incomplete rows
df

Unnamed: 0,product_id,gender,master_category,sub_category,article_type,base_colour,season,year,usage,title
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt
...,...,...,...,...,...,...,...,...,...,...
44419,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe
44420,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop
44421,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt
44422,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume


In [None]:
# Generate prices

def generate_price(row, rng=None):
    """Draw a synthetic integer price for one product row.

    The range is chosen from ``row["master_category"]``; the upper bound is
    exclusive:

    * ``"Footwear"``    -> [1200, 3500)
    * ``"Apparel"``     -> [800, 2500)
    * ``"Accessories"`` -> [300, 2000)
    * anything else     -> [500, 2000)

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support ``row["master_category"]``.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the legacy global ``np.random``
        state is used, so ``np.random.seed(...)`` still controls the result.

    Returns
    -------
    int
        A price drawn uniformly from the category's range.
    """
    price_ranges = {
        "Footwear": (1200, 3500),
        "Apparel": (800, 2500),
        "Accessories": (300, 2000),
    }
    # Unknown categories (including NaN) fall back to the default range.
    low, high = price_ranges.get(row["master_category"], (500, 2000))

    if rng is None:
        return np.random.randint(low, high)
    # Generator.integers excludes the upper bound by default, like randint.
    return int(rng.integers(low, high))

# Seed the global NumPy RNG right before drawing so that re-running the
# notebook reproduces the same synthetic prices.
np.random.seed(42)
df["price"] = df.apply(generate_price, axis=1)


In [None]:
# List the column names now present in the frame

list(df.columns)

['product_id',
 'gender',
 'master_category',
 'sub_category',
 'article_type',
 'base_colour',
 'season',
 'year',
 'usage',
 'title',
 'price']

In [9]:
# Preview the first rows, now including the generated price column
df.head()

Unnamed: 0,product_id,gender,master_category,sub_category,article_type,base_colour,season,year,usage,title,price
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,2211
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,1273
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,865
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,1702
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,1771


In [10]:
# Count the rows whose product_id is missing

df["product_id"].isnull().sum()

0

In [11]:
# Compare the number of distinct product ids with the row count (equal => unique)

(df["product_id"].nunique(), df.shape[0])

(44417, 44417)

In [12]:
# Count the rows whose title is missing

df["title"].isnull().sum()

0

In [13]:
# Tally rows per master category; dropna=False also counts missing categories

df["master_category"].value_counts(dropna=False)

master_category
Apparel           21397
Accessories       11272
Footwear           9219
Personal Care      2398
Free Items          105
Sporting Goods       25
Home                  1
Name: count, dtype: int64

In [14]:
# Summary statistics of the randomly generated price column

df["price"].describe()

count    44417.000000
mean      1641.471193
std        672.735331
min        300.000000
25%       1146.000000
50%       1601.000000
75%       2062.000000
max       3499.000000
Name: price, dtype: float64

In [15]:
# Show the first and last five product ids
df["product_id"].head(), df["product_id"].tail()

(0    15970
 1    39386
 2    59263
 3    21379
 4    53759
 Name: product_id, dtype: object,
 44419    17036
 44420     6461
 44421    18842
 44422    46694
 44423    51623
 Name: product_id, dtype: object)

In [None]:
# Collect the ids of the available product images
# (the part of each file name before its first dot)

import os

image_ids = {file_name.partition(".")[0] for file_name in os.listdir("images")}
len(image_ids)

44441

In [17]:
# Row and column counts before filtering on available images
df.shape

(44417, 11)

In [18]:
# Keep only the products whose id has a matching image file

df = df.loc[df["product_id"].isin(image_ids)]

In [19]:
# Inspect the frame after keeping only products with an image
df

Unnamed: 0,product_id,gender,master_category,sub_category,article_type,base_colour,season,year,usage,title,price
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,2211
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,1273
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,865
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,1702
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,1771
...,...,...,...,...,...,...,...,...,...,...,...
44419,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,2026
44420,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,3025
44421,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,2274
44422,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,585


In [None]:
# Export the cleaned data (no image URLs yet); the index is not written
df.to_csv("products_cleaned_no_img.csv", index=False)


In [None]:
# Add image URLs to the cleaned product table and write the final CSV

# Read product_id back as text: without an explicit dtype, read_csv re-infers
# it as a number, which undoes the earlier string cast and would strip any
# leading zeros.
products = pd.read_csv("products_cleaned_no_img.csv", dtype={"product_id": str})

bucket = "notify-products"   # YOUR bucket name here
base_url = f"https://{bucket}.s3.amazonaws.com/images/fashion"

# One URL per product: <base_url>/<product_id>.jpg (vectorized string concat)
products["image_url"] = base_url + "/" + products["product_id"] + ".jpg"

products.to_csv("products_clean.csv", index=False)


In [23]:
# Inspect the final table, including the image_url column
products

Unnamed: 0,product_id,gender,master_category,sub_category,article_type,base_colour,season,year,usage,title,price,image_url
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,2211,https://notify-products.s3.amazonaws.com/image...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,1273,https://notify-products.s3.amazonaws.com/image...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,865,https://notify-products.s3.amazonaws.com/image...
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,1702,https://notify-products.s3.amazonaws.com/image...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,1771,https://notify-products.s3.amazonaws.com/image...
...,...,...,...,...,...,...,...,...,...,...,...,...
44407,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,2026,https://notify-products.s3.amazonaws.com/image...
44408,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,3025,https://notify-products.s3.amazonaws.com/image...
44409,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,2274,https://notify-products.s3.amazonaws.com/image...
44410,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,585,https://notify-products.s3.amazonaws.com/image...
