In [1]:
import pandas as pd
import numpy as np
from itertools import chain

import json

# Purpose of this notebook

- This notebook is for data exploration, EDA, and strategy planning.


# Load Data


In [5]:
data = []
with open("../backend/app/raw_data/meta_amazon_fashion.jsonl", "r") as f:
    for line in f:
        item = json.loads(line)
        data.append(item)

df_amazon_fashion = pd.DataFrame(data)

# Dataset basic info


In [6]:
df_amazon_fashion.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together'],
      dtype='object')

In [7]:
df_amazon_fashion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826108 entries, 0 to 826107
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   main_category    826108 non-null  object 
 1   title            826108 non-null  object 
 2   average_rating   826108 non-null  float64
 3   rating_number    826108 non-null  int64  
 4   features         826108 non-null  object 
 5   description      826108 non-null  object 
 6   price            50249 non-null   float64
 7   images           826108 non-null  object 
 8   videos           826108 non-null  object 
 9   store            799270 non-null  object 
 10  categories       826108 non-null  object 
 11  details          826108 non-null  object 
 12  parent_asin      826108 non-null  object 
 13  bought_together  0 non-null       object 
dtypes: float64(2), int64(1), object(11)
memory usage: 88.2+ MB


In [8]:
df_amazon_fashion["price"].isna().sum()
print(f"Number of items without price: {df_amazon_fashion['price'].isna().sum()}")

df_amazon_fashion["store"].isna().sum()
print(f"Number of items without store: {df_amazon_fashion['store'].isna().sum()}")


Number of items without price: 775859
Number of items without store: 26838


In [9]:
df_amazon_fashion.describe()

Unnamed: 0,average_rating,rating_number,price
count,826108.0,826108.0,50249.0
mean,3.91066,17.942204,40.795929
std,0.982282,221.79273,154.089435
min,1.0,1.0,0.01
25%,3.4,2.0,11.79
50%,4.0,4.0,19.89
75%,4.7,10.0,34.99
max,5.0,46299.0,13000.0


In [10]:
df_amazon_fashion.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,[],{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"[Drawstring closure, Machine Wash]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,[],{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"[Zipper closure, Hand Wash Only]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,
3,AMAZON FASHION,Mento Streamtail,2.0,1,"[Thermoplastic Rubber sole, High Density Premi...",[Slip on the Women's Mento and you're ready to...,29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,[],{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"[Pull On closure, Size Guide: ""S"" fits calf 10...",[Ronnox Calf Sleeves - Allowing Your Body to P...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'HONEST Review: RONNOX Women's 3-Pa...,RONNOX,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,


# Dataset preprocessing


In [11]:
# Handle missing values or values with only space
df_amazon_fashion = df_amazon_fashion.applymap(
    lambda x: np.nan               # turn into NaN
    if (
        (isinstance(x, list) and len(x) == 0)
        or (isinstance(x, str) and x.strip() == "")
    )
    else x
)
df_amazon_fashion.info()


  df_amazon_fashion = df_amazon_fashion.applymap(


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826108 entries, 0 to 826107
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   main_category    826108 non-null  object 
 1   title            826050 non-null  object 
 2   average_rating   826108 non-null  float64
 3   rating_number    826108 non-null  int64  
 4   features         463074 non-null  object 
 5   description      59289 non-null   object 
 6   price            50249 non-null   float64
 7   images           826107 non-null  object 
 8   videos           52107 non-null   object 
 9   store            799267 non-null  object 
 10  categories       0 non-null       float64
 11  details          826108 non-null  object 
 12  parent_asin      826108 non-null  object 
 13  bought_together  0 non-null       object 
dtypes: float64(3), int64(1), object(10)
memory usage: 88.2+ MB


In [12]:
# Duplicated products check
dupes = df_amazon_fashion[df_amazon_fashion["title"].duplicated(keep=False)]
dupes.sort_values(["title", "price"])

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
24011,AMAZON FASHION,""" Colors of Love"" Iridescent Glitter Opal with...",5.0,1,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,BONALUNA,,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B07C5NC3NK,
426278,AMAZON FASHION,""" Colors of Love"" Iridescent Glitter Opal with...",3.9,4,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,,,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B07C5K4QYR,
507984,AMAZON FASHION,""" Colors of Love"" Iridescent Glitter Opal with...",5.0,1,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,BONALUNA,,{},B07C5M7QZR,
72762,AMAZON FASHION,""" Pretty Bold"" Geometric Layered with Gold or ...",4.4,9,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,BONALUNA,,{'Product Dimensions': '2.75 x 1 x 0.5 inches;...,B07CSLK8PD,
93351,AMAZON FASHION,""" Pretty Bold"" Geometric Layered with Gold or ...",4.3,9,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,BONALUNA,,{'Product Dimensions': '2.75 x 1.63 x 0.5 inch...,B07CSK29YH,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137831,AMAZON FASHION,,5.0,1,,,,[{'thumb': 'https://m.media-amazon.com/images/...,,Vera Bradley,,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07576RCQM,
137835,AMAZON FASHION,,2.6,3,[Hand Wash Only],,,[{'thumb': 'https://m.media-amazon.com/images/...,,GRACE KARIN,,{'Package Dimensions': '13.35 x 10.16 x 1.73 i...,B096VQ6B75,
137837,AMAZON FASHION,,3.0,4,"[Microfiber,flannel, Hand Wash Only]",,,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Serafina Home Haunted Fleece Throw...,Hretqwi,,"{'Brand': 'Hretqwi', 'Special Feature': 'Skin ...",B0B7L5YW2Q,
137839,AMAZON FASHION,,4.5,118,"[Polyester,Spandex, Pull On closure]",,,[{'thumb': 'https://m.media-amazon.com/images/...,,Zicotour,,{'Package Dimensions': '11.2 x 7 x 1.1 inches;...,B086RC3HGL,


In [13]:
# Parent_ASIN check
df_amazon_fashion["parent_asin"].nunique()


826108

In [14]:
# Discontinued products count
df_amazon_fashion["is_discontinued"] = df_amazon_fashion["details"].apply(
    lambda x: x.get("Is Discontinued By Manufacturer") if isinstance(x, dict) else None
)

df_amazon_fashion["is_discontinued"].value_counts()

is_discontinued
No       293405
Yes        1838
false       134
False       121
True          2
Name: count, dtype: int64

In [15]:
from collections import Counter

key_counter = Counter()

for d in df_amazon_fashion["details"]:
    if isinstance(d, dict):
        key_counter.update(d.keys())

# Show the result
print(key_counter)


Counter({'Date First Available': 797051, 'Package Dimensions': 528478, 'Item model number': 371915, 'Is Discontinued By Manufacturer': 295500, 'Product Dimensions': 160469, 'Department': 119646, 'Manufacturer': 60671, 'Brand': 41816, 'Age Range (Description)': 23485, 'Material': 22709, 'Item Weight': 21064, 'Style': 18639, 'Color': 15047, 'Closure Type': 12144, 'Size': 8224, 'Shape': 6137, 'Reusability': 5950, 'Theme': 5864, 'Special Feature': 5690, 'Pattern': 5397, 'Country of Origin': 3834, 'Unit Count': 3726, 'Item Package Quantity': 3285, 'Clasp Type': 3216, 'Sport': 3149, 'Neck Style': 3009, 'Batteries': 2728, 'Item Dimensions LxWxH': 2689, 'Fit Type': 2569, 'Sleeve Type': 2356, 'Chain Type': 2326, 'Manufacturer recommended age': 2048, 'Number of Items': 1990, 'Number of Pieces': 1957, 'Fabric Type': 1828, 'Occasion': 1629, 'Product Care Instructions': 1609, 'Frame Material': 1582, 'Collection Name': 1532, 'Metal Type': 1266, 'Item Length': 1203, 'Shirt form type': 1158, 'Brand Na

In [16]:
# Items price below 0.10 too cheap to be true
num_items_below_05 = df_amazon_fashion.loc[df_amazon_fashion["price"]<=0.05].shape[0]
num_items_below_10 = df_amazon_fashion.loc[df_amazon_fashion["price"]<=0.10].shape[0]

print(f"Number of items below 0.05: {num_items_below_05}")
print(f"Number of items below 0.10: {num_items_below_10}")

Number of items below 0.05: 56
Number of items below 0.10: 60


# Observations

### Missing Value

1. There are 58 values missing in the title
2. There are 363034 values missing in the features
3. There are 766819 values missing in description
4. There are 775814 values missing in price
5. There are 1 value missing in images
6. There are 774001 values missing in videos
7. There are 26838 values missing in store
8. All values are missing in categories and bought_together

- For semantic search, it is important to have title and description for all products.
- We have to use input image to the llm to fill in the missing tittle.
- Since almost all product have images, it is possible to use gpt-vision to generate description for each of the product, but we have to consider the cost to do that.
- Fashion products can be further categorized into shoes, shirt, dress and etc using either title or the image to improve search
- Average rating and rating number could be recalculated using either Bayesian Average Formula
- Extracting features from the video would be too expensive.

### Product without price

- About 94% of the products' price are missing, I am going to assume that the product is out of stock.
- I would keep the product without price on the recommendation list if the detail of is_discontinue = No or False or false
- For product without price and is_discontinue = True or Yes, the product would be dropped from recommendation because there is no chance for amazon to restock it
- For product without price but without is_discontinue mentioned in the details, I would keep in the product list but will be lowered the importance
- Product with price would be the first priority of recommendation, then products that are currently out of stock but will be restock will be the secondary, products that are currently out of stock but unknown about the restock will be the tertiary.

### Product without store

- Assume the product without store is unavailable because there is no way customer can buy it, thus I would exclude them from the recommendation.

### Repetitive products or similar product

- There are 84978 products with the same title, thus we need to arrange it by rating_number or avg_rating or price.
- We have to look for a way to provide the best recommendation base on the avg_rating and rating_number such as Bayesian Average Rating for the top 5 most relevant product search

### Product grouping

- All parent_asin values are unique, doesn't seems like the products can be grouped with this column data

### Product availability

- There are 1840 products with {"Is Discontinued By Manufacturer": "Yes" / True} in their details.
- These products might be out of stock when recommend to the customer, which we should may be drop these products.

### Product details

- There are 544 different key in the details column of each product, this can be used as a useful information to help customer find specific product.

### Product with price under 0.05 dollar

- These products are too cheap to be true, might be a scam for customer thus I would exclude it from the recommendation list.
