In [21]:
import pandas as pd

# Load the raw product records from the data_collection directory into a DataFrame.
raw_products_path = "../data_collection/raw_files/products.json"
df = pd.read_json(raw_products_path)

# Shape of the data frame (rows x columns)

In [22]:
# (rows, columns) of the freshly loaded frame
df.shape

(580, 7)

# Print Data Frame Head

In [23]:
# Preview the first rows to see the raw format of each column
df.head()

Unnamed: 0,title,price,description,stock,sku,categories,tags
0,Pidgeot,£185.00,This Pokémon has a dazzling plumage of beautif...,90 in stock,8462,"[Bird, Pokemon]","[Bird, Keen Eye, pidgeot]"
1,Rattata,£128.00,Rattata is cautious in the extreme. Even while...,16 in stock,1009,"[Mouse, Pokemon]","[Mouse, rattata, Run Away]"
2,Raticate,£60.00,Raticate’s sturdy fangs grow steadily. To keep...,204 in stock,5745,"[Mouse, Pokemon]","[Mouse, raticate, Run Away]"
3,Fearow,£95.00,Fearow is recognized by its long neck and elon...,276 in stock,9127,"[Beak, Pokemon]","[Beak, fearow, Keen Eye]"
4,Arbok,£182.00,This Pokémon is terrifically strong in order t...,248 in stock,9230,"[Cobra, Pokemon]","[arbok, Cobra, Shed Skin]"


# Statistical Summary

In [24]:
# Only numeric columns are summarised; at this point that is just sku,
# because price and stock are still stored as strings.
df.describe()

Unnamed: 0,sku
count,580.0
mean,5457.839655
std,2651.591882
min,1004.0
25%,3164.75
50%,5365.5
75%,7814.25
max,9991.0


## Clean up: strip the currency symbol from the front of each price and drop the sku column, which is not needed

In [25]:
# Remove the sku column, which is not needed downstream.
# errors="ignore" keeps this cell re-runnable: without it, a second run
# raises KeyError because the column is already gone.
df = df.drop(columns=["sku"], errors="ignore")

# Strip the pound sign from every price column and convert the result to float.
# Commas are removed as well so that a value written with a thousands
# separator (e.g. "£1,200.00") does not make astype(float) fail.
price_cols = ["price"]
df[price_cols] = df[price_cols].replace(r"[£,]", "", regex=True).astype(float)

In [26]:
# price is numeric now, so it shows up in the summary; sku has been dropped
df.describe()

Unnamed: 0,price
count,580.0
mean,111.701724
std,51.713997
min,25.0
25%,66.0
50%,111.0
75%,158.0
max,200.0


In [27]:
# Confirm that sku is gone and price no longer carries the £ prefix
df.head()

Unnamed: 0,title,price,description,stock,categories,tags
0,Pidgeot,185.0,This Pokémon has a dazzling plumage of beautif...,90 in stock,"[Bird, Pokemon]","[Bird, Keen Eye, pidgeot]"
1,Rattata,128.0,Rattata is cautious in the extreme. Even while...,16 in stock,"[Mouse, Pokemon]","[Mouse, rattata, Run Away]"
2,Raticate,60.0,Raticate’s sturdy fangs grow steadily. To keep...,204 in stock,"[Mouse, Pokemon]","[Mouse, raticate, Run Away]"
3,Fearow,95.0,Fearow is recognized by its long neck and elon...,276 in stock,"[Beak, Pokemon]","[Beak, fearow, Keen Eye]"
4,Arbok,182.0,This Pokémon is terrifically strong in order t...,248 in stock,"[Cobra, Pokemon]","[arbok, Cobra, Shed Skin]"


# Text normalization

In [28]:
# Titles: trim surrounding whitespace and title-case.
df['title'] = df['title'].str.strip().str.title()
# Stock: keep the first run of digits from strings like "90 in stock" and store it as int.
# astype(str) makes the cell safe to re-run once the column is already numeric
# (the .str accessor raises on an int column), and expand=False returns a
# Series rather than a one-column DataFrame.
df['stock'] = df['stock'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
# Description: collapse any run of whitespace (spaces, tabs, newlines) into a
# single space and trim the ends.
df['description'] = df['description'].str.replace(r'\s+', ' ', regex=True).str.strip()
# Categories: trim each entry and capitalise the first letter of every word.
df['categories'] = df['categories'].apply(lambda lst: [c.strip().title() for c in lst])
# Tags: trim each entry and convert it to lower case.
df['tags'] = df['tags'].apply(lambda lst: [t.strip().lower() for t in lst])
df.head()

Unnamed: 0,title,price,description,stock,categories,tags
0,Pidgeot,185.0,This Pokémon has a dazzling plumage of beautif...,90,"[Bird, Pokemon]","[bird, keen eye, pidgeot]"
1,Rattata,128.0,Rattata is cautious in the extreme. Even while...,16,"[Mouse, Pokemon]","[mouse, rattata, run away]"
2,Raticate,60.0,Raticate’s sturdy fangs grow steadily. To keep...,204,"[Mouse, Pokemon]","[mouse, raticate, run away]"
3,Fearow,95.0,Fearow is recognized by its long neck and elon...,276,"[Beak, Pokemon]","[beak, fearow, keen eye]"
4,Arbok,182.0,This Pokémon is terrifically strong in order t...,248,"[Cobra, Pokemon]","[arbok, cobra, shed skin]"


# Convert to categorical

In [29]:
# Give every (product, category) pair its own row. The original row label is
# repeated on each of those rows, which is what allows them to be folded back
# into one row per product below.
df_exploded = df.explode('categories')

# One indicator column per category. No drop_first, since there are many
# category combinations. dtype=int pins the indicators to 0/1 integers.
df_dummies = pd.get_dummies(df_exploded, columns=['categories'], prefix='cat', dtype=int)

# Columns that are carried over as-is; they are taken from df so that they are
# not aggregated across the exploded rows.
non_dummy_cols = ['title', 'price', 'description', 'stock', 'tags']

dummy_cols = [c for c in df_dummies.columns if c.startswith('cat_')]

# Collapse the indicator columns back to one row per original index label.
# max() rather than sum() keeps each flag at 0/1 even when a product lists the
# same category more than once (e.g. "bird" and "Bird" after title-casing).
category_flags = df_dummies[dummy_cols].groupby(level=0).max()

df_final = pd.concat([df[non_dummy_cols], category_flags], axis=1)
df_final.head()

Unnamed: 0,title,price,description,stock,tags,cat_Alpha,cat_Angler,cat_Ant Pit,cat_Anteater,cat_Aqua Mouse,...,cat_Windveiled,cat_Wing Fish,cat_Wish,cat_Wood Gecko,cat_Woodpecker,cat_Wool,cat_Woolly Crab,cat_Wrestling,cat_Young Fowl,cat_Zen Charm
0,Pidgeot,185.0,This Pokémon has a dazzling plumage of beautif...,90,"[bird, keen eye, pidgeot]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rattata,128.0,Rattata is cautious in the extreme. Even while...,16,"[mouse, rattata, run away]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Raticate,60.0,Raticate’s sturdy fangs grow steadily. To keep...,204,"[mouse, raticate, run away]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fearow,95.0,Fearow is recognized by its long neck and elon...,276,"[beak, fearow, keen eye]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Arbok,182.0,This Pokémon is terrifically strong in order t...,248,"[arbok, cobra, shed skin]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Summary of price, stock and every cat_* indicator column
df_final.describe()

Unnamed: 0,price,stock,cat_Alpha,cat_Angler,cat_Ant Pit,cat_Anteater,cat_Aqua Mouse,cat_Aqua Rabbit,cat_Arm Thrust,cat_Armor,...,cat_Windveiled,cat_Wing Fish,cat_Wish,cat_Wood Gecko,cat_Woodpecker,cat_Wool,cat_Woolly Crab,cat_Wrestling,cat_Young Fowl,cat_Zen Charm
count,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0,...,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0,580.0
mean,111.701724,159.713793,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724,0.003448,...,0.001724,0.001724,0.001724,0.003448,0.001724,0.003448,0.001724,0.001724,0.001724,0.001724
std,51.713997,84.169113,0.041523,0.041523,0.041523,0.041523,0.041523,0.041523,0.041523,0.058671,...,0.041523,0.041523,0.041523,0.058671,0.041523,0.058671,0.041523,0.041523,0.041523,0.041523
min,25.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,66.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,111.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,158.0,233.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,200.0,300.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
import os

# Write the encoded frame as JSON Lines (one record per line), creating the
# output directory first if it does not exist yet.
PROCESSED_DIR = "processed_files"
os.makedirs(PROCESSED_DIR, exist_ok=True)

processed_path = f"{PROCESSED_DIR}/product_processed.jsonl"
df_final.to_json(processed_path, orient="records", lines=True)
