In [2]:
import pandas as pd

# Load the raw book records into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_json("../data_collection/raw_files/books.json")

# Shape of the data frame (rows x columns)

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(1000, 13)

# Print Data Frame Head

In [4]:
# Preview the first rows to see the raw column formats
# (e.g. "£"-prefixed price strings and spelled-out ratings).
df.head()

Unnamed: 0,title,price,category,available,stock_count,rating,description,upc,product_type,price_excl_tax,price_incl_tax,tax,num_reviews
0,A Light in the Attic,£51.77,Poetry,True,22,Three,It's hard to imagine a world without A Light i...,a897fe39b1053632,Books,£51.77,£51.77,£0.00,0
1,Tipping the Velvet,£53.74,Historical Fiction,True,20,One,"""Erotic and absorbing...Written with starling ...",90fa61229261140a,Books,£53.74,£53.74,£0.00,0
2,Soumission,£50.10,Fiction,True,20,One,"Dans une France assez proche de la nôtre, un h...",6957f44c3847a760,Books,£50.10,£50.10,£0.00,0
3,Sharp Objects,£47.82,Mystery,True,20,Four,"WICKED above her hipbone, GIRL across her hear...",e00eb4fd7b871a48,Books,£47.82,£47.82,£0.00,0
4,Sapiens: A Brief History of Humankind,£54.23,History,True,20,Five,From a renowned historian comes a groundbreaki...,4165285e1663650f,Books,£54.23,£54.23,£0.00,0


# Statistical Summary

In [5]:
# describe() summarises numeric columns only; the price columns are missing
# from its output at this point because they are still "£"-prefixed strings.
df.describe()

Unnamed: 0,stock_count,num_reviews
count,1000.0,1000.0
mean,8.585,0.0
std,5.654622,0.0
min,1.0,0.0
25%,3.0,0.0
50%,7.0,0.0
75%,14.0,0.0
max,22.0,0.0


## We need to strip the currency symbol from the price columns; `num_reviews` and `upc` are not needed, so we drop them

In [6]:
# Drop the two columns that are not needed: num_reviews and upc.
unused_cols = ["num_reviews", "upc"]
df = df.drop(columns=unused_cols)

# Strip the "£" symbol from every monetary column, then store the values as floats.
price_cols = ["price", "price_excl_tax", "price_incl_tax", "tax"]
prices_without_symbol = df[price_cols].replace(to_replace=r"£", value="", regex=True)
df[price_cols] = prices_without_symbol.astype(float)

In [7]:
# The price columns now show up in the numeric summary, confirming the float conversion.
df.describe()

Unnamed: 0,price,stock_count,price_excl_tax,price_incl_tax,tax
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,35.07035,8.585,35.07035,35.07035,0.0
std,14.44669,5.654622,14.44669,14.44669,0.0
min,10.0,1.0,10.0,10.0,0.0
25%,22.1075,3.0,22.1075,22.1075,0.0
50%,35.98,7.0,35.98,35.98,0.0
75%,47.4575,14.0,47.4575,47.4575,0.0
max,59.99,22.0,59.99,59.99,0.0


## Tax is not applicable here (it is 0 for every book), so we can remove the columns `price_excl_tax`, `price_incl_tax`, and `tax`

In [8]:
# The tax breakdown adds nothing: the summary shows tax is 0.0 throughout and
# the incl./excl. price statistics match `price`, so only `price` is kept.
tax_cols = ["price_excl_tax", "price_incl_tax", "tax"]
df = df.drop(columns=tax_cols)

In [9]:
# Only price and stock_count remain as numeric columns after dropping the tax fields.
df.describe()

Unnamed: 0,price,stock_count
count,1000.0,1000.0
mean,35.07035,8.585
std,14.44669,5.654622
min,10.0,1.0
25%,22.1075,3.0
50%,35.98,7.0
75%,47.4575,14.0
max,59.99,22.0


# Text normalization

In [10]:
# Clean the description text in three steps:
#   1. drop the trailing "...more" marker; the pattern also swallows whitespace
#      on either side of it, so the marker is removed even when trailing spaces
#      follow it (a bare `\.\.\.more$` only matches when it is the very last text);
#   2. trim leading/trailing whitespace;
#   3. collapse internal runs of whitespace into a single space.
df["description"] = (
    df["description"]
    .str.replace(r"\s*\.\.\.more\s*$", "", regex=True)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

In [11]:
import re
def normalize_title(title):
    """Return a cleaned-up book title.

    Removes parenthetical groups (including nested ones), drops ``#``
    characters, and trims surrounding whitespace.

    Args:
        title: The raw title. Non-string values (e.g. ``None`` or ``NaN`` for a
            missing title) are returned unchanged instead of raising.

    Returns:
        The normalized title string, or the original value if it is not a string.
    """
    if not isinstance(title, str):
        return title

    # Remove the innermost "(...)" groups repeatedly until none are left. A single
    # non-greedy pass stops at the first ")" and leaves debris behind for nested
    # groups, e.g. "Saga (Saga (Collected Editions) #5)" would leave "Saga #5)".
    paren_group = re.compile(r"\s*\([^()]*\)")
    stripped = paren_group.sub("", title)
    while stripped != title:
        title = stripped
        stripped = paren_group.sub("", title)

    # Drop "#" markers that sit outside any parentheses.
    title = title.replace("#", "")
    return title.strip()

# Run the title clean-up on every row of the column.
df["title"] = df["title"].map(normalize_title)

In [12]:
# Preview the frame after the description and title clean-up.
df.head()

Unnamed: 0,title,price,category,available,stock_count,rating,description,product_type
0,A Light in the Attic,51.77,Poetry,True,22,Three,It's hard to imagine a world without A Light i...,Books
1,Tipping the Velvet,53.74,Historical Fiction,True,20,One,"""Erotic and absorbing...Written with starling ...",Books
2,Soumission,50.1,Fiction,True,20,One,"Dans une France assez proche de la nôtre, un h...",Books
3,Sharp Objects,47.82,Mystery,True,20,Four,"WICKED above her hipbone, GIRL across her hear...",Books
4,Sapiens: A Brief History of Humankind,54.23,History,True,20,Five,From a renowned historian comes a groundbreaki...,Books


# Convert to categorical

In [13]:
# Store the label-like columns as pandas categoricals.
for col in ("rating", "product_type", "available", "category"):
    df[col] = df[col].astype("category")

# Show the resulting dtypes to confirm the conversion.
print(df.dtypes)

title             object
price            float64
category        category
available       category
stock_count        int64
rating          category
description       object
product_type    category
dtype: object


In [15]:
# Count the distinct (non-null) categories present in the data.
num_categories = df["category"].nunique(dropna=True)
print(num_categories)

50


## There are 50 categories (the web page also lists 50 book categories), so we are good

In [67]:
# Numeric summary after the categorical conversion (only price and stock_count are numeric).
df.describe()

Unnamed: 0,price,stock_count
count,1000.0,1000.0
mean,35.07035,8.585
std,14.44669,5.654622
min,10.0,1.0
25%,22.1075,3.0
50%,35.98,7.0
75%,47.4575,14.0
max,59.99,22.0


In [68]:
# Count rows that are not books and rows flagged as unavailable.
is_not_book = df['product_type'].ne('Books')
is_unavailable = df['available'].eq(False)
non_books_count = is_not_book.sum()
unavailable_books = is_unavailable.sum()

print(f"Non books:{non_books_count}, unavaialable:{unavailable_books}")


Non books:0, unavaialable:0


# According to the check above, all items are books and all are available, so those fields are of no use

In [69]:
# Both columns hold the same value on every row (all "Books", none unavailable),
# so they carry no information.
df = df.drop(["product_type", "available"], axis="columns")

# It is good to assign an integer value to each rating; it is easier to process and store

In [17]:
# Ordinal encoding for the spelled-out star ratings.
rating_map = {
    "Zero": 0,
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

df['rating'] = df['rating'].map(rating_map)

# Any label missing from rating_map becomes NaN; surface that before relying on the column.
has_nulls = df['rating'].isnull().any()
print(has_nulls)

# `rating` was cast to `category` earlier in the notebook, and mapping a
# categorical column only relabels its categories: the dtype stays `category`,
# which does not support numeric reductions such as mean(). Cast to a real
# integer dtype once every label is known to be mapped (NaN cannot be stored
# as int64, so the cast is skipped when unmapped labels exist).
if not has_nulls:
    df['rating'] = df['rating'].astype("int64")

False


In [71]:
# Final look at the cleaned frame before it is exported.
df.head()

Unnamed: 0,title,price,category,stock_count,rating,description
0,A Light in the Attic,51.77,Poetry,22,3,It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,Historical Fiction,20,1,"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.1,Fiction,20,1,"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,Mystery,20,4,"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,History,20,5,From a renowned historian comes a groundbreaki...


In [72]:
import os

# Write the cleaned frame as JSON Lines (one record per line) into a local
# output folder, creating the folder first if it does not exist yet.
PROCESSED_DIR = "processed_files"
os.makedirs(PROCESSED_DIR, exist_ok=True)

output_path = f"{PROCESSED_DIR}/books_processed.jsonl"
df.to_json(output_path, orient="records", lines=True)