In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project package
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import altair as alt

from product_search.preprocessing import process_product_data

# Keep rendered frames compact: show at most 10 rows and 20 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 20

In [2]:
# Read the JSON Lines file (one JSON record per line) into a DataFrame.
# Alternative input: 'data/esci.json' (presumably the full dataset - confirm).
df = pd.read_json(path_or_buf='data/sample.json', lines=True)

In [None]:
# Preview the first two records of the raw frame
df.head(n=2)

In [None]:
# High-level summary: the row count, followed by the column names
# (the list is the cell's last expression, so it is shown as the cell output)
print("=== Dataset Overview ===")
print(f"Number of records: {len(df)}")
print("\nColumns:")
list(df.columns)

In [None]:
# Per-column dtypes of the loaded frame; the Series is the cell's last
# expression, so it is rendered as the cell output below the header.
print("\n=== Data Types ===")
df.dtypes

In [None]:
# Row count of the raw frame, printed with thousands separators
print("\n=== Total Records ===")
total_records = df.shape[0]
print(f"Total number of records in dataset: {total_records:,}")


In [None]:
# Check for missing values.
# The per-column null counts are computed once and reused for both the
# per-column breakdown and the grand total (no second scan of the frame).
print("\n=== Missing Values ===")
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
if columns_with_missing.empty:
    # Avoid printing a bare "Series([], dtype: int64)" when nothing is missing
    print("No columns with missing values")
else:
    print(columns_with_missing)  # Only show columns with missing values
print(f"\nTotal missing values: {missing_values.sum()}")

In [None]:
# Frequency of each value in the 'type' column, most common first
print("\n=== Product Type Distribution ===")
type_dist = df['type'].value_counts(sort=True, ascending=False)
print(type_dist)

In [None]:
# Bar chart of the number of records per product type.
# The counts are aggregated in pandas first so that only one small row per type
# is embedded in the chart spec, instead of every column of every record.
# Altair refuses inline datasets above its default 5000-row limit, which the
# un-aggregated frame would hit on a large input.
# dropna=False keeps records with a missing type visible as their own bar.
type_counts = (
    df['type']
    .value_counts(dropna=False)
    .rename_axis('type')
    .reset_index(name='count')
)

chart = alt.Chart(type_counts).mark_bar().encode(
    x=alt.X('type:N', title='Type'),
    y=alt.Y('count:Q', title='Count')
).properties(
    width=500,
    height=300,
    title='Distribution of Product Types'
)
chart.show()

In [None]:
# Frequency of each value in the 'locale' column, most common first
print("\n=== Locale Distribution ===")
locale_dist = df['locale'].value_counts(sort=True, ascending=False)
print(locale_dist)

In [None]:
# Turn the locale counts into a two-column frame (locale, count) and add each
# locale's share of the total as a percentage.
locale_df = (
    locale_dist
    .rename_axis('locale')
    .reset_index(name='count')
    .assign(percentage=lambda counts: counts['count'] / counts['count'].sum() * 100)
)

# Donut chart: arc angle encodes the count, colour encodes the locale,
# and hovering shows the locale together with its percentage.
donut = alt.Chart(locale_df).mark_arc(innerRadius=50)
chart = donut.encode(
    theta=alt.Theta('count:Q'),
    color=alt.Color('locale:N'),
    tooltip=['locale', 'percentage'],
).properties(
    width=400,
    height=400,
    title='Distribution of Locales',
)
chart.show()

In [None]:
# Analyze ratings distribution
print("\n=== Ratings Statistics ===")
# Pull the first integer out of the free-text 'ratings' field and store it as a float
# column (rows without any digits become NaN).
# The pattern accepts any number of comma-separated digit groups, so a value such as
# "1,234,567 ratings" parses as 1234567 instead of being cut off at "1,234".
# A raw string is used so that \d is not treated as an (invalid) string escape.
ratings_text = df['ratings'].str.extract(r'(\d+(?:,\d+)*)', expand=False)
df['ratings_count'] = ratings_text.str.replace(',', '', regex=False).astype(float)
print(df['ratings_count'].describe())

In [None]:
# Histogram of ratings_count (at most 30 bins) with a log-scaled frequency axis.
# Only the plotted column is handed to Altair, so the chart spec does not embed
# every other column of the frame.
# NOTE(review): the row count is unchanged, so very large inputs may still exceed
# Altair's default inline-data row limit - confirm against the full dataset.
chart = alt.Chart(df[['ratings_count']]).mark_bar().encode(
    x=alt.X('ratings_count:Q',
            bin=alt.Bin(maxbins=30),
            title='Number of Ratings'),
    y=alt.Y('count():Q',
            scale=alt.Scale(type='log'),
            title='Frequency')
).properties(
    width=500,
    height=300,
    title='Distribution of Number of Ratings'
)
chart.show()

In [None]:
# Analyze star ratings
print("\n=== Star Ratings Statistics ===")
# Extract the first decimal number from the free-text 'stars' field as a float column
# (rows without any digits become NaN).
# - Raw string, so \d and \. are regex escapes rather than invalid string escapes.
# - The pattern only matches text that float() can parse ("4.5", "5", ".5"); a stray
#   "." or a trailing period such as "4.5." no longer breaks the conversion.
# - expand=False returns a Series rather than a one-column DataFrame.
# NOTE(review): only the first number in the string is used and only '.' is treated as
# the decimal separator - confirm this matches the 'stars' format in every locale.
df['stars_numeric'] = (
    df['stars']
    .str.extract(r'(\d*\.?\d+)', expand=False)
    .astype(float)
)
print(df['stars_numeric'].describe())

In [None]:
# Box plot of the numeric star rating for each product type.
# Only the two plotted columns are handed to Altair, so the chart spec does not
# embed every other column of the frame.
# NOTE(review): the row count is unchanged, so very large inputs may still exceed
# Altair's default inline-data row limit - confirm against the full dataset.
chart = alt.Chart(df[['type', 'stars_numeric']]).mark_boxplot().encode(
    x=alt.X('type:N', title='Type'),
    y=alt.Y('stars_numeric:Q', title='Star Rating')
).properties(
    width=600,
    height=300,
    title='Star Ratings Distribution by Product Type'
)
chart.show()

 # EDA on the processed data

In [None]:
# Run the project's preprocessing over the raw frame and preview the first five rows.
# NOTE(review): sample=False presumably disables subsampling - confirm in
# product_search.preprocessing. By this point df also carries the derived
# ratings_count / stars_numeric columns; verify the function tolerates them.
processed_df = process_product_data(df, sample=False)
processed_df.head(n=5)

In [None]:
# Minimum and maximum of the number-like attributes; these ranges are what is
# needed to set up the space properly.
numeric_cols = ["price", "review_count", "review_rating"]
numeric_summary = processed_df[numeric_cols].describe()
numeric_summary.loc[["min", "max"]]

In [None]:
# Count the missing (NaN) entries in each of the numeric columns
print("\nNaN value counts:")
nan_counts = processed_df[["price", "review_count", "review_rating"]].isnull().sum()
print(nan_counts)