In [9]:
import pandas as pd
import numpy as np
import altair as alt
import vegafusion as vf

alt.data_transformers.enable("vegafusion")
import os


os.chdir('/Users/dsagher/Library/CloudStorage/Dropbox/Documents/school/fall 2025/CMSE 830/midterm_project_2')

In [10]:
# Processed Data
df_customer = pd.read_csv('./data/processed/customers_processed.csv')
df_geo = pd.read_csv('./data/processed/geolocation.csv')
df_order_item = pd.read_csv('./data/processed/order_items.csv')
df_order_payment = pd.read_csv('./data/processed/order_payments.csv')
df_order_review = pd.read_csv('./data/processed/order_reviews.csv')
df_product = pd.read_csv('./data/processed/products.csv')
df_seller = pd.read_csv('./data/processed/sellers.csv')
df_product_category = pd.read_csv('./data/processed/product_category.csv')
df_order = pd.read_csv('./data/processed/orders_processed.csv')


In [11]:
df_product = (pd.merge(df_product_category, df_product, on='product_category_name', how='inner')
            .drop(columns=['product_category_name'])
            .rename(columns={'product_category_name_english': 'product_category_name'})
)

# Which Product Categories have the highest sales?

In [12]:
"""
Top Item Categories in Sales
- Top 10 item categories in sales
- Item categories in sales by percentage of total sales
"""

# Prepare top 10 item categories by total purchase value ("sales")
#! This has the top 10 most expensive products, not the top 10 categories in sales lol
top_cats = (
    df_order_item
    .merge(df_product[['product_id', 'product_category_name']], on='product_id', how='inner')
    .groupby('product_category_name')['price']
    .sum()
    .reset_index()
    .sort_values(by='price', ascending=False)
    .head(10)
)
top_cats['price_pct'] = 100 * top_cats['price'] / top_cats['price'].sum()

# Bar chart of top 10 item categories in sales
bar_chart = alt.Chart(top_cats).mark_bar(opacity=0.85).encode(
    x=alt.X('product_category_name:N', sort='-y', title='Product Category', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('price:Q', title='Total Sales (BRL)'),
    color=alt.Color(field="product_category_name", type="nominal")
).properties(
    title='Top 10 Product Categories by Sales',
    width=500
)

# Pie chart of these categories by percentage of total sales among top 10
pie_chart = alt.Chart(top_cats).mark_arc(opacity=0.85).encode(
    theta=alt.Theta(field="price", type="quantitative"),
    color=alt.Color(field="product_category_name", type="nominal"),
    tooltip=['product_category_name:N', alt.Tooltip('price:Q', format=".2f"), alt.Tooltip('price_pct:Q', format='.2f', title='% of Top 10')]
).properties(
    title="Top 10 Categories as Percentage of Sales",
    width=350,
    height=350
)

bar_chart | pie_chart


# Which regions have the highest sales and in what categories?

In [13]:
# Get unique zips with city, state, lat, lng, and region
unique_zips = (df_geo[['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state','geolocation_lat','geolocation_lng', 'geolocation_region']]
                .groupby('geolocation_zip_code_prefix')
                .first())

# Merge order and customer data to get zips
customer_order = pd.merge(df_order, df_customer, on='customer_id', how='inner')

# Merge zips and geo location with customer order data
customer_order_geo = pd.merge(unique_zips, customer_order, left_on='geolocation_zip_code_prefix', right_on='customer_zip_code_prefix', how='inner')

# Calculate sales by region and product category
sales_by_region = (customer_order_geo
        .merge(df_order_item, on='order_id', how='inner')
        .merge(df_product, on='product_id', how='inner')
        .groupby(["product_category_name", "geolocation_region"])
        .agg({"price": "sum", "order_id": "count"})
        .reset_index()
        .rename(columns={"product_category_name": "Product Category", "price": "Sales", "order_id": "Orders"}))

# Calculate ARPU
sales_by_region = (sales_by_region
        .assign(ARPU=lambda x: round(x["Sales"] / x["Orders"], 2))
        .sort_values(by="ARPU", ascending=False))


In [27]:
sales_by_region.head(20)

Unnamed: 0,Product Category,geolocation_region,Sales,Orders,ARPU
300,small_appliances_home_oven_and_coffee,North,3998.0,2,1999.0
68,computers,South,21293.85,17,1252.58
65,computers,Central-West,16535.95,14,1181.14
66,computers,North,10995.9,10,1099.59
69,computers,Southeast,114284.72,104,1098.89
67,computers,Northeast,58552.71,57,1027.24
302,small_appliances_home_oven_and_coffee,South,13839.8,15,922.65
299,small_appliances_home_oven_and_coffee,Central-West,2250.0,3,750.0
258,music,Central-West,1216.32,2,608.16
213,home_appliances_2,Northeast,10172.06,17,598.36


In [None]:
# ARPU and total sales by Region and Product Category
bubble_chart = alt.Chart(sales_by_region).mark_circle(opacity=0.7).encode(
    x=alt.X('Sales:Q', title='Total Sales (BRL)'),
    y=alt.Y('ARPU:Q', title='Average Revenue per Order (ARPU)'),
    size=alt.Size('Orders:Q', title='Order Count', scale=alt.Scale(range=[30, 1000])),
    color=alt.Color('geolocation_region:N', title='Region'),
    tooltip=['Product Category', 'geolocation_region', 'Sales', 'ARPU', 'Orders']
).properties(
    title='Sales vs ARPU by Product Category and Region',
    width=800,
    height=500
).interactive()

bubble_chart

# View sales over time

In [None]:
import calendar

#Convert timestamp to month, year, and day
df_order['order_purchase_month'] = pd.to_datetime(df_order['order_purchase_timestamp']).dt.month
df_order['order_purchase_year'] = pd.to_datetime(df_order['order_purchase_timestamp']).dt.year
df_order['order_purchase_day'] = pd.to_datetime(df_order['order_purchase_timestamp']).dt.day

# Group by year and month and count the number of orders
sales_over_time = df_order.groupby(['order_purchase_year', 'order_purchase_month'])['order_id'].count().reset_index()

# Convert month number to month name
sales_over_time['order_purchase_month_word'] = sales_over_time['order_purchase_month'].apply(lambda x: calendar.month_name[x])

In [None]:
# Filter for 2017 and 2018
# 2016 and 2019 are too low to see any real trends
mask = (sales_over_time['order_purchase_year'] == 2018) | (sales_over_time['order_purchase_year'] == 2017)

seventeen_eighteen_chart = alt.Chart(sales_over_time[mask]
).mark_area(opacity=0.6).encode(
    x=alt.X(
        "order_purchase_month_word:O",
        title="Month",
        sort=[
            "January","February","March","April","May","June",
            "July","August","September","October","November","December"
        ]
    ),
    y=alt.Y("order_id", title="Sales"),
    color=alt.Color(
        "order_purchase_year:O",
        title="Year",
        scale=alt.Scale(domain=[2017, 2018], range=['darkgreen', 'orange'])
    )
).properties(
    title='2017–2018 Sales by Month',
    width=1250,
    height=500
)

seventeen_eighteen_chart

# Next Steps
- Hypothesis forumalation for significance testing
- Correlation analysis and multivariate analysis