# Data Viz

In [21]:
# imports
import pandas as pd 
import numpy as np 
import altair as alt

In [22]:
# Load the customer table from the Excel export and preview the first rows.
cust = pd.read_excel(io='../data_full/Customers.xlsx')
cust.head(n=5)

Unnamed: 0,id,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source
0,1,Eveleen,Erat,F,1977-02-01 00:00:00,1.0,1.0,Newspaper
1,2,Micheil,Fransseni,M,1980-06-17 00:00:00,1.0,0.0,Social
2,4,Carin,Oulett,F,1973-03-14 00:00:00,1.0,0.0,Referral
3,8,Mallory,McShane,F,1968-03-24 00:00:00,0.0,1.0,Social
4,9,Billy,Labat,M,1979-07-28 00:00:00,0.0,1.0,Newspaper


In [23]:
# Load the order header table from the Excel export and preview the first rows.
order = pd.read_excel(io='../data_full/OrderInfo.xlsx')
order.head(n=5)

Unnamed: 0,CustomerID,LocationID,Date,Time,EmployeeID,OrderID
0,445,L01,2017-02-23,14:56:00,3,1
1,450,L01,2017-01-03,16:53:00,3,2
2,445,L01,2017-01-04,14:11:00,3,3
3,428,L01,2017-01-19,14:57:00,1,4
4,462,L01,2017-02-20,16:55:00,2,5


In [24]:
# Attach customer attributes to every order. The right join keeps all rows of
# `order`, including any whose customer is absent from `cust`.
customer_order_df = (
    cust
    .merge(order, left_on='id', right_on='CustomerID', how='right')
    # Keep the order-side key. The customer-side `id` is NaN for orders with no
    # matching customer, so using it as CustomerID would lose those IDs and
    # upcast the column to float.
    .drop(columns=['id'])
)

# Lead with CustomerID, ahead of the customer attributes.
customer_order_df = customer_order_df[
    ['CustomerID'] + [col for col in customer_order_df.columns if col != 'CustomerID']
]

customer_order_df

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
0,445,Worthington,Dewen,M,1966-09-23 00:00:00,0.0,0.0,Social,L01,2017-02-23,14:56:00,3,1
1,450,Trip,Lago,M,1980-09-27 00:00:00,1.0,0.0,Referral,L01,2017-01-03,16:53:00,3,2
2,445,Worthington,Dewen,M,1966-09-23 00:00:00,0.0,0.0,Social,L01,2017-01-04,14:11:00,3,3
3,428,Ibrahim,Handscomb,M,1958-10-20 00:00:00,1.0,1.0,Newspaper,L01,2017-01-19,14:57:00,1,4
4,462,Vernor,Beurich,M,1992-06-13 00:00:00,0.0,1.0,Social,L01,2017-02-20,16:55:00,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61749,62773,Anna,Powers,F,1969-11-11,0.0,0.0,Social,L01,2025-12-31,17:44:27,6,53814
61750,58612,Steven,Schneider,M,1988-03-15,1.0,1.0,WalkIn,L10,2025-12-31,10:00:12,49,50740
61751,41961,Bradford,Jackalin,M,,,,,L08,2025-12-31,16:55:55,41,57592
61752,35991,Cori,Moreinu,F,1971-08-18 00:00:00,1.0,1.0,Referral,L06,2025-12-31,17:16:05,34,50927


In [25]:
# Load the line-item sales table from CSV and preview the first rows.
sales = pd.read_csv(filepath_or_buffer='../data_full/LineItemSales.csv')
sales.head(n=5)

Unnamed: 0,LineItemID,OrderID,ItemID,Qty,DiscountID,Date,LocationID
0,1,1,20,1,D3,2017-02-23,L01
1,2,1,2,1,,2017-02-23,L01
2,3,1,53,1,,2017-02-23,L01
3,4,2,18,1,,2017-01-03,L01
4,5,2,41,1,,2017-01-03,L01


In [26]:
# Show only the line items recorded for location L15.
sales[sales['LocationID'] == 'L15']

Unnamed: 0,LineItemID,OrderID,ItemID,Qty,DiscountID,Date,LocationID
126632,126632,35266,21,1,,2025-01-02,L15
126652,126652,58473,47,1,,2025-01-02,L15
126693,126693,35231,7,1,,2025-01-02,L15
126719,126719,35287,6,1,,2025-01-03,L15
126731,126731,35364,17,1,,2025-01-03,L15
...,...,...,...,...,...,...,...
161833,161833,35395,9,1,,2025-12-27,L15
161834,161834,35395,56,1,,2025-12-27,L15
161837,161837,60035,1,1,,2025-12-27,L15
161838,161838,60035,58,1,,2025-12-27,L15


In [27]:
# Parse Date into datetimes; values that cannot be parsed become NaT.
sales["Date"] = pd.to_datetime(sales["Date"], errors="coerce")

# Per (Year, LocationID): total quantity sold, number of line-item rows,
# and number of distinct orders.
sales_yoy = (
    sales
    .assign(Year=lambda frame: frame["Date"].dt.year)
    .groupby(by=["Year", "LocationID"], as_index=False)
    .agg(
        total_qty=pd.NamedAgg(column="Qty", aggfunc="sum"),
        line_items=pd.NamedAgg(column="LineItemID", aggfunc="count"),
        orders=pd.NamedAgg(column="OrderID", aggfunc="nunique"),
    )
)

# Sanity check: list every location that is present after the groupby.
# To inspect specific locations (e.g. L12-L15), filter sales_yoy with
# sales_yoy["LocationID"].isin([...]) and sort by LocationID, Year.
sales_yoy["LocationID"].unique()

array(['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09',
       'L10', 'L11', 'L12', 'L13', 'L14', 'L15'], dtype=object)

In [28]:
# Year-over-year line chart of total quantity sold, one line per location.
chart = (
    alt.Chart(sales_yoy)
    .properties(title="Year-over-Year Sales (Total Qty) by Location", width=650, height=350)
    .mark_line(point=True)
    .encode(
        x=alt.X("Year:O", title="Year"),
        y=alt.Y("total_qty:Q", title="Total Qty Sold"),
        color=alt.Color("LocationID:N", title="Location"),
        # Hover details, listed as (field shorthand, label) pairs.
        tooltip=[
            alt.Tooltip(field_spec, title=label)
            for field_spec, label in (
                ("Year:O", "Year"),
                ("LocationID:N", "Location"),
                ("total_qty:Q", "Total Qty"),
                ("orders:Q", "Unique Orders"),
                ("line_items:Q", "Line Items"),
            )
        ],
    )
)

chart

In [29]:
# Step 1: per-location totals across all dates: quantity sold, line-item rows,
# and distinct orders.
sales_by_loc = (
    sales
    .groupby(by="LocationID", as_index=False)
    .agg(
        total_qty=pd.NamedAgg(column="Qty", aggfunc="sum"),
        line_items=pd.NamedAgg(column="LineItemID", aggfunc="count"),
        orders=pd.NamedAgg(column="OrderID", aggfunc="nunique"),
    )
)

# Step 2: one bar per location, bar height = total quantity sold.
chart = (
    alt.Chart(sales_by_loc)
    .properties(title="Sales (Total Qty) by Location", width=500, height=300)
    .mark_bar()
    .encode(
        x=alt.X("LocationID:N", title="Location"),
        y=alt.Y("total_qty:Q", title="Total Qty Sold"),
        # Hover details, listed as (field shorthand, label) pairs.
        tooltip=[
            alt.Tooltip(field_spec, title=label)
            for field_spec, label in (
                ("LocationID:N", "Location"),
                ("total_qty:Q", "Total Qty"),
                ("orders:Q", "Unique Orders"),
                ("line_items:Q", "Line Items"),
            )
        ],
    )
)

chart

In [30]:
# 1) Aggregate customer activity per location.
#    - line_items: number of order rows with a non-null CustomerID (name kept
#      for compatibility; it does not count sales line items)
#    - orders:     number of distinct orders
#    - customers:  number of distinct customers
cust_by_loc = (
    customer_order_df
    .groupby("LocationID", as_index=False)
    .agg(
        line_items=("CustomerID", "count"),
        orders=("OrderID", "nunique"),
        customers=("CustomerID", "nunique"),
    )
)

# 2) Bar chart: unique customers by location.
#    Chart the customer aggregate built above (not the sales aggregate), and
#    only reference columns that exist in it.
chart = (
    alt.Chart(cust_by_loc)
    .mark_bar()
    .encode(
        x=alt.X("LocationID:N", title="Location"),
        y=alt.Y("customers:Q", title="Unique Customers"),
        tooltip=[
            alt.Tooltip("LocationID:N", title="Location"),
            alt.Tooltip("customers:Q", title="Unique Customers"),
            alt.Tooltip("orders:Q", title="Unique Orders"),
        ],
    )
    .properties(title="Unique Customers by Location", width=500, height=300)
)

chart

In [31]:
# pandas (pd) and altair (alt) are imported once in the imports cell at the top
# of the notebook, so they are not re-imported here.

# Parse Date into datetimes; values that cannot be parsed become NaT.
customer_order_df["Date"] = pd.to_datetime(customer_order_df["Date"], errors="coerce")

# Per (Year, LocationID): distinct orders, distinct customers, and the number of
# rows with a non-null CustomerID.
cust_yoy = (
    customer_order_df
    .dropna(subset=["Date"])  # drop undated rows so no Year=NaN group is formed
    .assign(Year=lambda df: df["Date"].dt.year)
    .groupby(["Year", "LocationID"], as_index=False)
    .agg(
        orders=("OrderID", "nunique"),
        customers=("CustomerID", "nunique"),
        rows=("CustomerID", "count"),
    )
)

# Year-over-year line chart of unique customers, one line per location.
chart = (
    alt.Chart(cust_yoy)
    .mark_line(point=True)
    .encode(
        x=alt.X("Year:O", title="Year"),
        y=alt.Y("customers:Q", title="Unique Customers"),
        color=alt.Color("LocationID:N", title="Location"),
        tooltip=[
            alt.Tooltip("Year:O", title="Year"),
            alt.Tooltip("LocationID:N", title="Location"),
            alt.Tooltip("customers:Q", title="Unique Customers"),
            alt.Tooltip("orders:Q", title="Unique Orders"),
            alt.Tooltip("rows:Q", title="Rows"),
        ],
    )
    .properties(title="Year-over-Year Unique Customers by Location", width=650, height=350)
)

chart