In [2]:
import os

import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Location of the cleaned orders export. Defaults to the original local path;
# set the GAMEZONE_ORDERS_CSV environment variable to run the notebook against
# a copy stored elsewhere without editing this cell.
file_path = os.environ.get(
    "GAMEZONE_ORDERS_CSV",
    r"E:\Projects\Gamezone Orders Data\Data\Cleaned\gamezone_orders_data_cleaned.csv",
)

# Load the orders table with explicit dtypes so the nullable integer columns
# and the two timestamp columns come back typed rather than as objects.
df = pd.read_csv(
    file_path,
    parse_dates=['purchase_ts_cleaned', 'ship_ts'],
    dtype={
        'purchase_year': 'Int64',
        'purchase_month': 'Int64',
        'time_to_ship': 'Int64',
        'revenue': 'float'
    },
    encoding='utf-8',
    keep_default_na=False,  # pandas' default NA markers (e.g. "NA") stay as text
    na_values=['']  # Only treat empty strings as NaN
)

In [3]:
# Schema overview: column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21864 entries, 0 to 21863
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   user_id                          21864 non-null  object        
 1   order_id                         21864 non-null  object        
 2   purchase_ts                      21864 non-null  object        
 3   purchase_ts_cleaned              21863 non-null  datetime64[ns]
 4   purchase_year                    21864 non-null  Int64         
 5   purchase_month                   21864 non-null  Int64         
 6   time_to_ship                     21864 non-null  Int64         
 7   ship_ts                          21864 non-null  datetime64[ns]
 8   product_name                     21864 non-null  object        
 9   product_name_cleaned             21864 non-null  object        
 10  product_id                       21864 non-null  object   

In [24]:
# Number of distinct order IDs; compare with the row count reported by
# df.info() to see whether any order_id value repeats.
# NOTE(review): this cell's execution count is out of sequence and the same
# check is repeated further down the notebook.
df['order_id'].nunique()

21717

In [5]:
# Monthly Active Buyers (MAB) with month-over-month and year-over-year growth.

# Ensure datetime (a no-op when the column is already datetime64)
df['purchase_ts_cleaned'] = pd.to_datetime(df['purchase_ts_cleaned'])

# Create year-month column; rows with a missing timestamp get NaT here and
# are excluded by the groupby below
df['year_month'] = df['purchase_ts_cleaned'].dt.to_period('M')

# --- 1. Calculate Monthly Active Buyers (unique users per month)
mab = (
    df.groupby('year_month')['user_id']
      .nunique()
      .rename('mab')
      .to_frame()
)

# pct_change() shifts by row position, not by date. Reindex onto a gap-free
# monthly range so that 1 row == 1 month and 12 rows == 1 year even when a
# month has no purchases (such a month has 0 active buyers).
if not mab.empty:
    all_months = pd.period_range(mab.index.min(), mab.index.max(), freq='M')
    mab = mab.reindex(all_months, fill_value=0).rename_axis('year_month')

# Convert PeriodIndex → month-start Timestamp for display
mab.index = mab.index.to_timestamp()

# --- 2. Calculate MoM % (growth from a zero base is undefined → NaN, not inf)
mab['mom_pct'] = (mab['mab'].pct_change() * 100).replace([np.inf, -np.inf], np.nan)

# --- 3. Calculate YoY % (same month, previous year)
mab['yoy_pct'] = (mab['mab'].pct_change(12) * 100).replace([np.inf, -np.inf], np.nan)

# --- Final table
mab = mab.reset_index().rename(columns={'year_month': 'month'})
mab


Unnamed: 0,month,mab,mom_pct,yoy_pct
0,2019-01-01,419,,
1,2019-02-01,318,-24.105012,
2,2019-03-01,518,62.893082,
3,2019-04-01,530,2.316602,
4,2019-05-01,501,-5.471698,
5,2019-06-01,474,-5.389222,
6,2019-07-01,501,5.696203,
7,2019-08-01,528,5.389222,
8,2019-09-01,577,9.280303,
9,2019-10-01,492,-14.731369,


In [6]:
# Monthly Active Buyers (MAB) with MoM / YoY change columns.
# NOTE(review): this cell repeats the MAB computation of the previous cell
# with different column names — consider keeping only one of them.

# Month key; rows with a missing timestamp get NaT and are excluded by groupby
df['year_month'] = df['purchase_ts_cleaned'].dt.to_period('M')

# --- 1. Monthly Active Buyers (unique users per month)
mab = (
    df.groupby('year_month')['user_id']
      .nunique()
      .rename('mab')
      .to_frame()
)

# pct_change() shifts by row position, so make the index gap-free first:
# every month between the first and last observed one must be present
# (a month without purchases has 0 active buyers).
if not mab.empty:
    all_months = pd.period_range(mab.index.min(), mab.index.max(), freq='M')
    mab = mab.reindex(all_months, fill_value=0).rename_axis('year_month')

# Convert PeriodIndex → month-start Timestamp for display
mab.index = mab.index.to_timestamp()

# --- 2. MoM change % (growth from a zero base is undefined → NaN, not inf)
mab['mom_change_pct'] = (
    (mab['mab'].pct_change() * 100).replace([np.inf, -np.inf], np.nan)
)

# --- 3. YoY change % (12-month difference)
mab['yoy_change_pct'] = (
    (mab['mab'].pct_change(12) * 100).replace([np.inf, -np.inf], np.nan)
)

# Final tidy format
mab = mab.reset_index().rename(columns={'year_month': 'month'})
mab


Unnamed: 0,month,mab,mom_change_pct,yoy_change_pct
0,2019-01-01,419,,
1,2019-02-01,318,-24.105012,
2,2019-03-01,518,62.893082,
3,2019-04-01,530,2.316602,
4,2019-05-01,501,-5.471698,
5,2019-06-01,474,-5.389222,
6,2019-07-01,501,5.696203,
7,2019-08-01,528,5.389222,
8,2019-09-01,577,9.280303,
9,2019-10-01,492,-14.731369,


In [7]:
# Monthly Active Buyers (MAB) with MoM / YoY change columns.
# NOTE(review): this cell repeats the MAB computation of the two preceding
# cells — consider keeping a single copy. pandas is already imported in the
# notebook's first cell, so it is not re-imported here.

# Ensure datetime (a no-op when the column is already datetime64)
df['purchase_ts_cleaned'] = pd.to_datetime(df['purchase_ts_cleaned'])

# Month key; rows with a missing timestamp get NaT and are excluded by groupby
df['year_month'] = df['purchase_ts_cleaned'].dt.to_period('M')

# --- 1. Monthly Active Buyers (unique users per month)
mab = (
    df.groupby('year_month')['user_id']
      .nunique()
      .rename('mab')
      .to_frame()
)

# pct_change() works on row offsets, so the 1-row and 12-row shifts below are
# only "one month" / "one year" when no calendar month is missing. Fill any
# gap with 0 buyers before computing the changes.
if not mab.empty:
    all_months = pd.period_range(mab.index.min(), mab.index.max(), freq='M')
    mab = mab.reindex(all_months, fill_value=0).rename_axis('year_month')

# Convert PeriodIndex → month-start Timestamp for display
mab.index = mab.index.to_timestamp()

# --- 2. MoM change % (growth from a zero base is undefined → NaN, not inf)
mab['mom_change_pct'] = (
    (mab['mab'].pct_change() * 100).replace([np.inf, -np.inf], np.nan)
)

# --- 3. YoY change % (12-month difference)
mab['yoy_change_pct'] = (
    (mab['mab'].pct_change(12) * 100).replace([np.inf, -np.inf], np.nan)
)

# Final tidy format
mab = mab.reset_index().rename(columns={'year_month': 'month'})
mab


Unnamed: 0,month,mab,mom_change_pct,yoy_change_pct
0,2019-01-01,419,,
1,2019-02-01,318,-24.105012,
2,2019-03-01,518,62.893082,
3,2019-04-01,530,2.316602,
4,2019-05-01,501,-5.471698,
5,2019-06-01,474,-5.389222,
6,2019-07-01,501,5.696203,
7,2019-08-01,528,5.389222,
8,2019-09-01,577,9.280303,
9,2019-10-01,492,-14.731369,


In [8]:
# Number of distinct order IDs (excludes missing values).
# NOTE(review): the same check already appears earlier in the notebook.
df['order_id'].nunique()

21717

In [9]:
# Row count per cleaned product name, most frequent first

df["product_name_cleaned"].value_counts()

product_name_cleaned
nintendo switch                   10386
27in 4k gaming monitor             4723
jbl quantum 100 gaming headset     4296
sony playstation 5 bundle           977
dell gaming mouse                   719
lenovo ideapad gaming 3             669
acer nitro v gaming laptop           87
razer pro gaming headset              7
Name: count, dtype: int64

In [10]:
# Number of distinct cleaned product names
df['product_name_cleaned'].nunique()


8

In [11]:
# Number of distinct country codes present in the data

df['country_code'].nunique()

152

In [12]:
# Five most frequent country codes by row count

df['country_code'].value_counts().iloc[:5]

country_code
US    10294
GB     1808
CA      956
AU      893
DE      855
Name: count, dtype: int64

In [13]:
# Row count per cleaned marketing channel, most frequent first

df['marketing_channel_cleaned'].value_counts()

marketing_channel_cleaned
direct          17434
email            3256
affiliate         721
social media      323
unknown           130
Name: count, dtype: int64

In [14]:
# Row count per purchase platform, most frequent first

df['purchase_platform'].value_counts()

purchase_platform
website       19783
mobile app     2081
Name: count, dtype: int64

In [15]:
# Row count per cleaned account-creation method, most frequent first

df['account_creation_method_cleaned'].value_counts()

account_creation_method_cleaned
desktop    16458
mobile      4232
unknown      826
tablet       323
tv            25
Name: count, dtype: int64

In [16]:
# Headline KPI: sum of the revenue column over every row (NaN values are skipped)
total_revenue = df['revenue'].agg('sum')

total_revenue

6151266.49

In [17]:
# Headline KPI: number of rows with a non-null order_id.
# NOTE(review): this counts rows, not distinct orders — the nunique() check
# elsewhere in this notebook returns a smaller figure; confirm which is intended.
total_orders = df['order_id'].notna().sum()
total_orders

21864

In [18]:
# Headline KPI: number of distinct user IDs (missing values excluded)
unique_customers = len(df['user_id'].dropna().unique())
unique_customers

19851

In [19]:
# Average order value: total revenue divided by the order count computed above
average_order_value = total_revenue / total_orders

# Shown to two decimals; the stored value keeps full precision
round(average_order_value, 2)

281.34

In [20]:
# Most recent purchase timestamp in the data (missing timestamps are ignored)
df['purchase_ts_cleaned'].max()

Timestamp('2021-02-28 00:00:00')

In [21]:
# Per-calendar-month rollup: revenue, order rows, distinct buyers and AOV.
# Rows whose purchase_month is 0 (invalid month) are excluded before grouping.
# NOTE(review): the key is the month number only, so the same month from
# different years is pooled into one row — confirm this is intended.
monthly_summary = (
    df.loc[df['purchase_month'].ne(0)]
    .groupby('purchase_month')
    .agg(
        total_revenue=pd.NamedAgg(column='revenue', aggfunc='sum'),
        total_orders=pd.NamedAgg(column='order_id', aggfunc='count'),
        unique_customers=pd.NamedAgg(column='user_id', aggfunc='nunique'),
    )
    .assign(
        # Average order value per month, rounded for presentation
        aov=lambda g: g['total_revenue'].div(g['total_orders']).round(2),
        # Month number (1-12) → three-letter label
        month_name=lambda g: g.index.map(
            dict(enumerate(
                ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
                start=1,
            ))
        ),
    )
    .reset_index()
    .loc[:, ['purchase_month', 'month_name', 'total_revenue',
             'total_orders', 'unique_customers', 'aov']]
)

monthly_summary

Unnamed: 0,purchase_month,month_name,total_revenue,total_orders,unique_customers,aov
0,1,Jan,499725.27,1821,1465,274.42
1,2,Feb,514735.06,1825,1818,282.05
2,3,Mar,417653.1,1509,1504,276.77
3,4,Apr,472963.91,1723,1712,274.5
4,5,May,468890.58,1727,1725,271.51
5,6,Jun,433190.52,1617,1596,267.9
6,7,Jul,448530.84,1693,1573,264.93
7,8,Aug,525416.75,1797,1654,292.39
8,9,Sep,618444.77,2177,1860,284.08
9,10,Oct,464867.06,1549,1520,300.11


In [22]:
# Peek at the first few purchase_year values (nullable Int64 column)
df['purchase_year'].head()

0    2021
1    2021
2    2021
3    2021
4    2021
Name: purchase_year, dtype: Int64

In [23]:
# Persist the calendar-month summary to the working directory. The ".csv"
# suffix was missing, which left an extension-less file that spreadsheet tools
# do not recognise; the frame's positional index is not written.
monthly_summary.to_csv("monthly_summary.csv", index=False)