In [2]:
import pandas as pd 
import numpy as np

# Load the raw orders export. Every column is read as text (dtype=str) and only
# truly empty cells are treated as missing, so literal values such as "NA" are
# preserved as strings.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at the local copy of the file.
orders_csv_path = r"C:\Users\shaif\OneDrive\Desktop\gamezone_orders_data.csv"
df = pd.read_csv(
    orders_csv_path,
    sep=",",
    encoding="utf-8",
    keep_default_na=False,
    na_values=[""],
    dtype=str,
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21864 entries, 0 to 21863
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   USER_ID                          21864 non-null  object
 1   ORDER_ID                         21864 non-null  object
 2   PURCHASE_TS                      21864 non-null  object
 3   PURCHASE_TS_CLEANED              21863 non-null  object
 4   PURCHASE_YEAR                    21863 non-null  object
 5   PURCHASE_MONTH                   21863 non-null  object
 6   TIME_TO_SHIP                     21863 non-null  object
 7   SHIP_TS                          21864 non-null  object
 8   PRODUCT_NAME                     21864 non-null  object
 9   PRODUCT_NAME_CLEANED             21864 non-null  object
 10  PRODUCT_ID                       21864 non-null  object
 11   USD_PRICE                       21859 non-null  object
 12  PURCHASE_PLATFORM               

In [3]:
# data cleaning
#
# Normalises column names and converts the text columns loaded from the CSV
# into proper datetime / numeric types, mutating `df` in place.

# Column names: lower-case first, then trim surrounding whitespace.
df.columns = [col.lower() for col in df.columns]
df.columns = df.columns.str.strip()

# Timestamps: unparseable values become NaT instead of raising.
# NOTE(review): dayfirst=True assumes day-first dates in the export - confirm
# against the raw file.
df['purchase_ts_cleaned'] = pd.to_datetime(df['purchase_ts_cleaned'], errors='coerce', dayfirst=True)
df['ship_ts'] = pd.to_datetime(df['ship_ts'], errors='coerce', dayfirst=True)

# Revenue: drop "$" and thousands separators, then parse; anything that still
# is not a number becomes NaN.
df['revenue'] = df['revenue'].replace(r'[\$,]', '', regex=True)
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce')

# Region: trim whitespace and turn blank strings into missing values.
# The column is deliberately NOT cast with astype(str) first: that would turn
# missing values into the literal string 'nan', hiding them from isna() and
# making 'nan' a real category in later group-bys.
df['region'] = df['region'].str.strip().replace('', pd.NA)
df['region'].isna().sum()  # only rendered if it is the last expression of the cell

# Year / month / days-to-ship as plain ints. Missing values are filled with 0
# as a sentinel, so a year or month of 0 means "unknown", not a real date part.
df['purchase_year'] = df['purchase_year'].fillna(0).astype(int)
df['purchase_month'] = df['purchase_month'].fillna(0).astype(int)
df['time_to_ship'] = df['time_to_ship'].fillna(0).astype(int)

# Country codes: trim and upper-case. As with region, no astype(str) so that
# missing codes stay missing instead of becoming the string 'NAN'.
df['country_code'] = df['country_code'].str.strip().str.upper()

# Product names: trim and lower-case so spelling variants group together.
df['product_name_cleaned'] = df['product_name_cleaned'].str.strip().str.lower()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21864 entries, 0 to 21863
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   user_id                          21864 non-null  object        
 1   order_id                         21864 non-null  object        
 2   purchase_ts                      21864 non-null  object        
 3   purchase_ts_cleaned              21863 non-null  datetime64[ns]
 4   purchase_year                    21864 non-null  int32         
 5   purchase_month                   21864 non-null  int32         
 6   time_to_ship                     21864 non-null  int32         
 7   ship_ts                          21864 non-null  datetime64[ns]
 8   product_name                     21864 non-null  object        
 9   product_name_cleaned             21864 non-null  object        
 10  product_id                       21864 non-null  object   

In [4]:
# 1) Top-selling products: total revenue per product, highest first,
#    plus each product's percentage share of overall revenue.
top_selling_products = (
    df.groupby('product_name_cleaned', as_index=False)
    .agg(total_revenue=('revenue', 'sum'))
    .sort_values('total_revenue', ascending=False)
)
grand_total_revenue = top_selling_products['total_revenue'].sum()
top_selling_products['revenue_share_%'] = (
    top_selling_products['total_revenue'] / grand_total_revenue * 100
).round(2)

# 2) "Most profitable" products, ranked by mean revenue per order row.
#    Only revenue is used here; no cost figures enter the calculation.
profit_per_product = (
    df.groupby('product_name_cleaned', as_index=False)
    .agg(avg_revenue_per_order=('revenue', 'mean'))
    .sort_values('avg_revenue_per_order', ascending=False)
)

# 3) Average price per product, taken as the mean of the revenue column.
mean_revenue_by_product = df.groupby('product_name_cleaned')['revenue'].mean()
avg_price_per_product = mean_revenue_by_product.rename('avg_price').reset_index()

# 4) Product repeat purchase rate: % of a product's customers who placed more
#    than one distinct order for that product.

# One row per (product, customer) holding the number of distinct orders.
repeat_customers = (
    df.groupby(['product_name_cleaned', 'user_id'])['order_id']
    .nunique()
    .reset_index()
)

# Mean of the boolean "ordered more than once" flag per product, as a percentage.
# Written as a vectorised group-by on the flag instead of
# DataFrameGroupBy.apply(lambda ...): the apply form emits a pandas warning
# (echoed in this cell's output) and fails on an empty frame.
repeat_rate = (
    repeat_customers['order_id'].gt(1)
    .groupby(repeat_customers['product_name_cleaned'])
    .mean()
    .mul(100)
    .reset_index(name='repeat_purchase_rate_%')
)

# 5) Monthly revenue trend per product: revenue summed per
#    (product, year, month), ordered chronologically within each product.
#    Rows whose year/month were filled with 0 during cleaning form their own
#    (0, 0) group.
trend_keys = ['product_name_cleaned', 'purchase_year', 'purchase_month']
monthly_revenue_trend = (
    df.groupby(trend_keys, as_index=False)
    .agg(monthly_revenue=('revenue', 'sum'))
    .sort_values(trend_keys)
)

# 6) Product performance by channel and region: total revenue for every
#    (product, marketing channel, region) combination present in the data.
segment_revenue = df.groupby(
    ['product_name_cleaned', 'marketing_channel_cleaned', 'region']
)['revenue'].sum()
product_channel_region = segment_revenue.rename('total_revenue').reset_index()

# 7) Summary table: start from the revenue ranking and left-join each
#    per-product metric onto it by product name.
#    avg_revenue_per_order and avg_price are both the per-product mean of
#    revenue, so those two columns carry the same values.
product_summary = top_selling_products[['product_name_cleaned', 'total_revenue', 'revenue_share_%']]
for metric_table in (profit_per_product, avg_price_per_product, repeat_rate):
    product_summary = product_summary.merge(metric_table, on='product_name_cleaned', how='left')

product_summary

monthly_revenue_trend



  .apply(lambda x: (x['order_id'] > 1).mean() * 100)


Unnamed: 0,product_name_cleaned,purchase_year,purchase_month,monthly_revenue
0,27in 4k gaming monitor,2019,1,33501.10
1,27in 4k gaming monitor,2019,2,26409.22
2,27in 4k gaming monitor,2019,3,35169.23
3,27in 4k gaming monitor,2019,4,48754.08
4,27in 4k gaming monitor,2019,5,38973.22
...,...,...,...,...
170,sony playstation 5 bundle,2020,10,116500.71
171,sony playstation 5 bundle,2020,11,117975.57
172,sony playstation 5 bundle,2020,12,167269.38
173,sony playstation 5 bundle,2021,1,81970.33


In [5]:
# Write each result table to a CSV in the working directory (no index column).
for frame, csv_name in (
    (product_summary, 'product_summary.csv'),
    (monthly_revenue_trend, 'product_monthly_trend.csv'),
    (product_channel_region, 'product_channel_region.csv'),
):
    frame.to_csv(csv_name, index=False)

In [6]:
# 1) Top-selling products: total revenue per product, highest first,
#    plus each product's percentage share of overall revenue.
top_selling_products = (
    df.groupby('product_name_cleaned', as_index=False)
    .agg(total_revenue=('revenue', 'sum'))
    .sort_values('total_revenue', ascending=False)
)
grand_total_revenue = top_selling_products['total_revenue'].sum()
top_selling_products['revenue_share_%'] = (
    top_selling_products['total_revenue'] / grand_total_revenue * 100
).round(2)

# 2) Units sold per product, counted as the number of non-null order_id rows.
units_sold = (
    df.groupby('product_name_cleaned', as_index=False)
    .agg(units_sold=('order_id', 'count'))
)

# 3) Average price per product, taken as the mean of the revenue column.
mean_revenue_by_product = df.groupby('product_name_cleaned')['revenue'].mean()
avg_price_per_product = mean_revenue_by_product.rename('avg_price').reset_index()

# 4) Product repeat purchase rate: % of a product's customers who placed more
#    than one distinct order for that product.

# One row per (product, customer) holding the number of distinct orders.
repeat_customers = (
    df.groupby(['product_name_cleaned', 'user_id'])['order_id']
    .nunique()
    .reset_index()
)

# Mean of the boolean "ordered more than once" flag per product, as a percentage.
# Written as a vectorised group-by on the flag instead of
# DataFrameGroupBy.apply(lambda ...): the apply form emits a pandas warning
# (echoed in this cell's output) and fails on an empty frame.
repeat_rate = (
    repeat_customers['order_id'].gt(1)
    .groupby(repeat_customers['product_name_cleaned'])
    .mean()
    .mul(100)
    .reset_index(name='repeat_purchase_rate_%')
)

# 5) Unique customers per product: number of distinct user_id values.
distinct_buyers = df.groupby('product_name_cleaned')['user_id'].nunique()
unique_customers_per_product = distinct_buyers.rename('unique_customers').reset_index()

# 6) Product summary table: start from the revenue ranking and left-join each
#    per-product metric onto it by product name.
product_summary = top_selling_products[['product_name_cleaned', 'total_revenue', 'revenue_share_%']]
for metric_table in (units_sold, unique_customers_per_product, avg_price_per_product, repeat_rate):
    product_summary = product_summary.merge(metric_table, on='product_name_cleaned', how='left')

# Revenue rank: 1 = highest total revenue; dense ranking, so ties share a rank
# and no rank numbers are skipped.
dense_revenue_rank = product_summary['total_revenue'].rank(ascending=False, method='dense')
product_summary['revenue_rank'] = dense_revenue_rank.astype(int)

# 7) Monthly revenue trend, restricted to the ten products with the highest
#    total revenue to keep the table manageable.
top_10_products = top_selling_products['product_name_cleaned'].head(10).tolist()

is_top_product = df['product_name_cleaned'].isin(top_10_products)
trend_keys = ['product_name_cleaned', 'purchase_year', 'purchase_month']
monthly_revenue_trend = (
    df[is_top_product]
    .groupby(trend_keys, as_index=False)
    .agg(monthly_revenue=('revenue', 'sum'))
    .sort_values(trend_keys)
)

# 8) Product performance by channel and region for the top-10 products:
#    total revenue per (product, marketing channel, region), largest first.
top_product_orders = df[df['product_name_cleaned'].isin(top_10_products)]
product_channel_region = (
    top_product_orders
    .groupby(['product_name_cleaned', 'marketing_channel_cleaned', 'region'], as_index=False)
    .agg(total_revenue=('revenue', 'sum'))
    .sort_values('total_revenue', ascending=False)
)

# 9) Collect the three result tables in one dictionary keyed by display title.
product_kpis = {}
product_kpis["Product Summary"] = product_summary
product_kpis["Monthly Revenue Trend (Top 10)"] = monthly_revenue_trend
product_kpis["Product x Channel x Region (Top 10)"] = product_channel_region

product_kpis

  .apply(lambda x: (x['order_id'] > 1).mean() * 100)


{'Product Summary':              product_name_cleaned  total_revenue  revenue_share_%  units_sold  \
 0          27in 4k gaming monitor     1968565.34            32.00        4723   
 1                 nintendo switch     1658717.65            26.97       10386   
 2       sony playstation 5 bundle     1589243.85            25.84         977   
 3         lenovo ideapad gaming 3      735506.56            11.96         669   
 4  jbl quantum 100 gaming headset       96197.67             1.56        4296   
 5      acer nitro v gaming laptop       65661.18             1.07          87   
 6               dell gaming mouse       36490.01             0.59         719   
 7        razer pro gaming headset         884.23             0.01           7   
 
    unique_customers    avg_price  repeat_purchase_rate_%  revenue_rank  
 0              4358   417.511207                7.342818             1  
 1              9686   159.707072                6.989469             2  
 2               90

In [7]:
# Write each result table to a CSV in the working directory (no index column):
#   - main product summary
#   - monthly revenue trend for the top products
#   - product performance by channel and region
for frame, csv_name in (
    (product_summary, 'product_summary.csv'),
    (monthly_revenue_trend, 'product_monthly_trend.csv'),
    (product_channel_region, 'product_channel_region.csv'),
):
    frame.to_csv(csv_name, index=False)

In [8]:
# Display the per-product summary table as the cell's output.
product_summary

Unnamed: 0,product_name_cleaned,total_revenue,revenue_share_%,units_sold,unique_customers,avg_price,repeat_purchase_rate_%,revenue_rank
0,27in 4k gaming monitor,1968565.34,32.0,4723,4358,417.511207,7.342818,1
1,nintendo switch,1658717.65,26.97,10386,9686,159.707072,6.989469,2
2,sony playstation 5 bundle,1589243.85,25.84,977,900,1626.65696,8.444444,3
3,lenovo ideapad gaming 3,735506.56,11.96,669,617,1099.411898,8.265802,4
4,jbl quantum 100 gaming headset,96197.67,1.56,4296,4025,22.502379,6.509317,5
5,acer nitro v gaming laptop,65661.18,1.07,87,78,754.726207,11.538462,6
6,dell gaming mouse,36490.01,0.59,719,647,51.106457,10.819165,7
7,razer pro gaming headset,884.23,0.01,7,7,126.318571,0.0,8
