In [7]:
import pandas as pd 
import numpy as np

df = pd.read_csv(r"C:\Users\shaif\OneDrive\Desktop\gamezone_orders_data.csv", sep=",", encoding="utf-8",keep_default_na=False, na_values=[""], dtype=str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21864 entries, 0 to 21863
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   USER_ID                          21864 non-null  object
 1   ORDER_ID                         21864 non-null  object
 2   PURCHASE_TS                      21864 non-null  object
 3   PURCHASE_TS_CLEANED              21863 non-null  object
 4   PURCHASE_YEAR                    21863 non-null  object
 5   PURCHASE_MONTH                   21863 non-null  object
 6   TIME_TO_SHIP                     21863 non-null  object
 7   SHIP_TS                          21864 non-null  object
 8   PRODUCT_NAME                     21864 non-null  object
 9   PRODUCT_NAME_CLEANED             21864 non-null  object
 10  PRODUCT_ID                       21864 non-null  object
 11   USD_PRICE                       21859 non-null  object
 12  PURCHASE_PLATFORM               

In [8]:
# data cleaning 

df.columns = [col.lower() for col in df.columns] # lowering all column names 

df.columns = df.columns.str.strip()  # removes spaces around names

df['purchase_ts_cleaned'] = pd.to_datetime(df['purchase_ts_cleaned'], errors='coerce', dayfirst=True) # converting purchase_ts 

df['ship_ts'] = pd.to_datetime(df['ship_ts'], errors='coerce', dayfirst=True) #  Convert to datetime

df['revenue'] = df['revenue'].replace(r'[\$,]', '', regex=True)
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce')

df['region'] = df['region'].astype(str).str.strip().replace('', pd.NA)
df['region'].isna().sum()

df['purchase_year'] = df['purchase_year'].fillna(0).astype(int)
df['purchase_month'] = df['purchase_month'].fillna(0).astype(int)
df['time_to_ship'] = df['time_to_ship'].fillna(0).astype(int)

df['country_code'] = df['country_code'].astype(str).str.strip().str.upper() #cleaning the countries 

df['product_name_cleaned'] = df['product_name_cleaned'].str.strip().str.lower() # cleaning product name 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21864 entries, 0 to 21863
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   user_id                          21864 non-null  object        
 1   order_id                         21864 non-null  object        
 2   purchase_ts                      21864 non-null  object        
 3   purchase_ts_cleaned              21863 non-null  datetime64[ns]
 4   purchase_year                    21864 non-null  int32         
 5   purchase_month                   21864 non-null  int32         
 6   time_to_ship                     21864 non-null  int32         
 7   ship_ts                          21864 non-null  datetime64[ns]
 8   product_name                     21864 non-null  object        
 9   product_name_cleaned             21864 non-null  object        
 10  product_id                       21864 non-null  object   

In [4]:
# --- üë• CUSTOMER & USER KPIs ---

import pandas as pd

# ‚úÖ 1Ô∏è‚É£ TOTAL CUSTOMERS
total_customers = df['user_id'].nunique()

# ‚úÖ 2Ô∏è‚É£ REPEAT PURCHASE RATE
orders_per_customer = df.groupby('user_id')['order_id'].nunique()
repeat_customers = orders_per_customer[orders_per_customer > 1].count()
repeat_purchase_rate = (repeat_customers / total_customers) * 100

# ‚úÖ 3Ô∏è‚É£ AVERAGE ORDERS PER CUSTOMER
avg_orders_per_customer = df['order_id'].nunique() / total_customers

# ‚úÖ 4Ô∏è‚É£ CUSTOMER DISTRIBUTION BY COUNTRY / REGION
customer_distribution = (
    df.groupby('region')['user_id']
    .nunique()
    .reset_index()
    .rename(columns={'user_id': 'unique_customers'})
)
customer_distribution['customer_share_%'] = (
    customer_distribution['unique_customers'] / total_customers * 100
)

# ‚úÖ 5Ô∏è‚É£ CUSTOMER ACQUISITION METHOD SPLIT
acquisition_split = (
    df.groupby('account_creation_method_cleaned')['user_id']
    .nunique()
    .reset_index()
    .rename(columns={'user_id': 'unique_customers'})
)
acquisition_split['share_%'] = (
    acquisition_split['unique_customers'] / total_customers * 100
)

# ‚úÖ 6Ô∏è‚É£ CREATE SUMMARY TABLE
summary_data = {
    'Metric': [
        'Total Customers',
        'Repeat Purchase Rate (%)',
        'Average Orders per Customer'
    ],
    'Value': [
        total_customers,
        round(repeat_purchase_rate, 2),
        round(avg_orders_per_customer, 2)
    ]
}

customer_summary = pd.DataFrame(summary_data)

# ‚úÖ 7Ô∏è‚É£ MERGE ALL INTO ONE DATAFRAME
# Add region distribution and acquisition split under the same CSV for one-stop view
customer_kpis = {
    "Summary": customer_summary,
    "Customer Distribution by Region": customer_distribution,
    "Customer Acquisition Method Split": acquisition_split
}

customer_kpis



{'Summary':                         Metric     Value
 0              Total Customers  19851.00
 1     Repeat Purchase Rate (%)      9.34
 2  Average Orders per Customer      1.09,
 'Customer Distribution by Region':     region  unique_customers  customer_share_%
 0     APAC              1430          7.203667
 1     EMEA              8640         43.524256
 2    LATAM               356          1.793361
 3       NA              9388         47.292328
 4  unknown                37          0.186389,
 'Customer Acquisition Method Split':   account_creation_method_cleaned  unique_customers    share_%
 0                         desktop             14900  75.059191
 1                          mobile              3992  20.109818
 2                          tablet               304   1.531409
 3                              tv                22   0.110826
 4                         unknown               746   3.757997}

In [5]:
# Combine all three into one single CSV file
with open('customer_kpis.csv', 'w', newline='') as f:
    f.write("üë• CUSTOMER SUMMARY\n")
    customer_summary.to_csv(f, index=False)
    f.write("\n\nüåç CUSTOMER DISTRIBUTION BY REGION\n")
    customer_distribution.to_csv(f, index=False)
    f.write("\n\nüß≠ CUSTOMER ACQUISITION METHOD SPLIT\n")
    acquisition_split.to_csv(f, index=False)

In [None]:
# --- üïí CUSTOMER FIRST PURCHASE & COHORT ANALYSIS ---

# ‚úÖ 1Ô∏è‚É£ Compute each customer's first purchase date
first_purchase = (
    df.groupby('user_id')['purchase_ts_cleaned']
    .min()
    .reset_index()
    .rename(columns={'purchase_ts_cleaned': 'first_purchase_date'})
)

# ‚úÖ 2Ô∏è‚É£ Drop old column if it already exists (avoid merge conflicts)
if 'first_purchase_date' in df.columns:
    df = df.drop(columns=['first_purchase_date'])

# ‚úÖ 3Ô∏è‚É£ Merge back into main dataframe
df = df.merge(first_purchase, on='user_id', how='left')

# ‚úÖ 4Ô∏è‚É£ Create cohort fields based on first purchase
df['first_purchase_month'] = df['first_purchase_date'].dt.to_period('M')
df['first_purchase_year'] = df['first_purchase_date'].dt.year

# ‚úÖ 5Ô∏è‚É£ Cohort-level performance summary
cohort_summary = (
    df.groupby('first_purchase_month')
    .agg(
        unique_customers=('user_id', 'nunique'),
        total_orders=('order_id', 'count'),
        total_revenue=('revenue', 'sum')
    )
    .reset_index()
)

# ‚úÖ 6Ô∏è‚É£ Add derived metrics
cohort_summary['avg_orders_per_customer'] = (
    cohort_summary['total_orders'] / cohort_summary['unique_customers']
)
cohort_summary['avg_revenue_per_customer'] = (
    cohort_summary['total_revenue'] / cohort_summary['unique_customers']
)

# ‚úÖ 7Ô∏è‚É£ Calculate repeat rate by first purchase cohort
repeat_check = (
    df.groupby(['first_purchase_month', 'user_id'])['order_id']
    .nunique()
    .reset_index()
)
repeat_check['is_repeat'] = repeat_check['order_id'] > 1

cohort_repeat_summary = (
    repeat_check.groupby('first_purchase_month')['is_repeat']
    .agg(['sum', 'count'])
    .reset_index()
)
cohort_repeat_summary['repeat_rate_%'] = (
    cohort_repeat_summary['sum'] / cohort_repeat_summary['count'] * 100
)
cohort_repeat_summary = cohort_repeat_summary.rename(
    columns={'sum': 'repeat_customers', 'count': 'total_customers'}
)

# ‚úÖ 8Ô∏è‚É£ Merge repeat rate into cohort summary
cohort_summary = cohort_summary.merge(
    cohort_repeat_summary[['first_purchase_month', 'repeat_rate_%']],
    on='first_purchase_month',
    how='left'
)

# ‚úÖ 9Ô∏è‚É£ Final cleanup
cohort_summary = cohort_summary.sort_values('first_purchase_month').reset_index(drop=True)

cohort_summary

Unnamed: 0,first_purchase_month,unique_customers,total_orders,total_revenue,avg_orders_per_customer,avg_revenue_per_customer,repeat_rate_%
0,2019-01,419,485,113142.0,1.157518,270.02864,14.797136
1,2019-02,318,336,86619.15,1.056604,272.387264,5.345912
2,2019-03,514,538,123695.48,1.046693,240.652685,4.280156
3,2019-04,522,548,125321.63,1.049808,240.079751,4.40613
4,2019-05,495,515,130048.64,1.040404,262.724525,3.434343
5,2019-06,467,486,111887.38,1.040685,239.587537,3.640257
6,2019-07,497,568,134067.33,1.142857,269.753179,13.078471
7,2019-08,514,576,132198.05,1.120623,257.19465,11.284047
8,2019-09,568,686,164051.75,1.207746,288.823504,19.366197
9,2019-10,479,508,125942.54,1.060543,262.928058,5.010438


In [None]:
cohort_summary.to_csv("cohort_summary.csv", index=False, encoding="utf-8")


In [None]:
# --- üë• CUSTOMER & USER KPIs ---

# ‚úÖ 1Ô∏è‚É£ TOTAL CUSTOMERS
total_customers = df['user_id'].nunique()

# ‚úÖ 2Ô∏è‚É£ REPEAT PURCHASE RATE
orders_per_customer = df.groupby('user_id')['order_id'].nunique()
repeat_customers = orders_per_customer[orders_per_customer > 1].count()
repeat_purchase_rate = (repeat_customers / total_customers) * 100

# ‚úÖ 3Ô∏è‚É£ AVERAGE ORDERS PER CUSTOMER
avg_orders_per_customer = df['order_id'].nunique() / total_customers

# ‚úÖ 4Ô∏è‚É£ CUSTOMER LIFETIME VALUE (CLV)
customer_ltv = (
    df.groupby('user_id')['revenue']
    .sum()
    .reset_index()
    .rename(columns={'revenue': 'lifetime_value'})
)
avg_ltv = customer_ltv['lifetime_value'].mean()
median_ltv = customer_ltv['lifetime_value'].median()

# ‚úÖ 5Ô∏è‚É£ CUSTOMER SEGMENTATION (by purchase frequency)
customer_segments = pd.cut(
    orders_per_customer,
    bins=[0, 1, 2, 5, float('inf')],
    labels=['One-time (1)', 'Low (2)', 'Medium (3-5)', 'High (6+)'],
    right=True
)
segment_distribution = (
    customer_segments.value_counts()
    .reset_index()
    .rename(columns={'index': 'segment', 'count': 'customers'})
)
segment_distribution['share_%'] = (
    segment_distribution['customers'] / total_customers * 100
)

# ‚úÖ 6Ô∏è‚É£ TOP CUSTOMERS (High-value customers)
top_customers = (
    customer_ltv.nlargest(100, 'lifetime_value')
    .agg({'lifetime_value': ['sum', 'mean', 'count']})
)
top_100_revenue = top_customers.loc['sum', 'lifetime_value']
top_100_share = (top_100_revenue / df['revenue'].sum()) * 100

# ‚úÖ 7Ô∏è‚É£ CUSTOMER DISTRIBUTION BY REGION
customer_distribution = (
    df.groupby('region')['user_id']
    .nunique()
    .reset_index()
    .rename(columns={'user_id': 'unique_customers'})
)
customer_distribution['customer_share_%'] = (
    customer_distribution['unique_customers'] / total_customers * 100
)

# Add revenue metrics by region
region_revenue = (
    df.groupby('region')
    .agg(
        total_revenue=('revenue', 'sum'),
        total_orders=('order_id', 'count')
    )
    .reset_index()
)
customer_distribution = customer_distribution.merge(region_revenue, on='region', how='left')
customer_distribution['revenue_per_customer'] = (
    customer_distribution['total_revenue'] / customer_distribution['unique_customers']
)
customer_distribution['orders_per_customer'] = (
    customer_distribution['total_orders'] / customer_distribution['unique_customers']
)

# ‚úÖ 8Ô∏è‚É£ CUSTOMER ACQUISITION METHOD SPLIT
acquisition_split = (
    df.groupby('account_creation_method_cleaned')['user_id']
    .nunique()
    .reset_index()
    .rename(columns={'user_id': 'unique_customers'})
)
acquisition_split['share_%'] = (
    acquisition_split['unique_customers'] / total_customers * 100
)

# Add performance metrics by acquisition method
acquisition_performance = (
    df.groupby('account_creation_method_cleaned')
    .agg(
        total_revenue=('revenue', 'sum'),
        total_orders=('order_id', 'count')
    )
    .reset_index()
)
acquisition_split = acquisition_split.merge(
    acquisition_performance,
    on='account_creation_method_cleaned',
    how='left'
)
acquisition_split['revenue_per_customer'] = (
    acquisition_split['total_revenue'] / acquisition_split['unique_customers']
)
acquisition_split['orders_per_customer'] = (
    acquisition_split['total_orders'] / acquisition_split['unique_customers']
)

# ‚úÖ 9Ô∏è‚É£ CREATE FINAL SUMMARY TABLE
summary_data = {
    'Metric': [
        'Total Customers',
        'Repeat Purchase Rate (%)',
        'Average Orders per Customer',
        'Average Lifetime Value ($)',
        'Median Lifetime Value ($)',
        'Top 100 Customers Revenue Share (%)',
        'One-time Customers (%)',
        'Repeat Customers (%)'
    ],
    'Value': [
        total_customers,
        round(repeat_purchase_rate, 2),
        round(avg_orders_per_customer, 2),
        round(avg_ltv, 2),
        round(median_ltv, 2),
        round(top_100_share, 2),
        round(100 - repeat_purchase_rate, 2),
        round(repeat_purchase_rate, 2)
    ]
}
customer_summary = pd.DataFrame(summary_data)

# ‚úÖ üîü MERGE ALL INTO ONE DICTIONARY
customer_kpis = {
    "Summary": customer_summary,
    "Customer Segments by Frequency": segment_distribution,
    "Customer Distribution by Region": customer_distribution,
    "Customer Acquisition Method Split": acquisition_split
}




In [None]:
customer_kpis

{'Summary':                                 Metric     Value
 0                      Total Customers  19851.00
 1             Repeat Purchase Rate (%)      9.34
 2          Average Orders per Customer      1.09
 3           Average Lifetime Value ($)    309.87
 4            Median Lifetime Value ($)    168.00
 5  Top 100 Customers Revenue Share (%)      5.10
 6               One-time Customers (%)     90.66
 7                 Repeat Customers (%)      9.34,
 'Customer Segments by Frequency':        order_id  customers    share_%
 0  One-time (1)      17996  90.655383
 1       Low (2)       1741   8.770339
 2  Medium (3-5)        114   0.574278
 3     High (6+)          0   0.000000,
 'Customer Distribution by Region':     region  unique_customers  customer_share_%  total_revenue  total_orders  \
 0     APAC              1430          7.203667      531226.11          1588   
 1     EMEA              8640         43.524256     2562575.68          9515   
 2    LATAM               356    

In [None]:
# ‚úÖ 1Ô∏è‚É£1Ô∏è‚É£ EXPORT TO CSV (all in one file with headers separating sections)
with open("customer_kpis.csv", "w", encoding="utf-8") as f:
    for name, data in customer_kpis.items():
        f.write(f"\n\n### {name}\n")
        data.to_csv(f, index=False)