# Module 5 Numpy and Descriptive Analytics

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import warnings

In [2]:
# Filtering the warning
warnings.filterwarnings('ignore')

In [3]:
# Generate random sales data

# Reproducibility
np.random.seed(42)

#Simulate a customer purchase dataset
n = 500
costumer_data =  pd.DataFrame({
    'customer_id': range(1, n+1),
    'age': np.random.randint(18, 70, size = n),
    'annual_spend': np.random.exponential(scale = 1500, size = n).round(2),
    'num_purchases': np.random.poisson(lam = 8, size = n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size = n),
    'cohort_month': np.random.choice(['2025-01', '2025-02', '2025-03', '2025-04'], size = n)
})
costumer_data.head()

Unnamed: 0,customer_id,age,annual_spend,num_purchases,region,cohort_month
0,1,56,4647.16,8,East,2025-01
1,2,69,2008.53,7,North,2025-02
2,3,46,1212.35,7,West,2025-02
3,4,32,1419.05,6,West,2025-04
4,5,60,816.06,5,East,2025-01


## 1. Descriptive Statiscs Review

A quick sanity check on your dataset is the first step in any data workflow

In [4]:
# Full Summar
costumer_data.describe()

Unnamed: 0,customer_id,age,annual_spend,num_purchases
count,500.0,500.0,500.0,500.0
mean,250.5,44.22,1530.54618,7.866
std,144.481833,15.036082,1557.403414,2.677009
min,1.0,18.0,6.96,1.0
25%,125.75,32.0,416.51,6.0
50%,250.5,45.0,1049.585,8.0
75%,375.25,57.0,2114.76,10.0
max,500.0,69.0,12258.67,16.0


In [7]:
# Numpy for Manual Statistics

spend = costumer_data['annual_spend'].values

print(f"Mean: RM{np.mean(spend):,.2f}")
print(f"Median: RM{np.median(spend):,.2f}")
print(f"Standard Deviation: RM{np.std(spend):,.2f}")
print(f"Variance: RM{np.var(spend):,.2f}")
print(f"Skewness: {stats.skew(spend):,.2f}")
print(f"Kurtosis: {stats.kurtosis(spend):,.2f}")

Mean: RM1,530.55
Median: RM1,049.59
Standard Deviation: RM1,555.85
Variance: RM2,420,654.38
Skewness: 1.99
Kurtosis: 6.09


## Quantiles and Percentiles

Percentiles tell us about the disribution of data and are crucial for feature engineering and outlier detection

In [10]:
# Assing basic percentages

percentiles = [5, 10, 25, 50, 75, 90, 95, 99]

values = np.percentile(spend, percentiles)

percentile_df = pd.DataFrame({
    'Percentile': percentiles,
    'Annual Spend (RM)': values.round(2)
})
print(percentile_df.to_string(index=False))

 Percentile  Annual Spend (RM)
          5              86.81
         10             158.85
         25             416.51
         50            1049.58
         75            2114.76
         90            3552.55
         95            4506.29
         99            6549.96


In [15]:
# IQR - Interquartile Range (used in outlier detection)

Q1 = np.percentile(spend, 25)
Q3 = np.percentile(spend, 75)

IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

outliers = spend[(spend < lower_fence) | (spend > upper_fence)]

print(f"Q1: RM {Q1:.2f} or Q3: RM {Q3:.2f} or IQR: RM {IQR:.2f}")
print(f"\nOutlier fences: [RM {lower_fence:.2f}, RM {upper_fence:.2f}]")
print(f"\nNumber of outliers: {len(outliers)}")

Q1: RM 416.51 or Q3: RM 2114.76 or IQR: RM 1698.25

Outlier fences: [RM -2130.87, RM 4662.14]

Number of outliers: 22


## 3. Confidence Intervals

A confidence interval gives us a range in which the true population parameter likely to falls. This is crucial for reporting metrcis with statistical rigor.

In [16]:
# 95% Confidence Interval for mean annual spend

confidence = 0.95
n_samples = len(spend)
sample_mean = np.mean(spend)
sample_std = np.std(spend, ddof=1)
standard_error = sample_std / np.sqrt(n_samples)

# Using Scipy for t-distribution (more accurate for finite samples)

ci_lower, ci_upper = stats.t.interval(
    confidence,
    df = n_samples - 1,
    loc = sample_mean,
    scale = standard_error
)

print(f"Sample Mean: RM {sample_mean:.2f}")
print(f"\n95% CI: (RM{ci_lower:.2f}, RM {ci_upper:.2f})")
print(f"\nInterception: We are 95% confident the true average customer spend between RM {ci_lower:.2f} and RM {ci_upper:.2f}")

Sample Mean: RM 1530.55

95% CI: (RM1393.70, RM 1667.39)

Interception: We are 95% confident the true average customer spend between RM 1393.70 and RM 1667.39


In [18]:
# Compare CI by region
region_ci = []
for region, group in costumer_data.groupby('region'):
    spend_group =  group['annual_spend'].values
    lo, hi = stats.t.interval(
        0.95, df = len(spend_group) - 1,
        loc = np.mean(spend_group),
        scale = stats.sem(spend_group)
    )
    region_ci.append({
        'Region': region,
        'Mean': np.mean(spend_group).round(2),
        '95% CI lower': lo.round(2),
        '95% CI Higher': hi.round(2)
    })

pd.DataFrame(region_ci)

Unnamed: 0,Region,Mean,95% CI lower,95% CI Higher
0,East,1405.26,1154.69,1655.83
1,North,1719.53,1412.21,2026.84
2,South,1562.19,1273.31,1851.07
3,West,1432.91,1182.23,1683.6


## 4. Ranking and Segmentation

In [19]:
# Rank customers by annual spend (dense ranking)

costumer_data['spend_rank'] = costumer_data["annual_spend"].rank(ascending=False, method="dense").astype(int)

# Ranking Fram segmentation using pd.cut
costumer_data['spend_segment'] = pd.cut(
    costumer_data['annual_spend'],
    bins = [0, 500, 1500, 3000, np.inf],
    labels = ['Low', 'Mid', 'High', 'Premium']
)
print("Segment Distribution:")
print(costumer_data['spend_segment'].value_counts())

Segment Distribution:
spend_segment
Mid        180
Low        144
High        99
Premium     77
Name: count, dtype: int64


In [20]:
costumer_data.head()

Unnamed: 0,customer_id,age,annual_spend,num_purchases,region,cohort_month,spend_rank,spend_segment
0,1,56,4647.16,8,East,2025-01,23,Premium
1,2,69,2008.53,7,North,2025-02,139,High
2,3,46,1212.35,7,West,2025-02,225,Mid
3,4,32,1419.05,6,West,2025-04,194,Mid
4,5,60,816.06,5,East,2025-01,289,Mid


In [28]:
# Percentile-based segmentation (quartile tiers)
costumer_data['quartile_tier'] = pd.qcut(
    costumer_data['annual_spend'],
    q = 4,
    labels = ['Q1-Bottom', 'Q2', 'Q3', 'Q4-Top'] 
)
costumer_data[['customer_id', 'annual_spend', 'spend_segment', 'quartile_tier','spend_rank']]\
    .sort_values('spend_rank').head(10)

Unnamed: 0,customer_id,annual_spend,spend_segment,quartile_tier,spend_rank
207,208,12258.67,Premium,Q4-Top,1
208,209,8542.31,Premium,Q4-Top,2
151,152,7435.24,Premium,Q4-Top,3
66,67,6985.51,Premium,Q4-Top,4
381,382,6901.8,Premium,Q4-Top,5
194,195,6546.41,Premium,Q4-Top,6
117,118,6473.19,Premium,Q4-Top,7
337,338,6459.29,Premium,Q4-Top,8
175,176,6425.8,Premium,Q4-Top,9
122,123,6403.16,Premium,Q4-Top,10


In [29]:
costumer_data[['customer_id', 'annual_spend', 'spend_segment', 'quartile_tier','spend_rank']]\
    .sort_values('spend_rank').tail(10)

Unnamed: 0,customer_id,annual_spend,spend_segment,quartile_tier,spend_rank
53,54,35.32,Low,Q1-Bottom,491
316,317,30.41,Low,Q1-Bottom,492
47,48,27.58,Low,Q1-Bottom,493
470,471,27.41,Low,Q1-Bottom,494
239,240,21.98,Low,Q1-Bottom,495
8,9,21.75,Low,Q1-Bottom,496
146,147,18.34,Low,Q1-Bottom,497
132,133,17.13,Low,Q1-Bottom,498
83,84,16.35,Low,Q1-Bottom,499
497,498,6.96,Low,Q1-Bottom,500


## 5. Cohort Tables

Cohort analysis groups customers by when joined and tracks their behaviour over time.

In [31]:
# Average spend and purchase by cohort month and region
cohort_table = costumer_data.groupby(['cohort_month', 'region']).agg(
    num_customers = ('customer_id', 'count'),
    avg_spend = ('annual_spend', 'mean'),
    avg_purchases = ('num_purchases', 'mean')
).round(2)
cohort_table

Unnamed: 0_level_0,Unnamed: 1_level_0,num_customers,avg_spend,avg_purchases
cohort_month,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-01,East,31,1229.65,7.9
2025-01,North,28,1763.26,8.25
2025-01,South,29,1575.96,7.55
2025-01,West,39,1592.56,8.44
2025-02,East,29,1155.41,7.17
2025-02,North,40,2201.5,7.68
2025-02,South,31,1900.51,8.13
2025-02,West,32,1263.05,7.84
2025-03,East,33,1659.02,8.15
2025-03,North,30,1237.15,7.57


## 6. Statistical Inputs for ML

Before feeding data into a model, we often need to transform or standardize features.

In [34]:
# Z-score normalization (Standardazation)
costumer_data['spend_zscore'] = stats.zscore(costumer_data['annual_spend'])

# Min-Max Scaling to [0, 1]
spend_vals = costumer_data['annual_spend']
costumer_data['spend_minmax'] = (spend_vals - spend_vals.min()) / (spend_vals.max() - spend_vals.min())

# Log Transform for skewed data
costumer_data['spend_log'] = np.log1p(costumer_data['annual_spend'])

print("\nTransofrmed feature statistics:\n")
costumer_data[['annual_spend', 'spend_zscore', 'spend_minmax', 'spend_log']].describe().round(4)


Transofrmed feature statistics:



Unnamed: 0,annual_spend,spend_zscore,spend_minmax,spend_log
count,500.0,500.0,500.0,500.0
mean,1530.5462,-0.0,0.1244,6.7508
std,1557.4034,1.001,0.1271,1.2464
min,6.96,-0.9793,0.0,2.0744
25%,416.51,-0.716,0.0334,6.0343
50%,1049.585,-0.3091,0.0851,6.9571
75%,2114.76,0.3755,0.172,7.6572
max,12258.67,6.8954,1.0,9.4141


## 7.Correlation Analysis for Feature Selection

In [35]:
# Pearson correlation matrix between numeric features
numeric_cols = ['age', 'annual_spend', 'num_purchases']
corr_matrix = costumer_data[numeric_cols].corr().round(4)

print("\nCorrelation Matrix:\n")
corr_matrix


Correlation Matrix:



Unnamed: 0,age,annual_spend,num_purchases
age,1.0,0.0084,-0.004
annual_spend,0.0084,1.0,-0.0484
num_purchases,-0.004,-0.0484,1.0


## 8. Excercise:

    1. What is the 99th percentile of annual_spend
    2. Build 99% of confidence interval for mean numbers of purchases and compare it to 95% CI
    3. Add a new feature spend_per_purchase to customer_data
    4. Create a cohort table grouped by spend_segment and figure what segment has the highest avg_purchases