# In [None]:  (Jupyter cell prompt left over from the notebook export)

# Step 1: Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Visualization settings
sns.set(style="whitegrid")

# For date operations
import datetime as dt

import os

# Print the working directory and its contents so a wrong launch directory is
# easy to spot if the CSV below cannot be found.
print(os.getcwd())
print(os.listdir())

# Step 2: Load Dataset
df = pd.read_csv("online_retail_II.csv")
print(df)

# Quick look at the raw data. A bare expression is only rendered when it is
# the last line of a notebook cell, so the summaries are printed explicitly.
df.info()  # info() writes its report to stdout itself
print(df.describe())
print(df.isnull().sum())

# Step 3: Data Cleaning

# Remove exact duplicate rows
df = df.drop_duplicates()

# Drop rows with a missing Customer ID (very common in retail datasets);
# they cannot be attributed to any customer in the per-customer steps below.
df = df.dropna(subset=['Customer ID'])

# Remove cancelled/negative transactions: keep only rows where both Quantity
# and Price are strictly positive.
# .copy() turns the filtered result into an independent frame, so adding a
# column to it below does not raise pandas' SettingWithCopyWarning.
df = df[(df['Quantity'] > 0) & (df['Price'] > 0)].copy()

# Create Total Sales column (Quantity * Price per row)
df['Sales'] = df['Quantity'] * df['Price']

print("After cleaning:", df.shape)
# Printed explicitly: a bare df.head() is discarded when run as a script.
print(df.head())


# Step 4: Feature Engineering (RFM)

# Parse InvoiceDate as datetime; values that cannot be parsed become NaT.
# assign() returns a new frame rather than writing into a filtered one.
df = df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate'], errors='coerce'))

# Drop rows whose InvoiceDate is missing or could not be parsed
df = df.dropna(subset=['InvoiceDate'])

# Reference date = 1 day after the last transaction, so the most recent
# customer gets a recency of at least 1 day rather than 0.
ref_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Compute RFM per customer. Named aggregation produces the final column names
# directly and uses built-in reducers instead of a Python lambda per group.
rfm = df.groupby('Customer ID').agg(
    recency=('InvoiceDate', 'max'),    # last purchase date; converted to days below
    frequency=('Invoice', 'nunique'),  # Frequency: number of distinct invoices
    monetary=('Sales', 'sum'),         # Monetary: total Sales
)

# Recency: whole days between each customer's last purchase and ref_date
rfm['recency'] = (ref_date - rfm['recency']).dt.days

# Printed explicitly: a bare rfm.head() is discarded when run as a script.
print(rfm.head())

# Step 5: Data Scaling

# Select the clustering features by name. Later steps add 'cluster' and
# 'simulated_revenue' columns to rfm; scaling the whole frame would silently
# feed those into the model if this step is ever re-executed.
RFM_FEATURES = ['recency', 'frequency', 'monetary']

# Standardize each feature so no single scale dominates the distance metric
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[RFM_FEATURES])

# Step 6: K-Means Clustering

# Use 4 clusters (you can tune later); random_state fixes the initialization
# so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
rfm['cluster'] = kmeans.fit_predict(rfm_scaled)

# Printed explicitly: a bare rfm.head() is discarded when run as a script.
print(rfm.head())

# Step 7: Analyze Clusters

# Mean of every rfm column within each cluster, rounded to 2 decimals
cluster_summary = rfm.groupby('cluster').mean().round(2)

# A bare expression is only rendered as the last line of a notebook cell;
# print so the table is also shown when this runs as a script.
print(cluster_summary)


# Step 8: Visualization

# Recency vs. monetary scatter, one point per rfm row, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=rfm,
    x='recency',
    y='monetary',
    hue='cluster',
    palette='viridis',
    s=80,
    ax=ax,
)
ax.set_title("Customer Segmentation (RFM Clusters)", fontsize=14)
ax.set_xlabel("Recency (days)")
ax.set_ylabel("Monetary ($)")
plt.show()

# Step 9: Dynamic Pricing Simulation

# Example strategy, applied per customer (not per cluster):
# recency below the overall median -> +5% price, everyone else -> -2% price.
# The simulation only rescales historical spend; it models no change in demand.
RECENT_PRICE_FACTOR = 1.05
OTHER_PRICE_FACTOR = 0.98

is_recent = rfm['recency'] < rfm['recency'].median()
rfm['simulated_revenue'] = rfm['monetary'] * np.where(
    is_recent, RECENT_PRICE_FACTOR, OTHER_PRICE_FACTOR
)

# Compare before vs after revenue by cluster
comparison = (
    rfm.groupby('cluster')[['monetary', 'simulated_revenue']].mean().round(2)
)

# Printed explicitly: a bare `comparison` is discarded when run as a script.
print(comparison)

# Step 10: Insights / Reporting

# groupby splits rfm once and yields clusters in sorted label order, so the
# report is stable between runs instead of following the order in which
# labels first appear in the frame.
for cluster, data in rfm.groupby('cluster'):
    print(f"\nCluster {cluster}:")
    print(f"Avg Recency: {data['recency'].mean():.2f} days")
    print(f"Avg Frequency: {data['frequency'].mean():.2f} purchases")
    print(f"Avg Monetary: ${data['monetary'].mean():.2f}")
    print(f"Simulated Avg Revenue: ${data['simulated_revenue'].mean():.2f}")
