In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# 1. Setup Paths
# Make sure the folders that receive the text reports ("outputs") and the
# figures ("visuals") exist; exist_ok=True keeps re-runs of the cell harmless.
for directory in ("outputs", "visuals"):
    os.makedirs(directory, exist_ok=True)

# 2. Load Dataset
# The CSV is decoded as Latin-1 instead of pandas' default UTF-8.
df = pd.read_csv(
    "data/OnlineRetail.csv",
    encoding="latin1",
)

# Report (rows, columns) of the raw table, before any rows are dropped.
print("Shape of dataset:", df.shape)

# 3. Data Preprocessing
# Discard rows with no CustomerID in place, so `df` stays the same object.
df.dropna(subset=["CustomerID"], inplace=True)

# Parse the invoice timestamps so that date arithmetic can be done on them.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

# Per-row revenue: quantity multiplied by unit price.
# NOTE(review): no filter on Quantity/UnitPrice is applied here, so any
# non-positive values present in the data flow into TotalPrice -- confirm
# that this is intended.
df["TotalPrice"] = df["Quantity"].mul(df["UnitPrice"])

# 4. RFM Calculation
# Reference point for recency: one day after the latest invoice in the data,
# so every customer's recency is at least 1 day.
snapshot_date = df["InvoiceDate"].max() + pd.Timedelta(days=1)


def _days_since_last_purchase(invoice_dates):
    """Return whole days between a customer's latest invoice and the snapshot."""
    return (snapshot_date - invoice_dates.max()).days


# One row per CustomerID with the three RFM measures, named directly via
# named aggregation:
#   Recency   - days since the customer's most recent invoice
#   Frequency - number of non-null InvoiceNo entries for the customer
#   Monetary  - sum of TotalPrice for the customer
# NOTE(review): "count" counts rows, not distinct invoices; if one invoice can
# span several rows, "nunique" may be the intended measure -- confirm.
rfm = df.groupby("CustomerID").agg(
    Recency=("InvoiceDate", _days_since_last_purchase),
    Frequency=("InvoiceNo", "count"),
    Monetary=("TotalPrice", "sum"),
)

# Persist a quick look at the RFM table: its first rows followed by the
# summary statistics, assembled up front and written in a single call.
rfm_report = "".join([
    "RFM Head:\n",
    str(rfm.head()),
    "\n\nRFM Description:\n",
    str(rfm.describe()),
])
with open("outputs/rfm_summary.txt", "w") as f:
    f.write(rfm_report)

# 5. Normalization
# Standardize the RFM columns so that no single measure dominates the
# distance computations in the clustering step purely because of its scale.
scaler = StandardScaler()
scaler.fit(rfm)
rfm_scaled = scaler.transform(rfm)

# 6. KMeans Clustering
# Fit one model per candidate k and record its inertia (within-cluster sum of
# squares) so the curve can be inspected for an "elbow".
candidate_ks = range(1, 11)
wcss = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(rfm_scaled)
    wcss.append(kmeans.inertia_)

# Elbow plot
elbow_fig, elbow_ax = plt.subplots(figsize=(6, 4))
elbow_ax.plot(candidate_ks, wcss, marker="o")
elbow_ax.set_title("Elbow Method for Optimal k")
elbow_ax.set_xlabel("Number of clusters")
elbow_ax.set_ylabel("WCSS")
elbow_fig.savefig("visuals/elbow_method.png", dpi=300, bbox_inches="tight")
# Close the figure so it is neither rendered inline nor kept in memory.
plt.close(elbow_fig)

# Final model with a fixed k of 4; the fixed random_state keeps the cluster
# assignment reproducible across runs.
# NOTE(review): k=4 is presumably read off the elbow plot -- confirm.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(rfm_scaled)
rfm["Cluster"] = kmeans.labels_

# Save clustered stats
# Cluster sizes followed by the per-cluster mean of each RFM column.
cluster_sizes = rfm["Cluster"].value_counts()
cluster_means = rfm.groupby("Cluster").mean()
cluster_report = "".join([
    "Cluster Counts:\n",
    str(cluster_sizes),
    "\n\nCluster Means:\n",
    str(cluster_means),
])
with open("outputs/cluster_summary.txt", "w") as f:
    f.write(cluster_report)

# 7. Visualization
# Scatterplot
# Recency against Monetary, one point per customer, colored by cluster.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=rfm,
    x="Recency",
    y="Monetary",
    hue="Cluster",
    palette="Set2",
    s=80,
    ax=scatter_ax,
)
scatter_ax.set_title("Customer Segmentation (RFM Clusters)")
scatter_fig.savefig("visuals/rfm_scatter.png", dpi=300, bbox_inches="tight")
plt.close(scatter_fig)

# Boxplots
# One panel per RFM measure showing its distribution within each cluster.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, metric in zip(axes, ("Recency", "Frequency", "Monetary")):
    # seaborn deprecates passing `palette` without `hue` (to be removed in
    # v0.14.0). Mapping hue to the x variable and suppressing the redundant
    # legend keeps the same per-cluster coloring without the warning.
    sns.boxplot(
        data=rfm,
        x="Cluster",
        y=metric,
        hue="Cluster",
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set_title(f"{metric} by Cluster")

plt.tight_layout()
plt.savefig("visuals/rfm_boxplots.png", dpi=300, bbox_inches="tight")
plt.close()


Shape of dataset: (541909, 8)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=rfm, x="Cluster", y="Recency", ax=axes[0], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=rfm, x="Cluster", y="Frequency", ax=axes[1], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=rfm, x="Cluster", y="Monetary", ax=axes[2], palette="Set2")
