In [1]:
import joblib
import os
import streamlit as st
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA               #PCA plots
from sklearn.cluster import KMeans, DBSCAN
from kneed import KneeLocator                       #Auto-detecting k value for elbow plot in kmeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE                   #t-SNE plots

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from warnings import filterwarnings
filterwarnings("ignore")



# Data Preprocesing 

In [21]:
# Loading Dataset, drop missing rows, remove negetive, convert to dateTime

def load_and_clean_data(file_path):
    # Load data
    df = pd.read_csv(file_path, encoding='latin1')

    # Drop rows with missing CustomerID
    df.dropna(subset=['CustomerID'], inplace=True)

    # Remove negative or zero Quantity and UnitPrice
    df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]

    # Convert InvoiceDate to datetime
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    return df

In [36]:
df = pd.read_csv("data\data.csv", encoding='latin1')

In [37]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


In [38]:
df = df[["Quantity"]]
df

Unnamed: 0,Quantity
0,6
1,6
2,8
3,6
4,6
...,...
541904,12
541905,6
541906,4
541907,4


## Feature Enggineering 

In [3]:
#Create RFM
def create_rfm_features(df):
    # Define latest date for recency calculation
    latest_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

    # Group by CustomerID
    rfm = df.groupby('CustomerID').agg({
        'InvoiceDate': lambda x: (latest_date - x.max()).days,
        'InvoiceNo': 'nunique',
        'Quantity': 'sum',
        'UnitPrice': 'mean'
    }).reset_index()

    rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'TotalQuantity', 'AvgUnitPrice']

    # Create Monetary column
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
    monetary = df.groupby('CustomerID')['TotalPrice'].sum().reset_index()
    rfm = rfm.merge(monetary, on='CustomerID')
    rfm.rename(columns={'TotalPrice': 'Monetary'}, inplace=True)

    return rfm


## Checking Feature Distribution, Outlier handling and Scaling

In [4]:
# Plots for distributions
def plot_and_save_distributions(rfm_df, Before_After):
    plt.style.use('ggplot')
    features = ['Recency', 'Frequency', 'Monetary']

    for feature in features:
        plt.figure(figsize=(8, 5))
        sns.histplot(rfm_df[feature], kde=True, bins=30)
        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.show()
        plt.savefig(f'plots/{feature}_distribution_{Before_After}.png')
        plt.close()


In [5]:
# Outlier handling
def detect_and_handle_outliers(rfm_df):
    def handle_outlier(col):
        Q1 = rfm_df[col].quantile(0.25)
        Q3 = rfm_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return rfm_df[(rfm_df[col] >= lower) & (rfm_df[col] <= upper)]

    filtered_df = rfm_df.copy()
    for col in ['Recency', 'Frequency', 'Monetary']:
        filtered_df = handle_outlier(col)

    return filtered_df

In [6]:
# scaler 
def scale_data(df, features):
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[features])
    return scaled, scaler

# Data visualization

In [7]:
# Ploting 3D 
def plot_3d_scatter(data, labels, title, filename):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='viridis', s=50)
    ax.set_xlabel('Recency')
    ax.set_ylabel('Frequency')
    ax.set_zlabel('Monetary')
    ax.set_title(title)
    plt.show()
    plt.savefig(f'plots/{filename}')
    plt.close()

# Training 

### Kmeans Training (Kmeans Cluster and t-SNE, PCA plots, silhouette, Elbow method)

In [8]:
# Kmeans 

def train_kmeans(data, max_clusters=10):
    silhouette_scores = []
    inertias = []

    # Test k from 2 to max_clusters
    k_range = range(2, max_clusters + 1)
    for k in k_range:
        model = KMeans(n_clusters=k, random_state=42)
        labels = model.fit_predict(data)
        silhouette_scores.append(silhouette_score(data, labels))
        inertias.append(model.inertia_)

    # Auto-detect the elbow point
    kl = KneeLocator(k_range, inertias, curve="convex", direction="decreasing")
    best_k = kl.elbow
    best_model = KMeans(n_clusters=best_k, random_state=42).fit(data)

    # Plot Silhouette Score
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(k_range, silhouette_scores, marker='o', color='teal')
    plt.title('Silhouette Score vs. Number of Clusters')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.axvline(x=best_k, color='red', linestyle='--', label=f'Auto-selected k = {best_k}')
    plt.legend()
    plt.grid(True)

    # Plot Elbow Method (Inertia)
    plt.subplot(1, 2, 2)
    plt.plot(k_range, inertias, marker='s', color='orange')
    plt.title('Elbow Method: Inertia vs. Number of Clusters')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia (WCSS)')
    plt.axvline(x=best_k, color='red', linestyle='--', label=f'Elbow at k = {best_k}')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()
    plt.savefig('plots/kmeans_elbow_auto.png')
    plt.show()

    print(f"Auto-selected best k using elbow method: {best_k}")
    return best_model, best_model.labels_



In [9]:
# Kmeans Cluster and t-SNE, PCA plots

def plot_kmeans_clusters(data, labels, centers=None):
    if data.shape[1] > 2:
        pca = PCA(n_components=2)
        reduced_data = pca.fit_transform(data)
    else:
        reduced_data = data

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='Set1', s=30, alpha=0.7, label='Points')

    if centers is not None:
        if data.shape[1] > 2:
            centers_2d = pca.transform(centers)
        else:
            centers_2d = centers
        plt.scatter(centers_2d[:, 0], centers_2d[:, 1], c='black', marker='X', s=200, label='Centroids')

    plt.title('KMeans Clustering Results')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/kmeans_clusters.png')
    plt.close()


In [10]:
# Kmeans t-SNE

def plot_tsne_clusters(data, labels, perplexity=30, learning_rate=200):
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, random_state=42)
    reduced_data = tsne.fit_transform(data)

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='Set1', s=30, alpha=0.7)
    
    # Legend based on unique labels
    unique_labels = sorted(set(labels))
    handles = [plt.Line2D([], [], marker='o', color='w', label=f'Cluster {lbl}',
                          markerfacecolor=scatter.cmap(scatter.norm(lbl)), markersize=8)
               for lbl in unique_labels]
    plt.legend(handles=handles)

    plt.title('t-SNE Visualization of Clusters')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/tsne_kmeans_clusters.png')
    plt.close()

### Hirarchial Training (Dendrogram, t-sne)

In [11]:
# Hirarchial 

def train_hierarchical(data, method='ward'):
    Z = linkage(data, method=method)

    # Auto-select cut height using distance gap
    last_10 = Z[-10:, 2]
    gaps = np.diff(last_10)
    best_gap_idx = gaps.argmax()
    cut_height = last_10[best_gap_idx]

    # Dendrogram with cut line
    plt.figure(figsize=(10, 6))
    dendrogram(Z)
    plt.axhline(y=cut_height, color='red', linestyle='--', label=f'Cut at {cut_height:.2f}')
    plt.legend()

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Sample Index or Cluster Merge Step')
    plt.ylabel('Distance')
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/hierarchical_dendrogram.png')
    

    # Assign clusters
    labels = fcluster(Z, t=cut_height, criterion='distance')
    return Z, labels


In [12]:
# Hierarchical t-SNE

def plot_tsne_hierarchical(data, labels, perplexity=30, learning_rate=200):
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, random_state=42)
    reduced_data = tsne.fit_transform(data)

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='Set2', s=30, alpha=0.7)

    # Legend based on unique labels
    unique_labels = sorted(np.unique(labels))
    handles = [plt.Line2D([], [], marker='o', color='w', label=f'Cluster {lbl}',
                          markerfacecolor=scatter.cmap(scatter.norm(lbl)), markersize=8)
               for lbl in unique_labels]
    plt.legend(handles=handles)

    plt.title('t-SNE Visualization of Hierarchical Clusters')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/tsne_hierarchical_clusters.png')
    


### DBSCAN Training (clustor , t-SNE)

In [13]:
#DBSCAN train

def train_dbscan(data, eps=0.5, min_samples=5):
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(data)

    # Dimensionality reduction for visualization if needed
    if data.shape[1] > 2:
        pca = PCA(n_components=2)
        reduced_data = pca.fit_transform(data)
    else:
        reduced_data = data

    # Plot DBSCAN results
    plt.figure(figsize=(8, 6))
    unique_labels = np.unique(labels)
    for label in unique_labels:
        mask = labels == label
        if label == -1:
            color = 'k'  # Black for noise
            label_name = 'Noise'
        else:
            color = plt.cm.Set1(label / max(unique_labels))  # Varying colors
            label_name = f'Cluster {label}'
        plt.scatter(reduced_data[mask, 0], reduced_data[mask, 1], c=[color], label=label_name, s=30)

    plt.title('DBSCAN Clustering Results')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/dbscan_clusters.png')
    

    return model, labels


In [14]:
# DBSCAN t-SNE

def plot_tsne_dbscan_clusters(scaled_data, labels, perplexity=30, learning_rate=200):
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, random_state=42)
    reduced_data = tsne.fit_transform(scaled_data)

    plt.figure(figsize=(8, 6))
    unique_labels = sorted(set(labels))

    for label in unique_labels:
        mask = (labels == label)
        if label == -1:
            color = 'k'
            label_name = 'Noise'
        else:
            color = plt.cm.Set1(label / max(unique_labels))  # Distinct colors
            label_name = f'Cluster {label}'
        plt.scatter(reduced_data[mask, 0], reduced_data[mask, 1], c=[color], label=label_name, s=30, alpha=0.7)

    plt.title('t-SNE Visualization of DBSCAN Clusters')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.savefig('plots/tsne_dbscan_clusters.png')
    


### Ploting PCA 

In [15]:
def plot_pca_clusters(data, labels, title, filename):
    pca = PCA(n_components=2)
    components = pca.fit_transform(data)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=components[:, 0], y=components[:, 1], hue=labels, palette='Set2', s=60)
    plt.title(title)
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()
    plt.savefig(f'plots/{filename}')
    plt.close()

## Model performances - Comparing

In [16]:
# Comparing Model performances
def compare_models(data, label_sets):
    scores = {}
    for name, labels in label_sets.items():
        if len(set(labels)) > 1 and -1 not in set(labels):
            sil_score = silhouette_score(data, labels)
            db_score = davies_bouldin_score(data, labels)
            scores[name] = {"Silhouette": sil_score, "Davies-Bouldin": db_score}
    return scores

### Saving trained models

In [17]:
#Saving trained model
def save_model(model, name):
    os.makedirs('models', exist_ok=True)
    path = f'models/{name}.pkl'
    joblib.dump(model, path)
    print(f"Model saved at {path}")

# Calling Methods

pre-processing call

In [18]:
# data, info , descreption
raw = pd.read_csv('data/data.csv', encoding = 'latin1')
raw["InvoiceDate"] = pd.to_datetime(raw["InvoiceDate"])
raw.head()
print(f"Raw data with shape :{raw.shape}")
print("--------------------------------------------")
print(f"Info : {raw.info()}")
print("--------------------------------------------")
print(f"Descreption numeric : {raw.describe()}")

Raw data with shape :(541909, 8)
--------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB
Info : None
--------------------------------------------
Descreption numeric :             Quantity                    InvoiceDate      UnitPrice  \
count  541909.000000                         541909  541909.000000   
mean       

In [19]:
cleaned_df

NameError: name 'cleaned_df' is not defined

In [None]:
data_path = 'data/data.csv'

cleaned_df = load_and_clean_data(data_path)
cleaned_df.to_csv('data/cleaned_data.csv', index=False)
print(f"Data cleaned and saved to 'data/cleaned_data.csv' with shape: {cleaned_df.shape}")


Data cleaned and saved to 'data/cleaned_data.csv' with shape: (397884, 8)


RFM Call

In [None]:

rfm_df = create_rfm_features(cleaned_df)
rfm_df.to_csv('data/rfm_data.csv', index=False)
print(f"RFM features created and saved to 'data/rfm_data.csv' with shape: {rfm_df.shape}")


RFM features created and saved to 'data/rfm_data.csv' with shape: (4338, 6)


### Saving RFM distribution curves

In [None]:
before_after= "Before"
plot_and_save_distributions(rfm_df, before_after)
print("Initial distribution plots saved to 'plots/'.")

filtered_rfm_df = detect_and_handle_outliers(rfm_df)
filtered_rfm_df.to_csv('data/rfm_data_filtered.csv', index=False)
print(f"Outliers handled and data saved to 'data/rfm_data_filtered.csv' with shape: {filtered_rfm_df.shape}")

before_after= "After"
plot_and_save_distributions(filtered_rfm_df,before_after)
print("Filtered distribution plots saved to 'plots/'.")

Initial distribution plots saved to 'plots/'.
Outliers handled and data saved to 'data/rfm_data_filtered.csv' with shape: (3911, 6)
Filtered distribution plots saved to 'plots/'.


### Scaling and 3D scatter

In [None]:
# Scaling and 3D scatter
features = ['Recency', 'Frequency', 'Monetary']
scaled_data, scaler = scale_data(filtered_rfm_df, features)
plot_3d_scatter(scaled_data, labels=[0]*len(scaled_data), title='3D Scatter of Scaled RFM', filename='3d_rfm_scatter.png')
print("3D scatter plot of scaled RFM data saved to 'plots/3d_rfm_scatter.png'.")


3D scatter plot of scaled RFM data saved to 'plots/3d_rfm_scatter.png'.


### Kmeans -Calling (Kmeans Cluster and t-SNE, PCA plots, silhouette, Elbow method)

In [None]:
# Train KMeans
kmeans_model, kmeans_labels = train_kmeans(scaled_data)  # Your KMeans model
filtered_rfm_df['KMeans_Labels'] = kmeans_labels
print("KMeans clustering completed.")

#Plots Kmeans with centroids
plot_kmeans_clusters(scaled_data, kmeans_labels, centers=kmeans_model.cluster_centers_)

#Plots t-SNE
plot_tsne_clusters(scaled_data, kmeans_labels)

# PCA plot
plot_pca_clusters(scaled_data, kmeans_labels, 'KMeans Clusters (PCA)', 'pca_kmeans.png')


Auto-selected best k using elbow method: 4
KMeans clustering completed.


### Hierarchical  -Call (Dendrogram and t-SNE, PCA plots)

In [None]:
# calling Hierarchical Clustering

Z, hierarchical_labels = train_hierarchical(scaled_data)
filtered_rfm_df['Hierarchical_Labels'] = hierarchical_labels
print("Hierarchical clustering completed.")

#Dendrogram
# t-SNE
plot_tsne_hierarchical(scaled_data, hierarchical_labels)

#PCA plot
plot_pca_clusters(scaled_data, hierarchical_labels, 'Hierarchical Clusters (PCA)', 'pca_hierarchical.png')

Hierarchical clustering completed.


### DBSCAN  -Call (cluster and t-SNE, PCA plots)

In [None]:
# Train DBSCAN
dbscan_model, dbscan_labels = train_dbscan(scaled_data)
filtered_rfm_df['DBSCAN_Labels'] = dbscan_labels
print("DBSCAN clustering completed.")

#t-SNE
plot_tsne_dbscan_clusters(scaled_data, dbscan_labels)
filtered_rfm_df.to_csv('data/clustered_rfm.csv', index=False)
print("Clustered RFM data saved to 'data/clustered_rfm.csv'.")

#PCA plot
plot_pca_clusters(scaled_data, dbscan_labels, 'DBSCAN Clusters (PCA)', 'pca_dbscan.png')

DBSCAN clustering completed.
Clustered RFM data saved to 'data/clustered_rfm.csv'.


## Model Comparison - Call

In [None]:
# Model Comparison
label_sets = {
    'KMeans': kmeans_labels,
    'Hierarchical': hierarchical_labels,
    'DBSCAN': dbscan_labels
}
model_scores = compare_models(scaled_data, label_sets)
for model, scores in model_scores.items():
    print(f"{model} - Silhouette: {scores['Silhouette']:.4f}, Davies-Bouldin: {scores['Davies-Bouldin']:.4f}")

print("""
✅ Conclusion:
KMeans performs better than Hierarchical clustering on both metrics:

Higher Silhouette Score → better cluster cohesion and separation

Lower Davies-Bouldin Index → better-defined clusters""")

KMeans - Silhouette: 0.4255, Davies-Bouldin: 0.8735
Hierarchical - Silhouette: 0.3975, Davies-Bouldin: 0.7988

✅ Conclusion:
KMeans performs better than Hierarchical clustering on both metrics:

Higher Silhouette Score → better cluster cohesion and separation

Lower Davies-Bouldin Index → better-defined clusters


In [None]:
# Save models
save_model(kmeans_model, "kmeans_model")
save_model(dbscan_model, "dbscan_model")
save_model(Z, "hierarchical_model")
save_model(scaler, "scaler")

Model saved at models/kmeans_model.pkl
Model saved at models/dbscan_model.pkl
Model saved at models/hierarchical_model.pkl
Model saved at models/scaler.pkl


In [None]:
# Save labeled data
filtered_rfm_df['KMeans_Labels'] = kmeans_labels
filtered_rfm_df['Hierarchical_Labels'] = hierarchical_labels
filtered_rfm_df['DBSCAN_Labels'] = dbscan_labels
filtered_rfm_df.to_csv("data/clustered_rfm.csv", index=False)
print("Final labeled data saved and models stored.")

Final labeled data saved and models stored.


In [None]:
print(filtered_rfm_df[["KMeans_Labels"]].value_counts())
print(filtered_rfm_df[["Hierarchical_Labels"]].value_counts())
print(filtered_rfm_df[["DBSCAN_Labels"]].value_counts())

KMeans_Labels
3                1669
1                 952
0                 904
2                 386
Name: count, dtype: int64
Hierarchical_Labels
3                      2526
1                      1078
2                       307
Name: count, dtype: int64
DBSCAN_Labels
 0               3869
-1                 42
Name: count, dtype: int64
