### ***`Unsupervised Machine Learning`***

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from functools import lru_cache
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

##### `Reading modelpoints dataset`

In [2]:
# Reads model points data from local directory
def loading_modelpoint_data(file_name: str) -> pd.DataFrame:
    """Load a processed model-points parquet file into a DataFrame.

    The file is looked up in ``data/processed`` under the parent of the
    current working directory.

    Args:
        file_name: Name of the parquet file inside ``data/processed``.

    Returns:
        The contents of the parquet file as a DataFrame.
    """
    processed_dir = Path.cwd().parent / 'data/processed'
    return pd.read_parquet(processed_dir / file_name)

##### `Split data modelpoints`

In [3]:
# Read the train and eval model points. Training rows whose item_id is the
# 'no_item_id' placeholder are filtered out right after loading.
modelpoint_train = (
    loading_modelpoint_data('modelpoint_train.parquet')
    .loc[lambda frame: frame['item_id'] != 'no_item_id']
)
modelpoint_eval = loading_modelpoint_data('modelpoint_eval.parquet')
modelpoint_train.head()

Unnamed: 0,user_id,target,item_id,time_of_day,segment,beh_segment,total_user_interaction,total_user_interation_per_item,item_popularity,normalized_popularity,...,item_type_lifestyle,item_type_transact,active_mode_active,active_mode_cold start,active_mode_semi active,screen_page_screen1,screen_page_screen2,time_action_trans(days),trans_probability,transition_category
0,28951,checkout,ctln,1,0,0,18,6,52709,9.469817,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,12,0.5,0
1,28951,checkout,ctln,1,0,0,18,6,52709,9.469817,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,12,0.5,0
2,28951,checkout,ctln,1,0,0,18,6,52709,9.469817,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.5,1
3,28951,checkout,ctln,1,0,0,18,6,52709,9.469817,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.5,1
4,28951,checkout,ctln,1,0,0,18,6,52709,9.469817,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.5,1


In [4]:
# Work on a copy so the loaded training frame is left untouched.
X_points = modelpoint_train.copy()

# `y` is the 'target' column; the features are every remaining column except
# the two id columns, cast to float.
y = X_points['target']
X = X_points.drop(['target', 'item_id', 'user_id'], axis=1).astype(float)

### **`Clustering Experiments`**

In [5]:
import json
import joblib
import logging
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Union, Dict, Any
from sklearn.base import BaseEstimator
from sklearn.cluster import DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from dataclasses import dataclass, asdict, field
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score
)

# Logging setup: INFO-level records rendered as "<time> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

#### `Baseline implementation`

In [7]:
# Clustering models: baseline implementation
@dataclass
class EvaluationMetric:
    """Clustering-quality scores recorded for one fitted model.

    The ``train_model`` methods in this notebook fill the fields from
    ``silhouette_score``, ``davies_bouldin_score`` and
    ``calinski_harabasz_score`` respectively.
    """
    silhouette: float
    davies_bouldin: float
    calinski_harabasz: float

@dataclass
class ModelInfo:
    """Name of a clustering model together with the parameters it was built with."""
    name: str
    params: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_kwargs(cls, name: str, **kwargs):
        """Create a ``ModelInfo`` whose ``params`` dict holds the keyword arguments.

        Args:
            name: Model name to record.
            **kwargs: Arbitrary parameter names and values.

        Returns:
            A new instance with ``params`` set to the collected keywords.
        """
        collected: Dict[str, Any] = kwargs
        return cls(name, collected)

@dataclass
class ClusteringMetadata:
    """Record describing one clustering run.

    Instances are converted with ``asdict`` and written to a JSON file by
    ``ClusterModel._save_metadata_to_local``.
    """
    version: float             # taken from the model wrapper's `version`
    data_size: int             # taken from the model wrapper's `data_size`
    n_clusters: int            # distinct labels found (DBSCAN excludes the -1 noise label)
    model: ModelInfo           # model name and the parameters used
    metrics: EvaluationMetric  # quality scores computed on the fitted labels

@dataclass
class ClusterModel:
    """Common fields and persistence helper shared by the clustering wrappers.

    Subclasses in this notebook add their own hyper-parameter fields and a
    ``train_model`` method that ends by calling ``_save_metadata_to_local``.
    """
    data: Union[pd.DataFrame, np.ndarray]  # feature matrix handed to the estimator
    data_size: int                         # recorded verbatim in the metadata
    version: float = 1.0                   # embedded in the output file names
    verbose: bool = False                  # when True the metadata JSON is also printed

    def _save_metadata_to_local(
        self,
        model: BaseEstimator,
        metadata: ClusteringMetadata,
        cluster_name: str
    ) -> None:
        """Write the fitted model and its metadata under ``../models/clusters``.

        The directory (relative to the parent of the current working
        directory) is created if missing. The model is stored with
        ``joblib.dump`` and the metadata as an indented JSON file; both file
        names carry ``cluster_name`` and ``self.version``.

        Args:
            model: Fitted estimator to persist.
            metadata: Run description serialised through ``asdict``.
            cluster_name: Prefix used for both output file names.
        """
        out_dir = Path.cwd().parent / "models/clusters"
        out_dir.mkdir(parents=True, exist_ok=True)

        model_path = out_dir / f"{cluster_name}_model_v{self.version}.pkl"
        meta_path = out_dir / f"{cluster_name}_metadata_v{self.version}.json"

        joblib.dump(model, model_path)

        # Convert once; the same dict feeds the file and the verbose printout.
        payload = asdict(metadata)
        with meta_path.open('w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=4)

        logger.info(f"Saved model to: {model_path}")
        logger.info(f"Saved metadata to: {meta_path}")

        if not self.verbose:
            return
        print(json.dumps(payload, indent=2))

@dataclass
class GMM_ClusterModel(ClusterModel):
    """Gaussian-mixture clustering experiment.

    Fits a ``GaussianMixture`` on ``self.data``, scores the resulting labels
    and persists the model plus metadata through the base-class helper.
    """
    n_comp: int = 3                         # passed as n_components
    cov_type: str = 'full'                  # passed as covariance_type
    seed: int = 43                          # passed as random_state
    cluster_name: str = 'gaussian-mixture'  # prefix of the saved file names

    def train_model(self) -> None:
        """Fit the mixture, compute quality metrics and save model + metadata.

        The three metrics need at least two distinct labels and fewer labels
        than samples. When the fit does not satisfy that (for example
        ``n_comp=1`` or components that end up empty) the metrics are recorded
        as NaN instead of raising, so the fitted model is still saved. This
        mirrors what ``DBSCAN_ClusterModel`` does for degenerate results.
        """
        logger.info(f"Training model: [{self.cluster_name}] with {self.n_comp} components")

        model = GaussianMixture(
            n_components=self.n_comp,
            covariance_type=self.cov_type,
            random_state=self.seed
        )

        X = self.data.values if isinstance(self.data, pd.DataFrame) else self.data
        labels = model.fit_predict(X)
        n_clusters = len(np.unique(labels))

        if 1 < n_clusters < len(labels):
            sil = silhouette_score(X, labels)
            db = davies_bouldin_score(X, labels)
            ch = calinski_harabasz_score(X, labels)
        else:
            # The scorers would raise ValueError here; keep the run and flag it.
            logger.warning(
                f"[{self.cluster_name}] produced {n_clusters} distinct label(s); "
                "metrics are undefined and recorded as NaN"
            )
            sil = db = ch = float('nan')

        metrics = EvaluationMetric(
            silhouette=float(sil),
            davies_bouldin=float(db),
            calinski_harabasz=float(ch)
        )

        metadata = ClusteringMetadata(
            version=self.version,
            data_size=self.data_size,
            n_clusters=n_clusters,
            model=ModelInfo.from_kwargs(
                name=self.cluster_name,
                n_components=self.n_comp,
                seed=self.seed,
                covariance_type=self.cov_type
            ),
            metrics=metrics
        )

        self._save_metadata_to_local(model, metadata, self.cluster_name)

@dataclass
class DBSCAN_ClusterModel(ClusterModel):
    """DBSCAN clustering experiment.

    Fits ``DBSCAN`` on ``self.data``, scores the non-noise points and persists
    the model plus metadata through the base-class helper.
    """
    eps: float = 0.3
    min_samples: int = 5
    seed: int = 43                 # recorded in the metadata only; not passed to DBSCAN
    cluster_name: str = 'dbscan'   # prefix of the saved file names

    def train_model(self) -> None:
        """Fit DBSCAN, compute quality metrics and save model + metadata.

        Points labelled ``-1`` (noise) are excluded from the cluster count and
        from the metric computation. When fewer than two points or fewer than
        two clusters remain, all three metrics are recorded as NaN.
        """
        logger.info(f"Training model: [{self.cluster_name}] with eps={self.eps}, min_samples={self.min_samples}")

        estimator = DBSCAN(eps=self.eps, min_samples=self.min_samples)

        if isinstance(self.data, pd.DataFrame):
            X = self.data.values
        else:
            X = self.data
        labels = estimator.fit_predict(X)

        # Number of clusters, not counting the noise label.
        has_noise = -1 in labels
        n_clusters = np.unique(labels).size - int(has_noise)

        # Score only the points that were assigned to a cluster.
        mask = labels != -1
        kept_labels = labels[mask]
        sil = db = ch = float('nan')
        if kept_labels.size > 1 and np.unique(kept_labels).size > 1:
            kept_points = X[mask]
            sil = silhouette_score(kept_points, kept_labels)
            db = davies_bouldin_score(kept_points, kept_labels)
            ch = calinski_harabasz_score(kept_points, kept_labels)

        scores = EvaluationMetric(
            silhouette=float(sil),
            davies_bouldin=float(db),
            calinski_harabasz=float(ch)
        )
        model_info = ModelInfo.from_kwargs(
            name=self.cluster_name,
            eps=self.eps,
            min_samples=self.min_samples,
            seed=self.seed
        )
        metadata = ClusteringMetadata(
            version=self.version,
            data_size=self.data_size,
            n_clusters=n_clusters,
            model=model_info,
            metrics=scores
        )

        self._save_metadata_to_local(estimator, metadata, self.cluster_name)

@dataclass
class Kmeans_ClusterModel(ClusterModel):
    """K-means clustering experiment.

    Fits ``KMeans`` on ``self.data``, scores the resulting labels and persists
    the model plus metadata through the base-class helper.
    """
    n_cluster: int = 5              # passed as n_clusters
    seed: int = 43                  # passed as random_state
    cluster_name: str = 'k-means'   # prefix of the saved file names

    def train_model(self) -> None:
        """Fit k-means, compute quality metrics and save model + metadata.

        The three metrics need at least two distinct labels and fewer labels
        than samples. When the fit does not satisfy that (for example
        ``n_cluster=1`` or data with too few distinct points) the metrics are
        recorded as NaN instead of raising, so the fitted model is still
        saved. This mirrors what ``DBSCAN_ClusterModel`` does for degenerate
        results.
        """
        logger.info(f"Training model: [{self.cluster_name}] with n_clusters={self.n_cluster}")

        model = KMeans(n_clusters=self.n_cluster, random_state=self.seed)

        X = self.data.values if isinstance(self.data, pd.DataFrame) else self.data
        labels = model.fit_predict(X)
        n_clusters = len(np.unique(labels))

        if 1 < n_clusters < len(labels):
            sil = silhouette_score(X, labels)
            db = davies_bouldin_score(X, labels)
            ch = calinski_harabasz_score(X, labels)
        else:
            # The scorers would raise ValueError here; keep the run and flag it.
            logger.warning(
                f"[{self.cluster_name}] produced {n_clusters} distinct label(s); "
                "metrics are undefined and recorded as NaN"
            )
            sil = db = ch = float('nan')

        metrics = EvaluationMetric(
            silhouette=float(sil),
            davies_bouldin=float(db),
            calinski_harabasz=float(ch)
        )

        metadata = ClusteringMetadata(
            version=self.version,
            data_size=self.data_size,
            n_clusters=n_clusters,
            model=ModelInfo.from_kwargs(
                name=self.cluster_name,
                n_clusters=self.n_cluster,
                seed=self.seed
            ),
            metrics=metrics
        )

        self._save_metadata_to_local(model, metadata, self.cluster_name)


`Clustering Experiments`

In [9]:
# Restrict the experiments to the first `_size` rows of X.
# NOTE(review): this is a head slice in stored order, not a random sample;
# confirm the row order is not correlated with user or item before relying on
# the scores below.
_size = 5000
x_training = X.copy().iloc[:_size]
row_size = x_training.shape[0]

In [10]:
# Baseline Gaussian-mixture run with default hyper-parameters.
cluster = GMM_ClusterModel(
    data=x_training,
    data_size=row_size,
    version=1.0,
    verbose=True,
)
cluster.train_model()

2025-05-31 08:40:17,368 - INFO - Training model: [gaussian-mixture] with 3 components
2025-05-31 08:40:24,760 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/gaussian-mixture_model_v1.0.pkl
2025-05-31 08:40:24,762 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/gaussian-mixture_metadata_v1.0.json


{
  "version": 1.0,
  "data_size": 5000,
  "n_clusters": 3,
  "model": {
    "name": "gaussian-mixture",
    "params": {
      "n_components": 3,
      "seed": 43,
      "covariance_type": "full"
    }
  },
  "metrics": {
    "silhouette": 0.5374587944228525,
    "davies_bouldin": 0.5536497404230104,
    "calinski_harabasz": 20163.18579328827
  }
}


In [12]:
# Baseline DBSCAN run with default hyper-parameters.
dbscan_cluster = DBSCAN_ClusterModel(
    data=x_training,
    data_size=row_size,
    version=1.0,
    verbose=True,
)
dbscan_cluster.train_model()

2025-05-31 08:41:19,843 - INFO - Training model: [dbscan] with eps=0.3, min_samples=5
2025-05-31 08:41:20,493 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/dbscan_model_v1.0.pkl
2025-05-31 08:41:20,494 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/dbscan_metadata_v1.0.json


{
  "version": 1.0,
  "data_size": 5000,
  "n_clusters": 184,
  "model": {
    "name": "dbscan",
    "params": {
      "eps": 0.3,
      "min_samples": 5,
      "seed": 43
    }
  },
  "metrics": {
    "silhouette": 0.997201879386879,
    "davies_bouldin": 0.006046612848574331,
    "calinski_harabasz": 3419161830694.2065
  }
}


In [13]:
# Baseline k-means run with default hyper-parameters.
kmeans_cluster = Kmeans_ClusterModel(
    data=x_training,
    data_size=row_size,
    version=1.0,
    verbose=True,
)
kmeans_cluster.train_model()

2025-05-31 08:42:24,725 - INFO - Training model: [k-means] with n_clusters=5
2025-05-31 08:42:25,370 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/k-means_model_v1.0.pkl
2025-05-31 08:42:25,371 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-data-quest-v1/models/clusters/k-means_metadata_v1.0.json


{
  "version": 1.0,
  "data_size": 5000,
  "n_clusters": 5,
  "model": {
    "name": "k-means",
    "params": {
      "n_clusters": 5,
      "seed": 43
    }
  },
  "metrics": {
    "silhouette": 0.7042568273525505,
    "davies_bouldin": 0.32180743749555596,
    "calinski_harabasz": 88067.9334157285
  }
}


#### ***`Optimizing Clustering Techniques`***

`DBSCAN Parameter-Tuning`

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def dbscan_grid_scores(X, eps_values, min_samples_values):
    """Fit DBSCAN for every (eps, min_samples) pair and tabulate quality scores.

    Noise points (label ``-1``) are excluded from the cluster count and from
    the metric computation. The metrics are only defined when the kept points
    carry at least two distinct labels and fewer labels than points; otherwise
    the three scores are NaN for that combination.

    Args:
        X: Feature matrix supporting boolean-mask row selection (``X[mask]``).
        eps_values: Iterable of ``eps`` values to try.
        min_samples_values: Iterable of ``min_samples`` values to try.

    Returns:
        A DataFrame with one row per combination and the columns ``eps``,
        ``min_samples``, ``n_clusters``, ``silhouette``, ``db_index`` and
        ``calinski_harabasz``. The columns are present even when the grid is
        empty, so downstream pivots do not fail on a missing column.
    """
    columns = [
        'eps', 'min_samples', 'n_clusters',
        'silhouette', 'db_index', 'calinski_harabasz',
    ]
    # Materialise so a one-shot iterable is not exhausted after the first eps.
    min_samples_values = list(min_samples_values)
    results = []

    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

            mask = labels != -1
            kept_labels = labels[mask]
            n_clusters = len(np.unique(kept_labels))

            # The scorers require 2 <= n_labels <= n_samples - 1.
            if 1 < n_clusters < len(kept_labels):
                kept_points = X[mask]
                sil_score = silhouette_score(kept_points, kept_labels)
                db_score = davies_bouldin_score(kept_points, kept_labels)
                ch_score = calinski_harabasz_score(kept_points, kept_labels)
            else:
                sil_score, db_score, ch_score = np.nan, np.nan, np.nan

            results.append({
                'eps': eps,
                'min_samples': min_samples,
                'n_clusters': n_clusters,
                'silhouette': sil_score,
                'db_index': db_score,
                'calinski_harabasz': ch_score
            })

    return pd.DataFrame(results, columns=columns)

In [None]:
# Model Parameter Scores
# `X_dense` is not created by any earlier cell, so on a fresh kernel this cell
# raised NameError. It is derived here from the sample used by the baseline
# experiments.
# NOTE(review): confirm that `x_training` is the intended input for tuning.
X_dense = x_training.to_numpy()

# Small grid: 3 eps values x 2 min_samples values (6 fits).
# A wider sweep that was tried before used np.linspace(0.3, 2.0, 5) for eps and
# [3, 5, 8, 10, 15] for min_samples (25 fits).
eps_range = np.round(np.linspace(0.3, 2.0, 3), 2)
min_samples_range = [3, 5]
dbscan_score_df = dbscan_grid_scores(X_dense, eps_range, min_samples_range)

In [None]:
# Heatmap for DBSCAN parameters
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

def plot_metric_heatmap(df, metric, sci_notation=False):
    """Draw an eps x min_samples heatmap of one metric from the DBSCAN grid.

    Args:
        df: Frame with ``eps``, ``min_samples`` and the ``metric`` column.
        metric: Name of the column whose values fill the heatmap.
        sci_notation: When True, cell annotations and the colour-bar ticks
            use scientific notation; otherwise two-decimal fixed notation.
    """
    grid = df.pivot(index="eps", columns="min_samples", values=metric)

    if sci_notation:
        annot_fmt = ".2e"
        cbar_format = FuncFormatter(lambda x, _: f'{x:.0e}')
    else:
        annot_fmt = ".2f"
        cbar_format = None

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        grid,
        annot=True,
        fmt=annot_fmt,
        cmap="YlGnBu",
        linewidths=0.5,
        cbar_kws={'format': cbar_format},
    )

    plt.title(f"DBSCAN: {metric} Score")
    plt.xlabel("min_samples")
    plt.tight_layout()
    plt.show()


In [None]:
# One heatmap per metric; only Calinski-Harabasz uses scientific notation.
for metric_name, use_sci in (
    ("silhouette", False),
    ("db_index", False),
    ("calinski_harabasz", True),
):
    plot_metric_heatmap(dbscan_score_df, metric_name, use_sci)

##### *KMeans Parameter-Tuning*

In [None]:
# Apply the Elbow Method to determine the optimal number of clusters
def elbow_method_kmeans(X, min_value=2, max_value=10):
    """Plot inertia and silhouette score of k-means for a range of k.

    For every k in ``[min_value, max_value]`` a ``KMeans`` model
    (``random_state=42``) is fitted on ``X``. Inertia is recorded for every
    k; the silhouette score is recorded for k > 1 only, because it is not
    defined for a single cluster. Both curves are drawn side by side.

    Args:
        X: Feature matrix to cluster. The silhouette score is computed on
            this same matrix.
        min_value: Smallest number of clusters to try (inclusive).
        max_value: Largest number of clusters to try (inclusive).
    """
    k_values = list(range(min_value, max_value + 1))
    inertias = []
    silhouettes = {}  # k -> silhouette score, only for k where it is defined

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
        # Silhouette score is only defined for k > 1
        if k > 1:
            # Score against the data that was actually fitted, not a global.
            silhouettes[k] = silhouette_score(X, kmeans.labels_)

    # Plot Inertia vs. K (Elbow Method)
    plt.figure(figsize=(12, 5))
    plt.style.use('ggplot')

    plt.subplot(1, 2, 1)
    plt.plot(k_values, inertias, marker='o')
    plt.title("Elbow Method: Inertia vs. K")
    plt.xlabel("Number of Clusters (K)")
    plt.ylabel("Inertia")
    plt.grid(True)

    # Plot Silhouette Score vs. K, using exactly the k values that have a
    # score, so no valid point is dropped when min_value >= 2.
    plt.subplot(1, 2, 2)
    plt.plot(list(silhouettes), list(silhouettes.values()), marker='o', color='green')
    plt.title("Silhouette Score vs. K")
    plt.xlabel("Number of Clusters (K)")
    plt.ylabel("Silhouette Score")
    plt.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Sweep k from 1 to 50; for k=1 only the inertia is recorded.
# NOTE(review): `X_dense` is not created in this cell and must already exist in
# the kernel.
elbow_method_kmeans(X=X_dense, min_value=1, max_value=50)

##### *GMM Clustering*

In [None]:
# Score GMM fits across a range of n_components values.
# NOTE(review): `evaluate_gmm` is not defined in any cell of this notebook as
# shown, and `X_dense` is not created in this cell; on a fresh kernel this cell
# raises NameError unless both already exist — confirm where they come from.
n_components_range = range(2, 8)  # n_components = 2..7 (upper bound is exclusive)
gmm_results = evaluate_gmm(X_dense, n_components_range)

In [None]:
import matplotlib.pyplot as plt

def plot_gmm_scores(results_df):
    """Plot the three GMM quality metrics against ``n_components``.

    Args:
        results_df: Frame with the columns ``n_components``, ``silhouette``,
            ``db_index`` and ``calinski_harabasz``.
    """
    # (column, panel title, y-axis label) for each of the three panels.
    panels = (
        ('silhouette', 'Silhouette Score', 'Score'),
        ('db_index', 'Davies-Bouldin Index', 'Score (lower is better)'),
        ('calinski_harabasz', 'Calinski-Harabasz Index', 'Score (higher is better)'),
    )

    plt.figure(figsize=(14, 4))

    for position, (column, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        sns.lineplot(x='n_components', y=column, data=results_df, marker='o')
        plt.title(title)
        plt.xlabel('n_components')
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.show()


In [None]:
# Draw the three metric curves for the GMM sweep.
plot_gmm_scores(results_df=gmm_results)

#### ***`Hybrid RecoSys`***: Supervised Learning + Clustering

In [None]:
# Personalized Recommendation system
def personalized_recosys_(
    data: pd.DataFrame,
    user_id: int,
    predicted_target: str,
    predicted_cluster: int,
    top_n: Union[int, None] = None,
) -> pd.DataFrame:
    """Rank items by how often they occur for a given cluster and target.

    Rows of ``data`` are counted per (``gmm_cluser``, ``target``, ``item_id``)
    combination; the counts for the requested cluster and target are returned
    in descending order.

    Args:
        data: Frame with the columns ``gmm_cluser``, ``target`` and ``item_id``.
        user_id: Currently unused; kept so existing callers keep working.
        predicted_target: Target value to filter on.
        predicted_cluster: Cluster label to filter on.
        top_n: If given, only the first ``top_n`` ranked items are returned.
            Expected to be non-negative. ``None`` (the default) returns
            every matching item.

    Returns:
        A frame with the columns ``item_id`` and ``ranked_similarity`` (the
        occurrence count), sorted by count descending with a fresh 0..n-1
        index. Empty when nothing matches.
    """
    # Group and count item frequency by cluster, target, and item
    group_recosys = (
        data
        .groupby(['gmm_cluser', 'target', 'item_id'])
        .size()
        .reset_index(name='ranked_similarity')
    )

    # Filter for the user's predicted cluster and target
    in_scope = (
        (group_recosys['gmm_cluser'] == predicted_cluster) &
        (group_recosys['target'] == predicted_target)
    )
    items = group_recosys[in_scope].sort_values(by='ranked_similarity', ascending=False)

    top_ranked_items = items[['item_id', 'ranked_similarity']]
    if top_n is not None:
        top_ranked_items = top_ranked_items.head(top_n)
    return top_ranked_items.reset_index(drop=True)

In [None]:
# Supervised Learning Model to predict user interaction (click or checkout)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model artifacts from a trusted location.
file_name_model = Path('.').cwd().parent / 'models/classifiers/xgbclassifier_model_v0.1.pkl'
xgb_classifier = joblib.load(file_name_model)
# One evaluation row (positional index 99), kept as a one-row DataFrame.
x_val = modelpoint_eval.iloc[99:100]
# The label and the two id columns are dropped before predicting.
predicted_val = xgb_classifier.predict(x_val.drop(['user_id', 'target','item_id'], axis=1))
# NOTE(review): `le` is not defined in any cell of this notebook as shown (only
# the LabelEncoder class is imported). Presumably it is the encoder fitted when
# the classifier was trained — confirm and load or fit it before this line.
predicted_int = le.inverse_transform(predicted_val)[0]
predicted_int

In [None]:
# Attach a predicted GMM cluster label to every row, then query the recommender.
# NOTE(review): `gm_cluser_model`, `data_cluster` and `data_cluster1` are not
# defined in any cell of this notebook as shown — confirm where they come from.
# NOTE(review): labels are predicted from `data_cluster` but attached to a copy
# of `data_cluster1`; verify both have the same rows in the same order.
gmm_labels = gm_cluser_model['model'].predict(data_cluster)
data_ = data_cluster1.copy()
data_['gmm_cluser'] = gmm_labels


# Example query with hard-coded inputs.
user_id = 14454
predicted_target = 'checkout'
predicted_cluster = 1
personalized_recosys_(data_, user_id, predicted_target, predicted_cluster)