In [1]:
import os

In [2]:
%pwd

'd:\\Repositories\\ml-churn\\research'

In [3]:
# Run the rest of the notebook from the project root, which is the parent of
# the research/ folder this notebook lives in. The step is relative (no
# machine-specific absolute path) and guarded so that re-running the cell
# does not climb further up the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
%pwd

'd:\\Repositories\\ml-churn'

In [46]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataClusteringConfig:
    """Immutable bundle of settings for the data-clustering stage.

    Instances are built by ``ConfigurationManager.get_data_clustering_config``
    from the YAML config, params and schema files. ``frozen=True`` makes the
    fields read-only after construction.
    """
    # NOTE(review): the path fields are annotated as Path, but the values are
    # assigned straight from the YAML-backed config without conversion —
    # confirm the actual runtime type.

    # Directory that receives the stage's outputs (created before use).
    root_dir: Path
    # CSV file holding the training data to be clustered.
    train_data_path: Path
    # File name (inside root_dir) under which the fitted model is saved.
    model_name: str
    # Number of clusters, read from params.KPrototypes.N_CLUSTERS.
    n_clustering: int
    # Name of the target column, read from schema.TARGET_COLUMN.target_1.
    target_column: str

In [47]:
from Churn_analysis.constants import *
from Churn_analysis.utils.common import read_yaml, create_directories

In [48]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Read order is config -> params -> schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_clustering_config(self) -> DataClusteringConfig:
        """Build the settings object for the data-clustering stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataClusteringConfig`` populated from the ``data_clustering``
            section of the config, the ``KPrototypes`` section of the params
            and the ``TARGET_COLUMN`` section of the schema.
        """
        # Look up every section first so a missing one fails before any
        # directory is created.
        stage_settings = self.config.data_clustering
        kprototypes_params = self.params.KPrototypes
        target_columns = self.schema.TARGET_COLUMN

        create_directories([stage_settings.root_dir])

        return DataClusteringConfig(
            root_dir=stage_settings.root_dir,
            train_data_path=stage_settings.train_data_path,
            model_name=stage_settings.model_name,
            n_clustering=kprototypes_params.N_CLUSTERS,
            target_column=target_columns.target_1,
        )
    

In [49]:
import os 
from Churn_analysis import logger
import pandas as pd
import joblib
from kmodes.kprototypes import KPrototypes

In [50]:
class DataClustering():
    """Clusters the training data with K-Prototypes and persists the results."""

    # Identifier column that is removed before clustering and not written out.
    ID_COLUMN = "customerID"

    # Positions of the categorical columns inside the feature frame, i.e.
    # counted after the ID and target columns have been removed. Position 4
    # and every position from 17 onwards are treated as numerical.
    CATEGORICAL_COLUMN_INDICES = (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)

    # Name of the column that receives the predicted cluster label.
    CLUSTER_COLUMN = "Groups"

    # File name (inside config.root_dir) of the labelled data set.
    CLUSTERED_DATA_FILENAME = "Clustered_data.csv"

    def __init__(self, config: DataClusteringConfig):
        """Store the stage settings.

        Args:
            config: Paths, model file name, cluster count and target column
                name for this stage.
        """
        self.config = config

    def get_data_clustering_object(self):
        """Fit K-Prototypes on the training data and save model and labels.

        Steps:
            1. Read the training CSV and drop the ID column.
            2. Fit ``KPrototypes`` on every column except the target column
               named by ``config.target_column``.
            3. Predict a cluster for each row and store it in a ``Groups``
               column next to the original data (target column included).
            4. Write the labelled data to ``Clustered_data.csv`` and dump the
               fitted model with joblib, both inside ``config.root_dir``.

        Returns:
            None. The results are the two files written to disk.
        """
        # Resolve the output locations once so the files written and the
        # paths reported in the log cannot drift apart.
        model_path = os.path.join(self.config.root_dir, self.config.model_name)
        clustered_data_path = os.path.join(self.config.root_dir,
                                           self.CLUSTERED_DATA_FILENAME)
        categorical_indices = list(self.CATEGORICAL_COLUMN_INDICES)

        # Read data and drop the identifier column.
        train_data_df = pd.read_csv(self.config.train_data_path).drop(columns=self.ID_COLUMN)

        # The target column comes from the configuration instead of a
        # hard-coded name; it is excluded from the clustering features only.
        features_df = train_data_df.drop(columns=self.config.target_column)

        # NOTE(review): no random_state is passed, so cluster assignments may
        # differ between runs — consider exposing a seed through the config.
        kP = KPrototypes(n_clusters=self.config.n_clustering,
                         init='Huang', n_init=1, verbose=True)
        clustering_model = kP.fit(features_df, categorical=categorical_indices)

        # Predict the group of every training row.
        cluster_labels = kP.predict(features_df, categorical=categorical_indices)

        # Attach the labels positionally, so the result does not depend on
        # the index of the training frame.
        clustered_data = train_data_df.assign(**{self.CLUSTER_COLUMN: cluster_labels})
        clustered_data.to_csv(clustered_data_path, index=False)

        joblib.dump(clustering_model, model_path)

        logger.info(f"Clustering model is saved in {model_path}")
        logger.info(f"Clustered data is saved in {clustered_data_path}")


        

In [51]:
# Run the data-clustering stage end to end: load the configuration, build the
# stage settings, then fit and persist the clustering model.
try:
    config = ConfigurationManager()
    data_clustering_config = config.get_data_clustering_config()
    data_clustering = DataClustering(config = data_clustering_config)
    data_clustering.get_data_clustering_object()

except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # the same exception unchanged so the cell still fails visibly.
    logger.exception("Data clustering stage failed")
    raise

[2023-12-15 15:44:46,842: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-15 15:44:46,844: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-15 15:44:46,847: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-15 15:44:46,848: INFO: common: created directory at: artifacts]
[2023-12-15 15:44:46,850: INFO: common: created directory at: artifacts/data_clustering]
Init: initializing centroids
Init: initializing clusters
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 33, ncost: 3722822436.20571
Run: 1, iteration: 2/100, moves: 37, ncost: 3720369628.596716
Run: 1, iteration: 3/100, moves: 24, ncost: 3719464456.08979
Run: 1, iteration: 4/100, moves: 4, ncost: 3719444405.9605927
Run: 1, iteration: 5/100, moves: 0, ncost: 3719444405.9605927
[2023-12-15 15:44:49,531: INFO: 2900050764: Clustering model is saved in artifacts/data_clustering\clustering_model.joblib]
[2023-12-1