## Data Ingestion

In [2]:
import os
os.chdir(r'E:\MLOps\Churn MLOps\churn-mlops')

In [3]:
%pwd

'E:\\MLOps\\Churn MLOps\\churn-mlops'

In [16]:
import pandas as pd
import numpy as np
import argparse
from churn_mlops.utils.common import read_yaml
from churn_mlops.constants import *

def load_data(data_path : Path, model_var):
    df = pd.read_csv(data_path)
    df = df[model_var]
    return df

def load_raw_data(config_path):
    config = read_yaml(config_path)
    extrenal_data_path = config.data_config.external_data_path
    raw_data_path = config.data_config.raw_data_path
    model_var = config.data_config.model_var

    df = load_data(extrenal_data_path, model_var)
    df.to_csv(raw_data_path, index=False)

if __name__ == "__main__":
    load_raw_data(CONFIG_FILE_PATH)

## Data Transformation

In [6]:
import pandas as pd
import numpy as np
from churn_mlops import logger
from churn_mlops.constants import *
from churn_mlops.utils.common import read_yaml
from sklearn.model_selection import train_test_split

#function to split data
def split_data(df, train_data_path, test_data_path, split_ratio, random_state):
    train, test = train_test_split(df, test_size=split_ratio, random_state=random_state)
    train.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    test.to_csv(test_data_path, sep=",", index=False, encoding="utf-8")

#function to save the split data
def split_and_save(config_path):
    config = read_yaml(config_path)

    raw_data = config.data_config.raw_data_path
    train_data_path = config.transformed_data_config.churn_train_data_path
    test_data_path = config.transformed_data_config.churn_test_data_path

    split_ratio = config.data_config.split_ratio
    rnadom_state = config.data_config.random_state

    df = pd.read_csv(raw_data)
    split_data(df, train_data_path, test_data_path, split_ratio, rnadom_state)

    logger.info(f"Data split is done and saved at {train_data_path} and {test_data_path}")

if __name__ == "__main__":
    split_and_save(CONFIG_FILE_PATH)

[2024-03-26 21:57:14,534: INFO: 3170729842: Data split is done and saved at data\transformed_data\churn_train_data.csv and data\transformed_data\churn_test_data.csv]


## Train Model

In [7]:
import joblib
import mlflow
import pandas as pd


from churn_mlops import logger
from churn_mlops.constants import *
from churn_mlops.utils.common import read_yaml

from sklearn.ensemble import RandomForestClassifier

# function to load features and target
def load_features_target(df, target):
    x = df.drop(target, axis=1)
    y = df[[target]]
    return x,y

# function to traine and evaluate the model
def train_and_evaluate(config_path):
    # read the config
    config = read_yaml(config_path)
 
    #data paths and target variable
    train_data_path = config.transformed_data_config.churn_train_data_path
    target = config.data_config.target

    #model params
    max_depth = config.random_forest.max_depth
    n_estimators = config.random_forest.n_estimators

    # read the train and test data
    train_df = pd.read_csv(train_data_path)
    x_train, y_train = load_features_target(train_df, target)
   
    model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
    model.fit(x_train, y_train)

    joblib.dump(model, config.model_dir)



if __name__ == "__main__":
    train_and_evaluate(CONFIG_FILE_PATH) 
        
    

  return fit_method(estimator, *args, **kwargs)


## Model Training and Evaluation with MLflow

In [None]:

import joblib
import mlflow
import pandas as pd
from urllib.parse import urlparse

from churn_mlops import logger
from churn_mlops.constants import *
from churn_mlops.utils.common import read_yaml

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score, confusion_matrix, classification_report


# function to evaluate the model
def evaluation_metrics(y_test, pred, avg_method):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average=avg_method)
    recall = recall_score(y_test, pred, average=avg_method) 
    f1 = f1_score(y_test, pred, average=avg_method)
    cm = confusion_matrix(y_test, pred) 

    target_names = ['0', '1']
    classification_rep = classification_report(y_test, pred, target_names=target_names)

    return accuracy, precision, recall, f1, cm, classification_rep

# function to load features and target
def load_features_target(df, target):
    x = df.drop(target, axis=1)
    y = df[[target]]
    return x,y


def train_and_evaluate(config_path):
    # read the config and params
    config = read_yaml(config_path)

    #data paths and target variable
    train_data_path = config.transformed_data_config.churn_train_data_path
    test_data_path = config.transformed_data_config.churn_test_data_path
    target = config.data_config.target

    #model params
    max_depth = config.random_forest.max_depth
    n_estimators = config.random_forest.n_estimators

    # read the train and test data
    train_df = pd.read_csv(train_data_path)
    x_train, y_train = load_features_target(train_df, target)

    # read the test data
    test_df = pd.read_csv(test_data_path)
    x_test, y_test = load_features_target(test_df, target)

    # random_forest_model = joblib.load(config.model_dir)
    random_forest_model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
    random_forest_model.fit(x_train, y_train)
    

    mlflow.set_tracking_uri(config.mlflow_config.mlflow_tracking_uri)
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    with mlflow.start_run():
        pred = random_forest_model.predict(x_test)

        accuracy, precision, recall, f1, cm, classification_rep = evaluation_metrics(y_test, pred, "weighted")
    
        mlflow.log_param("max_depth",max_depth)
        mlflow.log_param("n_estimators", n_estimators)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(random_forest_model, "model", registered_model_name="RandomForestModel")
        else:
            mlflow.sklearn.log_model(random_forest_model, "model")

if __name__ == "__main__":
    train_and_evaluate(CONFIG_FILE_PATH)

In [None]:

set MLFLOW_TRACKING_URI=tracking-uri

set MLFLOW_TRACKING_USERNAME=user-name 

set MLFLOW_TRACKING_PASSWORD=your-password


## Model Selection for Production

In [None]:
import joblib
import mlflow
from churn_mlops import logger
from churn_mlops.constants import *
from churn_mlops.utils.common import read_yaml
from mlflow.tracking import MlflowClient
from pprint import pprint

# function to select the best model
def production_model(config_path):
    config = read_yaml(config_path)

    mlflow_tracking_uri = config.mlflow_config.mlflow_tracking_uri
    model_dir = config.model_dir
    model_name = config.mlflow_config.registered_model_name

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    runs = mlflow.search_runs(search_all_experiments=True)
    max_accuracy = max(runs["metrics.accuracy"])
    # default_run_id = "None"  # Or any other default value
    # max_accuracy_run_id = list(runs[runs["metrics.accuracy"] == max_accuracy]["run_id"] or [default_run_id])[0]

    max_accuracy_run_id = list(runs[runs["metrics.accuracy"] == max_accuracy]["run_id"])[0]

    client = MlflowClient()
    for mv in client.search_model_versions(f"name='{model_name}'"):

        if mv.run_id == max_accuracy_run_id:
            model_version = mv.version
            logged_model = mv.source
            pprint(mv, indent=4)
            client.transition_model_version_stage(
                name=model_name,
                version=model_version,
                stage="Production"
            )
            logger.info(f"Model with name {model_name} and version {model_version} is set to Production stage")
        else:
            model_version = mv.version
            client.transition_model_version_stage(
                name= model_name,
                version=model_version,
                stage="Staging"
            )
            logger.info(f"Model with name {model_name} and version {model_version} is set to Staging stage")

    loaded_model = mlflow.pyfunc.load_model(logged_model)
    joblib.dump(loaded_model, model_dir)
    logger.info(f"Best model is saved at {model_dir}")

if __name__ == "__main__":
    production_model(CONFIG_FILE_PATH)


