In [15]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
import pickle

In [2]:
df= pd.read_csv(r'C:\Users\punee\Projects\MLflow starter\Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


LABELENCODING AND ONEHOTENCODER

In [3]:
from sklearn.preprocessing import LabelEncoder
label_encoder= LabelEncoder()
df['Gender']=label_encoder.fit_transform(df['Gender'])

In [4]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder= OneHotEncoder()
encoded_data=one_hot_encoder.fit_transform(df[['Geography']]).toarray()
encoded_table=pd.DataFrame(encoded_data,columns=one_hot_encoder.get_feature_names_out(['Geography']))
encoded_table

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,Geography_nan
0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
9997,1.0,0.0,0.0,0.0
9998,0.0,1.0,0.0,0.0
9999,0.0,1.0,0.0,0.0
10000,1.0,0.0,0.0,0.0


DATA CLEANING

In [5]:
df= pd.concat([df,encoded_table],axis=1)


In [6]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Geography_nan
0,1,15634602,Hargrave,619,France,0,42.0,2,0.0,1,1.0,1.0,101348.88,1,1.0,0.0,0.0,0.0
1,2,15647311,Hill,608,Spain,0,41.0,1,83807.86,1,0.0,1.0,112542.58,0,0.0,0.0,1.0,0.0
2,3,15619304,Onio,502,France,0,42.0,8,159660.8,3,1.0,0.0,113931.57,1,1.0,0.0,0.0,0.0
3,4,15701354,Boni,699,France,0,39.0,1,0.0,2,0.0,0.0,93826.63,0,1.0,0.0,0.0,0.0
4,5,15737888,Mitchell,850,Spain,0,43.0,2,125510.82,1,,1.0,79084.1,0,0.0,0.0,1.0,0.0


In [7]:
df.isnull().sum()

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            1
Gender               0
Age                  1
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            1
IsActiveMember       1
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
Geography_nan        0
dtype: int64

In [8]:
df=df.dropna(axis=1)

In [9]:
df = df.drop(['Surname','RowNumber'], axis='columns')


In [10]:
df.drop(['Geography_nan'],axis=1)

Unnamed: 0,CustomerId,CreditScore,Gender,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,15634602,619,0,2,0.00,1,101348.88,1,1.0,0.0,0.0
1,15647311,608,0,1,83807.86,1,112542.58,0,0.0,0.0,1.0
2,15619304,502,0,8,159660.80,3,113931.57,1,1.0,0.0,0.0
3,15701354,699,0,1,0.00,2,93826.63,0,1.0,0.0,0.0
4,15737888,850,0,2,125510.82,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
9997,15584532,709,0,7,0.00,1,42085.58,1,1.0,0.0,0.0
9998,15682355,772,1,3,75075.31,2,92888.52,1,0.0,1.0,0.0
9999,15682355,772,1,3,75075.31,2,92888.52,1,0.0,1.0,0.0
10000,15628319,792,0,4,130142.79,1,38190.78,0,1.0,0.0,0.0


INTIALIZE GRID SEARCH CV 

In [11]:
clf = RandomForestClassifier()
param_grid= {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(estimator=clf,param_grid=param_grid,cv=5,n_jobs=-1,verbose=2,
                             scoring={'precision': 'precision_macro', 
                                    'recall': 'recall_macro',
                                    'f1': 'f1_macro'}, refit='f1')


TRAIN-TEST-SPLIT

In [12]:
X=df.drop('Exited',axis=1)
y=df['Exited']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)


SMOTE-HANDLING IMBALANCED DATA

In [25]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE on training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Now X_train_smote and y_train_smote are your balanced datasets



MLFLOW DEVELOPMENT

In [26]:

from mlflow.models import infer_signature
signature=infer_signature(X_train,y_train)

# Set experiment (create a new experiment or use an existing one)
mlflow.set_experiment('Churn Model')
# Now start logging the experiment details
## Tracking uri 
tracking_uri=mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
with mlflow.start_run():
    grid_search.fit(X_train,y_train)
    ##Get the best model
    best_model=grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_true=y_test
    
    mlflow.log_param('best_n_estimators', best_model.n_estimators)
    mlflow.log_param('best_max_depth', best_model.max_depth)
    mlflow.log_param('best_min_samples_split', best_model.min_samples_split)
    mlflow.log_param('best_min_samples_leaf', best_model.min_samples_leaf)
    # Log metrics to MLflow
    mlflow.log_metric('precision', precision_score(y_true, y_pred, average='macro'))
    mlflow.log_metric('recall', recall_score(y_true, y_pred, average='macro'))
    mlflow.log_metric('f1', f1_score(y_true, y_pred, average='macro'))
    
    # If you want to log individual metrics
    report_dict = classification_report(y_test, y_pred, output_dict=True)

    # Log precision, recall, f1 for class 0 and class 1
    mlflow.log_metric("precision_class_0", report_dict['0']['precision'])
    mlflow.log_metric("recall_class_0", report_dict['0']['recall'])
    mlflow.log_metric("f1_class_0", report_dict['0']['f1-score'])

    mlflow.log_metric("precision_class_1", report_dict['1']['precision'])
    mlflow.log_metric("recall_class_1", report_dict['1']['recall'])
    mlflow.log_metric("f1_class_1", report_dict['1']['f1-score'])

    mlflow.sklearn.log_model(best_model, "random_forest_model", registered_model_name="Churn_RF_Model")

    mlflow.end_run()



Fitting 5 folds for each of 24 candidates, totalling 120 fits


Registered model 'Churn_RF_Model' already exists. Creating a new version of this model...
2025/02/13 15:21:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Churn_RF_Model, version 2


🏃 View run rumbling-trout-454 at: http://127.0.0.1:5000/#/experiments/647740690061947287/runs/0182f2426cc84bb69e2e8063098bd053
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/647740690061947287


Created version '2' of model 'Churn_RF_Model'.
