In [1]:
import pandas as pd


In [3]:
## main
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, argparse
from imblearn.over_sampling import SMOTE
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn_features.transformers import DataFrameSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, roc_curve, auc

import warnings
warnings.filterwarnings('ignore')

In [4]:
## --------------------- Data Preparation ---------------------------- ##

## Location of the dataset CSV. Written as a raw string: the previous plain
## literal contained "\p", an invalid escape sequence that newer Python versions
## warn about. The runtime value of the path is unchanged.
DATA_PATH = r"E:\projects/dataset.csv"

## Read the Dataset
df = pd.read_csv(DATA_PATH)

## Drop first 3 features
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

## Filter on the Age feature: remove every row whose Age is above the threshold (80)
df = df.drop(index=df.index[df['Age'] > 80])


## To features and target
X = df.drop(columns=['Exited'])
y = df['Exited']

## Split to train and test (80/20, shuffled, stratified on the target, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=45, stratify=y)

In [5]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2591,634,Germany,Male,38,2,148430.55,1,1,1,56055.72
1980,554,France,Female,30,9,0.0,2,1,1,40320.3
9866,667,France,Male,24,4,0.0,2,0,0,180329.83
7871,676,France,Female,36,3,91711.59,1,1,1,95393.43
5504,786,France,Male,32,2,120452.4,2,0,0,79602.86


In [6]:
## Slice the lists
num_cols = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']
categ_cols = ['Gender', 'Geography']

## Every remaining column, kept in the DataFrame's own column order.
## (A set difference would yield an order that can change between kernel
## sessions, because string hashing is randomized per process.)
ready_cols = [col for col in X_train.columns if col not in num_cols and col not in categ_cols]

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define the column groups
num_cols = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']
categ_cols = ['Gender', 'Geography']
# Remaining columns in the DataFrame's own order, so the column layout of the
# transformed matrices is the same on every run (a set difference is unordered).
ready_cols = [col for col in X_train.columns if col not in num_cols and col not in categ_cols]

# Build the main preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical columns: median imputation, then standardization
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        # Categorical columns: most-frequent imputation, then one-hot encoding
        # with the first category of each feature dropped
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='first', sparse_output=False))
        ]), categ_cols),
        # Remaining columns: most-frequent imputation only
        ('ready', SimpleImputer(strategy='most_frequent'), ready_cols)
    ],
    remainder='passthrough'  # for any additional columns that were not listed
)

# Apply the preprocessing: fit on the training set only, then reuse the
# fitted transformers on the test set
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

In [15]:
## --------------------- Imbalance handling ---------------------------- ##

# 1. use algorithm without taking the effect of imbalancing

## 2. prepare class_weights for solving imbalance dataset
# The code calculates class weights to address class imbalance:
#
# 1. `np.bincount(y_train) / len(y_train)`: finds the relative frequency of each class.
# 2. `1 - (...)`: inverts the frequencies, giving more weight to underrepresented classes.
# 3. `vals_count / np.sum(vals_count)`: normalizes the weights so they sum to 1.
#
# This ensures less frequent classes get higher weights, helping the model focus on them during training.

vals_count = 1 - (np.bincount(y_train) / len(y_train))
vals_count = vals_count / np.sum(vals_count)  ## normalizing

## Map each class label (its position in the bincount result) to its weight
dict_weights = dict(enumerate(vals_count))

## 3. Using SMOTE for over sampling
## random_state is fixed so the synthetic samples are the same on every run
## (45 matches the seed used for the train/test split).
over = SMOTE(sampling_strategy=0.7, random_state=45)
## NOTE: the misspelled name `X_train_resmapled` is kept as-is because other
## cells may refer to it.
X_train_resmapled, y_train_resampled = over.fit_resample(X_train_final, y_train)

In [17]:
## --------------------- Modeling ---------------------------- ##

def train_model(X_train, y_train, plot_name, C: float, penalty: str, class_weight=None):
    """Train a logistic-regression classifier and record the run in MLflow.

    The model is fitted on the given training data and evaluated on the
    notebook-level test set (`X_test_final`, `y_test`), which is read from the
    enclosing scope rather than passed in. Within a single MLflow run under the
    'churn-detection' experiment, the function logs the hyper-parameters,
    accuracy and F1 score, the fitted model, a confusion-matrix figure and a
    ROC-curve figure.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to `LogisticRegression.fit`.
    plot_name
        Label used as the confusion-matrix title and as the prefix of the
        logged model path and figure artifact names.
    C : float
        Value forwarded to `LogisticRegression(C=...)`.
    penalty : str
        Value forwarded to `LogisticRegression(penalty=...)`.
    class_weight : optional
        Value forwarded to `LogisticRegression(class_weight=...)`.

    Returns
    -------
    None
    """
    mlflow.set_experiment('churn-detection')
    with mlflow.start_run():
        mlflow.set_tag('clf', 'logistic')

        # Try LR
        clf = LogisticRegression(C=C, penalty=penalty, random_state=45, class_weight=class_weight)
        clf.fit(X_train, y_train)
        y_pred_test = clf.predict(X_test_final)
        # Scores for the second class in `clf.classes_` (label 1 for a 0/1 target).
        # The ROC curve needs continuous scores: hard 0/1 predictions give
        # only a single operating point and a misleading AUC.
        y_score_test = clf.predict_proba(X_test_final)[:, 1]

        ## metrics
        f1_test = f1_score(y_test, y_pred_test)
        acc_test = accuracy_score(y_test, y_pred_test)

        # Log params, metrics, and model
        mlflow.log_params({'C': C, 'penalty': penalty})
        mlflow.log_metrics({'accuracy': acc_test, 'f1-score': f1_test})
        mlflow.sklearn.log_model(clf, f'{clf.__class__.__name__}/{plot_name}')

        ## Plot the confusion matrix and save it to mlflow
        fig_cm, ax_cm = plt.subplots(figsize=(10, 6))
        sns.heatmap(confusion_matrix(y_test, y_pred_test), annot=True, cbar=False,
                    fmt='.2f', cmap='Blues', ax=ax_cm)
        ax_cm.set_title(f'{plot_name}')
        # Heatmap cells are centred at +0.5, so place the class labels there
        tick_positions = np.arange(2) + 0.5
        ax_cm.set_xticks(tick_positions)
        ax_cm.set_xticklabels([False, True])
        ax_cm.set_yticks(tick_positions)
        ax_cm.set_yticklabels([False, True])

        # Save the plot to MLflow, then release the figure
        mlflow.log_figure(figure=fig_cm, artifact_file=f'{plot_name}_conf_matrix.png')
        plt.close(fig_cm)

        # Compute ROC curve and AUC from the predicted probabilities
        fpr, tpr, _ = roc_curve(y_test, y_score_test)
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve and save it to mlflow
        fig_roc, ax_roc = plt.subplots()
        ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal reference line
        ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax_roc.set_xlim([0.0, 1.0])
        ax_roc.set_ylim([0.0, 1.05])
        ax_roc.set_xlabel('False Positive Rate')
        ax_roc.set_ylabel('True Positive Rate')
        ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax_roc.legend(loc="lower right")

        # Save the plot to MLflow, then release the figure
        mlflow.log_figure(figure=fig_roc, artifact_file=f'{plot_name}_roc_curve.png')
        plt.close(fig_roc)
