In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:

import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))


In [50]:
import numpy as np
import pandas as pd
import optuna
import re
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score,f1_score,precision_score
from optimization.optimize_ensemble import *

In [51]:
transformed_featured_train_set = pd.read_csv(r"../data/processed/transformed_featured_train_set.csv").drop(columns="Unnamed: 0")
transformed_featured_val_set = pd.read_csv(r"../data/processed/transformed_featured_val_set.csv").drop(columns="Unnamed: 0")
transformed_featured_test_set = pd.read_csv(r"../data/processed/transformed_featured_test_set.csv").drop(columns="Unnamed: 0")
transformed_featured_final_train_set = pd.read_csv(r"../data/processed/transformed_featured_final_train_set.csv").drop(columns="Unnamed: 0")
lgb_featured_study = pd.read_csv(r"../data/processed/lgb_featured_study.csv").drop(columns="Unnamed: 0")
xgb_featured_study = pd.read_csv(r"../data/processed/xgb_featured_study.csv").drop(columns="Unnamed: 0")
catboost_featured_study = pd.read_csv(r"../data/processed/catboost_featured_study.csv").drop(columns="Unnamed: 0")
nn_featured_study = pd.read_csv(r"../data/processed/nn_featured_study.csv").drop(columns="Unnamed: 0")

In [52]:

def objective(trial):
    X_train, y_train = transformed_featured_train_set.drop('Churn', axis=1), transformed_featured_train_set['Churn']  # Extract training features and labels
    X_val, y_val = transformed_featured_val_set.drop('Churn', axis=1), transformed_featured_val_set['Churn']  # Extract validation features and labels
    
    # Sample indices for hyperparameters from different models
    lgb_study = drop_unnessesary_columns(lgb_featured_study.copy())  # Prepare LightGBM study data
    xgb_study = drop_unnessesary_columns(xgb_featured_study.copy())  # Prepare XGBoost study data
    cat_study = drop_unnessesary_columns(catboost_featured_study.copy())  # Prepare CatBoost study data
    nn_study = drop_unnessesary_columns(nn_featured_study.copy())  # Prepare Neural Network study data
    
    lgb_idx = trial.suggest_int('lgb_idx', 0, len(lgb_study) - 1)  # Sample index for LightGBM hyperparameters
    xgb_idx = trial.suggest_int('xgb_idx', 0, len(xgb_study) - 1)  # Sample index for XGBoost hyperparameters
    cat_idx = trial.suggest_int('cat_idx', 0, len(cat_study) - 1)  # Sample index for CatBoost hyperparameters
    nn_idx = trial.suggest_int('nn_idx', 0, len(nn_study) - 1)  # Sample index for Neural Network hyperparameters
    
    # Fetch hyperparameters and clean them
    lgb_params = clean_hyperparameters(lgb_study.iloc[lgb_idx].to_dict())  # Get LightGBM parameters
    xgb_params = clean_hyperparameters(xgb_study.iloc[xgb_idx].to_dict())  # Get XGBoost parameters
    cat_params = clean_hyperparameters(cat_study.iloc[cat_idx].to_dict())  # Get CatBoost parameters
    nn_params = clean_hyperparameters(nn_study.iloc[nn_idx].to_dict())  # Get Neural Network parameters
    
    # Set additional fixed hyperparameters
    lgb_params['verbose'] = -1  # Silence LightGBM output
    xgb_params['verbose'] = 0  # Silence XGBoost output
    cat_params['early_stopping_rounds'] = 3000  # Set early stopping rounds for CatBoost
    cat_params['iterations'] = 200  # Set number of iterations for CatBoost

    # Weights for the voting classifier
    lgb_weight = trial.suggest_float('lgb_weight', 0.1, 1.0)  # Suggest weight for LightGBM
    xgb_weight = trial.suggest_float('xgb_weight', 0.1, 1.0)  # Suggest weight for XGBoost
    cat_weight = trial.suggest_float('cat_weight', 0.1, 1.0)  # Suggest weight for CatBoost
    nn_weight = trial.suggest_float('nn_weight', 0.1, 1.0)  # Suggest weight for Neural Network

    weights = {
        'lgb': lgb_weight,
        'xgb': xgb_weight,
        'cat': cat_weight,
        'nn': nn_weight
    }  # Store weights in a dictionary
    
    #Let’s give these models some workout time. First up, LightGBM. 
    #It's like the gym but for data—let’s see if it can lift those predictions high!
    
    # Train and predict with LightGBM
    lgb_train = lgb.Dataset(X_train, label=y_train)  # Prepare LightGBM dataset
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)  # Train LightGBM model
    lgb_preds = lgb_model.predict(X_val)  # Predict on validation set with LightGBM
    
    #Phew, LightGBM is done. Now let’s see if XGBoost can boost our mood... or just our predictions!"
    
    # Train and predict with XGBoost
    xgb_model = xgb.XGBClassifier(**xgb_params)  # Initialize XGBoost model with parameters
    xgb_model.fit(X_train, y_train)  # Train XGBoost model
    xgb_preds = xgb_model.predict_proba(X_val)[:, 1]  # Predict on validation set with XGBoost and get probabilities

    # Train and predict with CatBoost
    cat_model = cb.CatBoostClassifier(**cat_params, verbose=0)  # Initialize CatBoost model with parameters
    cat_model.fit(X_train, y_train)  # Train CatBoost model
    cat_preds = cat_model.predict_proba(X_val)[:, 1]  # Predict on validation set with CatBoost and get probabilities

    # Train and predict with Neural Network using TensorFlow/Keras
    nn_model = create_nn_model(nn_params, transformed_featured_train_set.shape[1] - 1)  # Create Neural Network model
    nn_model.fit(X_train, y_train, epochs=50, batch_size=int(nn_params['batch_size']), verbose=0)  # Train Neural Network model
    nn_preds = nn_model.predict(transformed_featured_val_set.drop('Churn', axis=1)).ravel()  # Predict on validation set with Neural Network

    # Combine predictions using weighted soft voting
    predictions = {
        'lgb': lgb_preds,
        'xgb': xgb_preds,
        'cat': cat_preds,
        'nn': nn_preds
    }  # Store predictions in a dictionary
    combined_preds = weighted_voting(predictions, weights)  # Perform weighted voting to combine predictions
    preds_digits = [1 if pred >= 0.4 else 0 for pred in combined_preds]  # Convert probabilities to binary predictions with a threshold of 0.4
    
    # Calculate evaluation metrics
    roc_auc = roc_auc_score(y_val, combined_preds)  # Calculate ROC AUC score
    f1 = f1_score(y_val, preds_digits)  # Calculate F1 score
    recall = recall_score(y_val, preds_digits)  # Calculate recall score
    accuracy = accuracy_score(y_val, preds_digits)  # Calculate accuracy score
    weighted_recall = 0.65 * recall + 0.35 * f1  # Calculate weighted recall combining recall and F1 score
    prec = precision_score(y_val, preds_digits)  # Calculate precision score
    
    # Store metrics as trial user attributes
    trial.set_user_attr('roc', roc_auc)  # Store ROC AUC score in the study
    trial.set_user_attr('f1', f1)  # Store F1 score in the study object
    trial.set_user_attr('accuracy', accuracy)  # Store accuracy score
    trial.set_user_attr('recall', recall)  # Store recall score
    trial.set_user_attr('precision', prec)  # Store precision score
    
    return weighted_recall  # Return weighted recall as the objective value for optimization

In [60]:
run_ensemble_trials = False

In [54]:

if run_ensemble_trials:
    # Create Optuna study and optimize
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5 , n_jobs =-1)

    # Best trial
    best_trial = study.best_trial
    print(f'Best metric: {best_trial.value}')
    print('Best hyperparameters and weights:', best_trial.params)
    trials = study.trials

    # Extract trial data
    data = {
        'trial_number': [trial.number for trial in trials],
        'value': [trial.value for trial in trials],
        'params': [trial.params for trial in trials],
        'datetime_start': [trial.datetime_start for trial in trials],
        'datetime_complete': [trial.datetime_complete for trial in trials],
        'f1': [trial.user_attrs.get('f1', None) for trial in trials],
        'accuracy': [trial.user_attrs.get('accuracy', None) for trial in trials],
        'roc': [trial.user_attrs.get('roc', None) for trial in trials],
        'recall': [trial.user_attrs.get('recall', None) for trial in trials],
        'precision': [trial.user_attrs.get('precision', None) for trial in trials]
        
    }

    # Convert to DataFrame
    ensemble_results_df = pd.DataFrame(data)
    
else:
    ensemble_results_df = pd.read_csv(r"../data/processed/ensemble_study.csv")
    ensemble_results_df = ensemble_results_df.drop(columns="Unnamed: 0")

[I 2024-08-29 10:31:56,538] A new study created in memory with name: no-name-0ccd6f9f-d607-43c2-a064-ac19ff9dbc40
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2024-08-29 10:32:52,130] Trial 1 finished with value: 0.4530418297455969 and parameters: {'lgb_idx': 34, 'xgb_idx': 4, 'cat_idx': 79, 'nn_idx': 23, 'lgb_weight': 0.14430690847063413, 'xgb_weight': 0.19893934753510428, 'cat_weight': 0.6449578457217432, 'nn_weight': 0.22432652495515237}. Best is trial 1 with value: 0.4530418297455969.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2024-08-29 10:33:07,148] Trial 4 finished with value: 0.5272149725274725 and parameters: {'lgb_idx': 88, 'xgb_idx': 29, 'cat_idx': 27, 'nn_idx': 27, 'lgb_weight': 0.8682825141787712, 'xgb_weight': 0.1829775645894997, 'cat_weight': 0.6873994836835665, 'nn_weight': 0.7328736277672686}. Best is trial 4 with value: 0.5272149725274725.


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


[I 2024-08-29 10:33:11,558] Trial 2 finished with value: 0.5619196428571429 and parameters: {'lgb_idx': 75, 'xgb_idx': 20, 'cat_idx': 70, 'nn_idx': 68, 'lgb_weight': 0.8638740862850363, 'xgb_weight': 0.7853652920061945, 'cat_weight': 0.6638401876054422, 'nn_weight': 0.6302199777678311}. Best is trial 2 with value: 0.5619196428571429.


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2024-08-29 10:33:18,459] Trial 0 finished with value: 0.6274205524205525 and parameters: {'lgb_idx': 95, 'xgb_idx': 74, 'cat_idx': 24, 'nn_idx': 87, 'lgb_weight': 0.7650223732630748, 'xgb_weight': 0.41515159469352747, 'cat_weight': 0.728874514092429, 'nn_weight': 0.31687785685727277}. Best is trial 0 with value: 0.6274205524205525.


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


[I 2024-08-29 10:33:19,259] Trial 3 finished with value: 0.5491712235133288 and parameters: {'lgb_idx': 75, 'xgb_idx': 48, 'cat_idx': 85, 'nn_idx': 74, 'lgb_weight': 0.6683614651814584, 'xgb_weight': 0.4295297337539553, 'cat_weight': 0.40635174015618913, 'nn_weight': 0.29029642162073444}. Best is trial 0 with value: 0.6274205524205525.


Best metric: 0.6274205524205525
Best hyperparameters and weights: {'lgb_idx': 95, 'xgb_idx': 74, 'cat_idx': 24, 'nn_idx': 87, 'lgb_weight': 0.7650223732630748, 'xgb_weight': 0.41515159469352747, 'cat_weight': 0.728874514092429, 'nn_weight': 0.31687785685727277}


In [55]:
# model's performance which has highest weighted recall
ensemble_results_df.iloc[ensemble_results_df['value'].idxmax()]

trial_number                                                         0
value                                                         0.627421
params               {'lgb_idx': 95, 'xgb_idx': 74, 'cat_idx': 24, ...
datetime_start                              2024-08-29 10:31:56.540584
datetime_complete                           2024-08-29 10:33:18.459210
f1                                                            0.598753
accuracy                                                      0.771327
roc                                                           0.830314
recall                                                        0.642857
precision                                                     0.560311
Name: 0, dtype: object

In [56]:
# model's performance which has highest recall
ensemble_results_df.iloc[ensemble_results_df['recall'].idxmax()]

trial_number                                                         0
value                                                         0.627421
params               {'lgb_idx': 95, 'xgb_idx': 74, 'cat_idx': 24, ...
datetime_start                              2024-08-29 10:31:56.540584
datetime_complete                           2024-08-29 10:33:18.459210
f1                                                            0.598753
accuracy                                                      0.771327
roc                                                           0.830314
recall                                                        0.642857
precision                                                     0.560311
Name: 0, dtype: object

In [57]:
# model's performance which has highest precision
ensemble_results_df.iloc[ensemble_results_df['precision'].idxmax()]

trial_number                                                         1
value                                                         0.453042
params               {'lgb_idx': 34, 'xgb_idx': 4, 'cat_idx': 79, '...
datetime_start                              2024-08-29 10:31:56.543494
datetime_complete                           2024-08-29 10:32:52.130023
f1                                                            0.515068
accuracy                                                      0.790284
roc                                                            0.83202
recall                                                        0.419643
precision                                                     0.666667
Name: 1, dtype: object

In [58]:
ann_model = create_nn_model(clean_hyperparameters(nn_featured_study.iloc[87].to_dict()),input_shape=transformed_featured_train_set.drop(columns='Churn').shape[1])
ann_model.fit(transformed_featured_final_train_set.drop(columns='Churn'),transformed_featured_final_train_set['Churn'],epochs=20,validation_data=(transformed_featured_test_set.drop(columns='Churn'),transformed_featured_test_set['Churn']))

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - AUC: 0.7633 - loss: 0.5030 - val_AUC: 0.8357 - val_loss: 0.4270
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8274 - loss: 0.4392 - val_AUC: 0.8384 - val_loss: 0.4263
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8564 - loss: 0.4058 - val_AUC: 0.8357 - val_loss: 0.4279
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - AUC: 0.8448 - loss: 0.4235 - val_AUC: 0.8382 - val_loss: 0.4274
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - AUC: 0.8574 - loss: 0.4042 - val_AUC: 0.8374 - val_loss: 0.4258
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - AUC: 0.8547 - loss: 0.4083 - val_AUC: 0.8339 - val_loss: 0.4295
Epoch 7/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - AUC: 0.8638

<keras.src.callbacks.history.History at 0x287dd6bd2d0>

In [59]:
#let's see final model's performance
preds = ann_model.predict(transformed_featured_test_set.drop(columns='Churn'))
print('roc_auc score is:-',roc_auc_score(transformed_featured_test_set['Churn'],preds))
preds_digits = [1 if pred >= 0.4 else 0 for pred in preds]
print('recall score is :-',recall_score(transformed_featured_test_set['Churn'],preds_digits))
print('precision score is:-',precision_score(transformed_featured_test_set['Churn'],preds_digits))

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
roc_auc score is:- 0.8271686743869421
recall score is :- 0.6577540106951871
precision score is:- 0.5747663551401869
