In [88]:
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score

In [89]:
data_train = pd.read_csv("../data/processed/train.csv")
data_test = pd.read_csv("../data/processed/test.csv")
xtrain = data_train.drop('Churn', axis=1)
ytrain = data_train['Churn']
xtest = data_test.drop('Churn', axis=1)
ytest = data_test['Churn']

In [90]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(xtrain, ytrain)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [91]:
svc_model = SVC(kernel='rbf', class_weight='balanced')
svc_model.fit(xtrain, ytrain)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [92]:
xgb_model = XGBClassifier()
xgb_model.fit(xtrain, ytrain)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [93]:
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42)
lgbm_model.fit(xtrain, ytrain)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [94]:
def evaluate_model(model, xtest, ytest):
    ypred = model.predict(xtest)
    cm = confusion_matrix(ytest, ypred)
    tn, fp, fn, tp = cm.ravel()
    results = {
        'Confusion Matrix': cm,
        'True Positive': tp,
        'True Negative': tn,
        'False Positive': fp,
        'False Negative': fn,
        'Accuracy': accuracy_score(ytest, ypred),
        'Precision': precision_score(ytest, ypred),
        'F1 Score': f1_score(ytest, ypred),
        'Recall': recall_score(ytest, ypred)
    }
    return results

results = {
    "Random Forest": evaluate_model(rf_model, xtest, ytest),
    "SVC": evaluate_model(svc_model, xtest, ytest),
    "XGBoost": evaluate_model(xgb_model, xtest, ytest),
    "LightGBM": evaluate_model(lgbm_model, xtest, ytest)
}

summary = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'F1 Score', 'Recall'])
row = []
for model, result in results.items():
    row.append({
        'Model': model,
        'Accuracy': f"{result['Accuracy']:.2f}",
        'Precision': f"{result['Precision']:.2f}",
        'F1 Score': f"{result['F1 Score']:.2f}",
        'Recall': f"{result['Recall']:.2f}"
    })
summary = pd.DataFrame(row)
summary

Unnamed: 0,Model,Accuracy,Precision,F1 Score,Recall
0,Random Forest,0.78,0.59,0.54,0.49
1,SVC,0.72,0.49,0.6,0.78
2,XGBoost,0.77,0.58,0.54,0.52
3,LightGBM,0.79,0.62,0.57,0.52


In [95]:
def objective(trial):
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "class_weight": "balanced"
    }

    model = LGBMClassifier(**param)
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    accuracy = accuracy_score(ytest, preds)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"Best Score: {study.best_value:.2f}")
print(f"Best Params: {study.best_params}")

[I 2025-08-16 18:03:48,176] A new study created in memory with name: no-name-0b6ce366-ff07-40a4-87ef-71a361ea4295
[I 2025-08-16 18:03:48,803] Trial 0 finished with value: 0.7412935323383084 and parameters: {'lambda_l1': 0.003208081306745122, 'lambda_l2': 1.5623336895632847e-08, 'num_leaves': 137, 'feature_fraction': 0.9920478547573967, 'bagging_fraction': 0.700875883201283, 'bagging_freq': 1, 'min_child_samples': 81}. Best is trial 0 with value: 0.7412935323383084.
[I 2025-08-16 18:03:51,189] Trial 1 finished with value: 0.7604832977967306 and parameters: {'lambda_l1': 1.9586679323187948e-08, 'lambda_l2': 1.4718600307354756e-05, 'num_leaves': 200, 'feature_fraction': 0.508661736585823, 'bagging_fraction': 0.8483668427766801, 'bagging_freq': 4, 'min_child_samples': 11}. Best is trial 1 with value: 0.7604832977967306.
[I 2025-08-16 18:03:51,257] Trial 2 finished with value: 0.7277896233120114 and parameters: {'lambda_l1': 0.15429988634377984, 'lambda_l2': 0.27143322904927975, 'num_leaves

Best Score: 0.77
Best Params: {'lambda_l1': 1.7645672489453802e-08, 'lambda_l2': 1.31432629783643e-06, 'num_leaves': 200, 'feature_fraction': 0.7279713486652487, 'bagging_fraction': 0.8128867322497486, 'bagging_freq': 2, 'min_child_samples': 5}


In [97]:
import joblib
joblib.dump(study, "../models/model.pkl")

['../models/model.pkl']