In [1]:
import pandas as pd

# Load train and validation datasets to use all available labeled data for tuning
train_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/train.csv"
val_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/val.csv"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Combine train and validation for better hyperparameter search coverage
full_train = pd.concat([train_df, val_df], axis=0)

X_full = full_train.drop(columns=["Churn"])
y_full = full_train["Churn"]

print("Combined train and validation datasets for hyperparameter tuning.")


Combined train and validation datasets for hyperparameter tuning.


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Define search space for Logistic Regression
logreg_params = {
    "C": [0.001, 0.01, 0.1, 1, 10],
    "solver": ["lbfgs", "liblinear"],
    "max_iter": [100, 200, 500]
}

# Define search space for Decision Tree
tree_params = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

print("Defined parameter grids for Logistic Regression and Decision Tree.")


Defined parameter grids for Logistic Regression and Decision Tree.


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Build pipeline: scaling + logistic regression
logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression())
])

# Set up GridSearchCV for Logistic Regression
logreg_grid = GridSearchCV(
    estimator=logreg_pipe,
    param_grid={
        "clf__C": logreg_params["C"],
        "clf__solver": logreg_params["solver"],
        "clf__max_iter": logreg_params["max_iter"]
    },
    scoring="f1",
    cv=5,
    n_jobs=-1
)

logreg_grid.fit(X_full, y_full)

print("Best Logistic Regression Params:", logreg_grid.best_params_)


Best Logistic Regression Params: {'clf__C': 0.001, 'clf__max_iter': 100, 'clf__solver': 'liblinear'}


In [4]:
# Set up GridSearchCV for Decision Tree
tree_clf = DecisionTreeClassifier(random_state=42)

tree_grid = GridSearchCV(
    estimator=tree_clf,
    param_grid=tree_params,
    scoring="f1",
    cv=5,
    n_jobs=-1
)

tree_grid.fit(X_full, y_full)

print("Best Decision Tree Params:", tree_grid.best_params_)


Best Decision Tree Params: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [5]:
# Retrain Logistic Regression with best params on full train data
best_logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        C=logreg_grid.best_params_["clf__C"],
        solver=logreg_grid.best_params_["clf__solver"],
        max_iter=logreg_grid.best_params_["clf__max_iter"]
    ))
])
best_logreg.fit(X_full, y_full)

# Retrain Decision Tree with best params on full train data
best_tree = DecisionTreeClassifier(**tree_grid.best_params_, random_state=42)
best_tree.fit(X_full, y_full)

print("Final models retrained with best hyperparameters.")


Final models retrained with best hyperparameters.


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load test set
test_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/test.csv"
test_df = pd.read_csv(test_path)
X_test = test_df.drop(columns=["Churn"])
y_test = test_df["Churn"]

# Make predictions with tuned models
log_test_pred = best_logreg.predict(X_test)
tree_test_pred = best_tree.predict(X_test)

# Evaluate
def evaluate(y_true, y_pred, name):
    print(f"\n{name}")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1-Score :", f1_score(y_true, y_pred))
    print("ROC-AUC  :", roc_auc_score(y_true, y_pred))

evaluate(y_test, log_test_pred, "Tuned Logistic Regression - Test")
evaluate(y_test, tree_test_pred, "Tuned Decision Tree - Test")



Tuned Logistic Regression - Test
Accuracy : 0.7886524822695036
Precision: 0.5979381443298969
Recall   : 0.6203208556149733
F1-Score : 0.6089238845144357
ROC-AUC  : 0.7348708525179113

Tuned Decision Tree - Test
Accuracy : 0.750354609929078
Precision: 0.526829268292683
Recall   : 0.5775401069518716
F1-Score : 0.5510204081632653
ROC-AUC  : 0.6951407098465923
