In [5]:
import pandas as pd

# Load the pre-split train and validation datasets from disk.
train_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/train.csv"
val_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/val.csv"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Combine train + validation for cross-validation.
# Each frame is read with pandas' default 0..n-1 row index, so a plain concat
# would keep both sets of labels and every label present in both files would
# appear twice. ignore_index=True rebuilds one clean 0..n-1 index, which keeps
# label-based access (.loc, index alignment) on the combined frame unambiguous.
full_train = pd.concat([train_df, val_df], axis=0, ignore_index=True)

# Separate the features from the "Churn" target column.
X_full = full_train.drop(columns=["Churn"])
y_full = full_train["Churn"]

print("Full training data ready for Cross Validation.")


Full training data ready for Cross Validation.


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

# Hyperparameters selected during the earlier tuning step
best_logreg_params = dict(C=0.01, solver="lbfgs", max_iter=200)

# Scaling and the classifier are chained in a single estimator, so both steps
# are fitted together whenever the pipeline is fitted
logreg_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(**best_logreg_params)),
    ]
)

# 5-fold stratified splitter, shuffled with a fixed seed for repeatable folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One F1 score per fold
cv_scores_logreg = cross_val_score(
    estimator=logreg_pipe,
    X=X_full,
    y=y_full,
    scoring="f1",
    cv=skf,
    n_jobs=-1,
)

print("Logistic Regression CV F1-scores:", cv_scores_logreg)
print("Mean F1:", cv_scores_logreg.mean())
print("Standard Deviation F1:", cv_scores_logreg.std())


Logistic Regression CV F1-scores: [0.58422939 0.55363322 0.58940397 0.59567388 0.58387097]
Mean F1: 0.5813622853595666
Standard Deviation F1: 0.014511411765628292


In [7]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters selected during the earlier tuning step
best_tree_params = dict(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    criterion="gini",
)

# Seeded decision tree configured with the tuned settings
best_tree = DecisionTreeClassifier(random_state=42, **best_tree_params)

# One F1 score per fold, using the `skf` splitter defined in an earlier cell
cv_scores_tree = cross_val_score(
    estimator=best_tree,
    X=X_full,
    y=y_full,
    scoring="f1",
    cv=skf,
    n_jobs=-1,
)

print("Decision Tree CV F1-scores:", cv_scores_tree)
print("Mean F1:", cv_scores_tree.mean())
print("Standard Deviation F1:", cv_scores_tree.std())


Decision Tree CV F1-scores: [0.515625   0.52059308 0.52       0.58751903 0.56585366]
Mean F1: 0.5419181530273305
Standard Deviation F1: 0.02925351967002681


In [8]:
# Summarise each model's cross-validated F1 once, so the printed report and the
# comparison below are guaranteed to use the same numbers.
logreg_mean, logreg_std = np.mean(cv_scores_logreg), np.std(cv_scores_logreg)
tree_mean, tree_std = np.mean(cv_scores_tree), np.std(cv_scores_tree)

print("Logistic Regression Mean F1: {:.3f}, Std: {:.3f}".format(logreg_mean, logreg_std))
print("Decision Tree Mean F1: {:.3f}, Std: {:.3f}".format(tree_mean, tree_std))

# Rank the models by mean F1. A tie goes to the Decision Tree, matching the
# original if/else.
if logreg_mean > tree_mean:
    winner, winner_std = "Logistic Regression", logreg_std
    runner_up, runner_up_std = "Decision Tree", tree_std
else:
    winner, winner_std = "Decision Tree", tree_std
    runner_up, runner_up_std = "Logistic Regression", logreg_std

# The verdict used to claim "stability" from the mean alone. Stability is the
# fold-to-fold spread, so only claim it when the winner's Std is not larger
# than the other model's; otherwise report the trade-off explicitly.
if runner_up_std < winner_std:
    print(
        f"{winner} shows better generalization (higher mean F1), "
        f"but {runner_up} is more stable (lower Std across folds)."
    )
else:
    print(f"{winner} shows better generalization and stability.")


Logistic Regression Mean F1: 0.581, Std: 0.015
Decision Tree Mean F1: 0.542, Std: 0.029
Logistic Regression shows better generalization and stability.


In [9]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import os  # imported in this cell: only needed to prepare the output directory

# Best hyperparameters from tuning
best_logreg_params = {
    "C": 0.01,
    "solver": "lbfgs",
    "max_iter": 200
}

# Pipeline with scaling + Logistic Regression
final_logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(**best_logreg_params))
])

# Train on full data (Train + Validation)
final_logreg_pipe.fit(X_full, y_full)

# Save the final trained model.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True makes this a no-op on re-runs.
model_path = "/home/danial/Data Science/Churn Prediction/Models/logreg_final.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(final_logreg_pipe, model_path)

print(f"Final Logistic Regression model trained and saved at: {model_path}")


Final Logistic Regression model trained and saved at: /home/danial/Data Science/Churn Prediction/Models/logreg_final.pkl
