In [4]:
# --- Cell 1: Load train and validation datasets for model selection ---
import os

import pandas as pd

# Data location. The default is the directory this notebook was originally run
# against; set the CHURN_DATA_DIR environment variable to point at another copy
# of the split files without editing this cell.
DATA_DIR = os.environ.get(
    "CHURN_DATA_DIR",
    "/home/danial/Data Science/Churn Prediction/Data/Splitted",
)

# Name of the label column that is separated from the features below.
TARGET_COL = "Churn"

# File paths (kept as plain strings; identical to the original values when
# CHURN_DATA_DIR is not set)
train_path = os.path.join(DATA_DIR, "train.csv")
val_path = os.path.join(DATA_DIR, "val.csv")

# Load datasets
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Fail early, with a message naming the offending split, if the target column
# is absent (the later drop() would raise a less specific KeyError).
for split_name, split_df in (("train", train_df), ("validation", val_df)):
    if TARGET_COL not in split_df.columns:
        raise KeyError(f"'{TARGET_COL}' column not found in the {split_name} split")

# Split features and target
X_train = train_df.drop(columns=[TARGET_COL])
y_train = train_df[TARGET_COL]
X_val = val_df.drop(columns=[TARGET_COL])
y_val = val_df[TARGET_COL]

print("Train shape:", X_train.shape, "Validation shape:", X_val.shape)


Train shape: (5634, 29) Validation shape: (704, 29)


In [5]:
# --- Cell 2: Train and evaluate a baseline Logistic Regression model ---
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline 1: logistic regression, constructed and fitted on the training split
# in a single expression (fit() returns the estimator itself).
# max_iter=1000 caps the solver's iterations; random_state is fixed at 42.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict labels for the validation features, then measure the fraction that
# match the true validation labels.
y_pred_val = log_reg.predict(X_val)
acc_log = accuracy_score(y_true=y_val, y_pred=y_pred_val)

print(f"Validation Accuracy (Logistic Regression): {acc_log:.4f}")


Validation Accuracy (Logistic Regression): 0.8054


In [6]:
# --- Cell 3: Train and evaluate a baseline Decision Tree model ---
from sklearn.tree import DecisionTreeClassifier

# Baseline 2: decision tree with default hyperparameters, constructed and
# fitted on the training split in a single expression (fit() returns the
# estimator itself). random_state is fixed at 42.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the validation features, then measure the fraction that
# match the true validation labels.
y_pred_val_dt = dt.predict(X_val)
acc_dt = accuracy_score(y_true=y_val, y_pred=y_pred_val_dt)

print(f"Validation Accuracy (Decision Tree): {acc_dt:.4f}")


Validation Accuracy (Decision Tree): 0.7358


In [7]:
# --- Cell 4: Train and evaluate a baseline Naive Bayes model ---
from sklearn.naive_bayes import GaussianNB

# Baseline 3: Gaussian Naive Bayes with default settings, constructed and
# fitted on the training split in a single expression (fit() returns the
# estimator itself).
nb = GaussianNB().fit(X_train, y_train)

# Predict labels for the validation features, then measure the fraction that
# match the true validation labels.
y_pred_val_nb = nb.predict(X_val)
acc_nb = accuracy_score(y_true=y_val, y_pred=y_pred_val_nb)

print(f"Validation Accuracy (Naive Bayes): {acc_nb:.4f}")


Validation Accuracy (Naive Bayes): 0.7074


In [8]:
# --- Cell 5: Compare baseline model performances and choose the best candidate ---
# Pair each baseline's display name with the validation accuracy computed for
# it in the earlier cells (insertion order: LR, tree, NB).
results = dict(
    zip(
        ("Logistic Regression", "Decision Tree", "Naive Bayes"),
        (acc_log, acc_dt, acc_nb),
    )
)

# Report every score, one model per line
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}")

# Select the name with the highest accuracy; on a tie, max() keeps the entry
# that was inserted first.
best_model = max(results.items(), key=lambda name_and_score: name_and_score[1])[0]
print(f"\nBest candidate model for next steps: {best_model}")


Logistic Regression: 0.8054
Decision Tree: 0.7358
Naive Bayes: 0.7074

Best candidate model for next steps: Logistic Regression
