# Customer Churn Prediction Project
# Notebook 2: Model Building
# Goal: Clean data and train a churn prediction model


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load the dataset
df = pd.read_csv("../data/churn.csv")

# Quick check
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST100000,Male,0,No,Yes,61,Yes,Fiber optic,Two year,Bank transfer,68.29,4208.67,No
1,CUST100001,Female,0,No,No,68,Yes,Fiber optic,Two year,Electronic check,71.31,4868.98,No
2,CUST100002,Male,0,No,No,62,Yes,DSL,Two year,Electronic check,77.3,4857.57,No
3,CUST100003,Male,1,Yes,No,1,Yes,DSL,Month-to-month,Credit card,86.3,63.0,Yes
4,CUST100004,Male,0,Yes,Yes,53,Yes,DSL,Month-to-month,Credit card,26.29,1172.3,No


In [6]:
df.isnull().sum()

customerID         0
gender             0
SeniorCitizen      0
Partner            0
Dependents         0
tenure             0
PhoneService       0
InternetService    0
Contract           0
PaymentMethod      0
MonthlyCharges     0
TotalCharges       0
Churn              0
dtype: int64

In [4]:
# Fill missing TotalCharges with median
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)

In [5]:
# Convert target to binary
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [7]:
# Separate features and target
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y)


In [9]:
# Scale features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)

model.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

In [11]:
# Predict on test data
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

In [16]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print(classification_report(y_test, y_pred))

# ROC-AUC
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.753
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       753
           1       0.00      0.00      0.00       247

    accuracy                           0.75      1000
   macro avg       0.38      0.50      0.43      1000
weighted avg       0.57      0.75      0.65      1000

ROC-AUC: 0.9464866579565678


  _warn_prf(average, modifier, msg_start, len(result))
