# Predicting Customer Churn: Final Milestone
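This notebook explores customer churn prediction using the Telco dataset. It covers data cleaning, model comparison with cross-validated AUC (Logistic Regression, Random Forest, SVM), hyperparameter tuning of the Random Forest, evaluation on a held-out test set, and an exploratory visualization of monthly charges by churn status.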

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load data
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn (1).csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()


In [None]:
# Clean TotalCharges: coerce to numeric (entries that cannot be parsed become
# NaN) and drop those rows. Building the result as one chain yields a fresh
# frame rather than mutating a filtered one in place, which sidesteps
# SettingWithCopyWarning on the later column assignment.
df = (
    df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    # Drop customerID; errors='ignore' keeps the cell re-runnable once the
    # column has already been removed.
    .drop(columns='customerID', errors='ignore')
)

# Convert target to binary. Skip the mapping when Churn is already numeric
# (cell re-run), since mapping 0/1 through the Yes/No dict would yield NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# Any label other than 'Yes'/'No' maps to NaN; fail here rather than downstream.
assert df['Churn'].notna().all(), "Unexpected Churn label: expected only 'Yes' or 'No'"

# One-hot encode the remaining categorical columns, dropping the first level
# of each to avoid redundant (perfectly collinear) indicator columns.
df = pd.get_dummies(df, drop_first=True)


In [None]:
# Separate the binary target from the feature matrix.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps the
# churn ratio the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Candidate models, compared by 5-fold cross-validated ROC AUC on the training split.
#
# Logistic Regression and SVM are sensitive to feature scale, so each is wrapped
# in a pipeline that standardizes the features first. Because the scaler lives
# inside the pipeline, cross_val_score re-fits it on each training fold only, so
# no statistics leak from the validation fold.
# random_state pins the stochastic parts (forest bootstrapping, SVC probability
# calibration) so the reported scores are reproducible.
models = {
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    print(f"{name}: Mean AUC = {scores.mean():.3f}, Std = {scores.std():.3f}")


In [None]:
# Hyperparameter grid for the Random Forest (2 x 2 x 2 = 8 candidates).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}
# Seed the forest so the selected parameters and best score are reproducible
# between runs; without it, near-tied candidates can swap rank from run to run.
# n_jobs=-1 evaluates candidates/folds in parallel and does not change results.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("Best Score:", grid_rf.best_score_)
print("Best Params:", grid_rf.best_params_)


In [None]:
# Evaluate the tuned Random Forest on the held-out test split.
best_model = grid_rf.best_estimator_

# Column 1 of predict_proba is the probability of the positive class (Churn == 1).
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Threshold-based metrics use the hard predictions; AUC uses the probabilities.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print(test_confusion)
print(test_report)
print("AUC-ROC Score:", test_auc)

# ROC curve for the same model on the same test data.
RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.show()


In [None]:
# Reload the untouched CSV: by this point `df` has been one-hot encoded and its
# Churn column mapped to 0/1, so the original labels are read from the file again.
raw_data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn (1).csv")

# Stacked histogram of monthly charges, coloured by churn status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=raw_data,
    x='MonthlyCharges',
    hue='Churn',
    bins=30,
    kde=False,
    multiple='stack',
    ax=ax,
)
ax.set_title("Monthly Charges Distribution by Churn Status")
fig.tight_layout()
plt.show()


## Insights & Recommendations
- Customers with high monthly charges are more likely to churn.
- Fiber optic service and electronic check payments are associated with higher churn.
- Long-term contracts and bundling services may reduce churn risk.
- Use the model's predicted churn probabilities to rank customers and target retention campaigns at those with the highest risk.
