# Load the data 

In [None]:
# Import basic libraries

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Read the Telco customer-churn CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the dataset

df.head(n=5)

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns
df.describe(include='all')

# Data Cleaning

In [30]:
# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries into NaN
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges

# Report how many entries ended up as NaN
print(total_charges.isna().sum())

11


In [31]:
# Display the rows whose TotalCharges value is NaN
missing_total = df['TotalCharges'].isna()
df.loc[missing_total]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [32]:
# Keep only the rows that have a TotalCharges value
# (the rows removed here are usually new customers with tenure = 0)

df = df.dropna(subset=['TotalCharges'])

In [33]:
# Remove the customerID column and renumber the rows from 0.
# New frames are returned instead of mutating in place, and errors='ignore'
# tolerates an already-removed column, so the cell can be re-run without
# raising KeyError on the second execution.
df = df.drop(columns='customerID', errors='ignore').reset_index(drop=True)

df.shape

(7032, 20)

# Encode categorical variables

In [None]:
# Encode the target and the two-valued feature columns as 0/1, then one-hot
# encode the multi-class columns. The cell is written so that it can be re-run
# safely and fails loudly on labels it does not know.

# Binary columns to label encode (the target 'Churn' is handled the same way)
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling']

yes_no = {'Yes': 1, 'No': 0}
label_maps = {'Churn': yes_no, 'gender': {'Male': 1, 'Female': 0}}
label_maps.update({col: yes_no for col in binary_cols if col != 'gender'})

for col, mapping in label_maps.items():
    # A numeric column has already been encoded (e.g. the cell ran before);
    # mapping it again would replace every value with NaN, so skip it.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Series.map yields NaN for any label missing from the mapping, which would
    # otherwise go unnoticed until model training; raise instead.
    unexpected = set(df[col].dropna().unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Unexpected values in column {col!r}: {sorted(map(str, unexpected))}"
        )
    df[col] = df[col].map(mapping)

# One-hot encode multi-class categorical columns
multi_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
              'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
              'Contract', 'PaymentMethod']

# Only encode columns that are still present, so a re-run does not raise
# KeyError on columns that were already replaced by their dummy columns.
cols_to_encode = [col for col in multi_cols if col in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode)

# Check final data types and shape
df.info()


# Exploratory Data Analysis

In [None]:
sns.set_theme(style="whitegrid")

# --- Churn distribution ---------------------------------------------------
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='Churn')
plt.title('Churn Distribution')
plt.xticks([0, 1], ['No', 'Yes'])
plt.show()

# --- Churn rate by contract type ------------------------------------------
# Take the target plus every column whose name contains 'Contract', reshape to
# long form, and keep only the rows where the contract indicator is set.
contract_cols = [name for name in df.columns if 'Contract' in name]
contract_df = df[['Churn', *contract_cols]]
contract_melt = contract_df.melt(id_vars='Churn', var_name='ContractType', value_name='Value')
contract_melt = contract_melt.loc[contract_melt['Value'] == True]

plt.figure(figsize=(8, 5))
sns.barplot(data=contract_melt, x='ContractType', y='Churn', estimator=np.mean)
plt.xticks(rotation=45)
plt.title('Churn Rate by Contract Type')
plt.show()

# --- Density of continuous features, split by churn -----------------------
# Each entry: (column, figure title, x-axis label)
kde_specs = [
    ('tenure', 'Distribution of Tenure by Churn', 'Tenure (months)'),
    ('MonthlyCharges', 'Distribution of Monthly Charges by Churn', 'Monthly Charges'),
]
churn_groups = [(0, 'No Churn'), (1, 'Churn')]

for feature, fig_title, x_label in kde_specs:
    plt.figure(figsize=(8, 5))
    for churn_value, legend_label in churn_groups:
        sns.kdeplot(df.loc[df['Churn'] == churn_value, feature], label=legend_label, fill=True)
    plt.title(fig_title)
    plt.xlabel(x_label)
    plt.legend()
    plt.show()


# Handling imbalance

In [None]:
# Share of each Churn class (normalize=True returns proportions instead of counts)
df['Churn'].value_counts(normalize=True)

In [None]:
from sklearn.model_selection import train_test_split

# Target is the Churn column; features are every other column
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


# Modeling

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

# Standardize the features before fitting. The inputs mix 0/1 flags with charge
# amounts and tenure, and LogisticRegression is L2-regularized by default, so
# without scaling the penalty acts unevenly across features, the solver needs
# many more iterations, and coefficient magnitudes cannot be compared.
# The scaler is fit on the training split only to avoid leaking test statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(float))
X_test_scaled = scaler.transform(X_test.astype(float))

# Initialize model with class_weight balanced
lr_model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)

# Train (lr_model stays a plain LogisticRegression, so lr_model.coef_ lines up
# with X_train.columns in the same order)
lr_model.fit(X_train_scaled, y_train)

# Predict: hard labels and the probability of the positive class (column 1)
y_pred = lr_model.predict(X_test_scaled)
y_prob = lr_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Churn', 'Churn'])
disp.plot(cmap='Blues')
plt.title('Logistic Regression Confusion Matrix')
plt.show()


### Feature Importance

In [None]:
# Pair every training column with its fitted logistic-regression coefficient and
# rank by absolute size. Note: magnitudes are only comparable across features if
# the inputs were on a common scale when the model was fit.
coeff_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lr_model.coef_[0]})
    .assign(abs_coeff=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='abs_coeff', ascending=False)
)

# Show top 10
top_coefficients = coeff_df[['Feature', 'Coefficient']].head(10)
print(top_coefficients)


#### Business Insights:

- Incentivize customers on month-to-month plans to switch to longer contracts (e.g., discounts, loyalty perks).
- Promote security and tech support services to increase stickiness.
- Target Fiber customers with special retention offers (they’re more at risk).
- Address potential churn risk in customers using electronic checks (e.g., offer smoother auto-pay options).

## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay

# Random forest of 100 trees with a fixed seed; class_weight='balanced'
# re-weights the classes to counter the churn / no-churn imbalance.
rf_params = dict(n_estimators=100, random_state=42, class_weight='balanced')
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hard labels and positive-class probabilities (column 1) on the test split
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate
rf_report = classification_report(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_prob_rf)
print("Classification Report:\n", rf_report)
print("ROC AUC Score:", rf_auc)

# Confusion matrix plot
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(cm_rf, display_labels=['No Churn', 'Churn'])
disp_rf.plot(cmap='Greens')
plt.title('Random Forest Confusion Matrix')
plt.show()


### Feature Importance

In [None]:
# Importance score of each training column according to the fitted random forest
importances = rf_model.feature_importances_
feat_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feat_df = feat_df.sort_values(by='Importance', ascending=False)

# Show top 10
top_rf_features = feat_df.head(10)
print(top_rf_features)


#### Business Insights

- TotalCharges and tenure: Customers who have spent more historically are less likely to churn. Focus on nurturing these relationships.

- High monthly charges: Consider revisiting pricing or providing added value for high spenders.

- Contract type: Encourage long-term contracts (e.g., loyalty discounts, extra perks).

- Tech support & security services: Promote as retention tools — customers without them are more at risk.

## XGBoost

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay

# The logistic-regression and random-forest models in this notebook are trained
# with class_weight='balanced'. XGBClassifier's counterpart is scale_pos_weight,
# conventionally set to (#negative / #positive) on the training split, so all
# three models are compared with the class imbalance handled.
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
pos_weight = float(neg_count) / float(pos_count)

# Initialize model
xgb_model = xgb.XGBClassifier(random_state=42,
                              eval_metric='logloss',
                              scale_pos_weight=pos_weight)


# Train
xgb_model.fit(X_train, y_train)

# Predict: hard labels and the probability of the positive class (column 1)
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_xgb))

# Confusion matrix
ConfusionMatrixDisplay.from_estimator(xgb_model, X_test, y_test,
                                      display_labels=['No Churn', 'Churn'],
                                      cmap='Blues')
plt.title('XGBoost Confusion Matrix')
plt.show()


### Feature Importance

In [None]:
# Importance score of each training column as reported by the fitted XGBoost model
importances = xgb_model.feature_importances_
feat_df_xgb = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feat_df_xgb = feat_df_xgb.sort_values(by='Importance', ascending=False)

# Show top 10
top_xgb_features = feat_df_xgb.head(10)
print(top_xgb_features)


#### Business Insights

- Fiber optic customers churn more → consider targeted retention offers (e.g., bundles, loyalty discounts).
- Month-to-month contracts are highly risky → incentivize moving to 1- or 2-year contracts.
- Streaming service users churn more → could be due to higher bills; create custom packages or value-added offers.
- No security or support services linked to churn → upsell or bundle support/security to increase loyalty.
- Electronic check users churn more → encourage them to switch to auto-pay or easier payment methods.

In [None]:
import os

# Make sure the output folder exists so the save does not fail on a fresh checkout
os.makedirs('app/model', exist_ok=True)

# Save using XGBoost native Booster save
xgb_model.get_booster().save_model('app/model/churn_model.json')

In [None]:
import json
import os

# Column names, in training order, written next to the saved model
feature_names = X_train.columns.tolist()

# open() raises FileNotFoundError if the folder is missing, so create it first
os.makedirs('app/model', exist_ok=True)
with open('app/model/feature_names.json', 'w') as f:
    json.dump(feature_names, f)
