In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score
import pickle
import warnings
warnings.filterwarnings('ignore')

# --- Configuration ---
# Input CSV and output pickle file names; both are bare names, so they are
# resolved relative to the current working directory.
MODEL_SAVE_PATH = "telco_churn_voting_model.pkl"
TELCO_FILE_NAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the raw CSV into `df`, the frame the following cells clean and model.
df = pd.read_csv(filepath_or_buffer=TELCO_FILE_NAME)
print("Data loaded successfully. Total Rows:", len(df))

# Bare last expression so the notebook renders a preview of the first five rows.
df.head()

Data loaded successfully. Total Rows: 7043


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
## 2.1 Handling 'TotalCharges' (String to Numeric)
# 'TotalCharges' can contain blank strings; errors='coerce' turns anything
# that cannot be parsed as a number into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

## 2.2 Missing Value Imputation
# Fill the NaNs with the column median (KNNImputer or SimpleImputer would be
# alternatives). The result is assigned back to the column rather than calling
# `df['TotalCharges'].fillna(..., inplace=True)`: that chained in-place form
# acts on a temporary object, is flagged by pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): the median is computed on the full dataset before the
# train/test split performed later in the notebook, so a small amount of
# test-set information reaches the imputation value.
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)

## 2.3 Dropping Irrelevant ID
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns='customerID', errors='ignore')

## 2.4 Target Variable Encoding
# Convert 'Churn' from Yes/No to 1/0. The dtype guard makes the step
# idempotent: mapping an already-encoded 0/1 column through the Yes/No
# dictionary would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

print("Data Cleaning complete. TotalCharges converted and Target variable encoded.")

Data Cleaning complete. TotalCharges converted and Target variable encoded.


In [7]:
# Build a detailed, interactive HTML profile of the cleaned frame and write it
# to disk (the saved file can be opened in a browser).
print("\n--- Generating YData Profile Report ---")
profile = ProfileReport(df, title="Telco Churn Detailed Profile", explorative=True)
profile.to_file("Telco_Churn_Profile_Report.html")
print("Telco_Churn_Profile_Report.html file save ho gayi hai.")

# --- Key EDA Insight (Imbalance Check) ---
# normalize=True yields the share of each target class rather than raw counts.
print("\n--- Churn Distribution ---")
churn_counts = df['Churn'].value_counts(normalize=True)
print(churn_counts)
# In the recorded run: ~26.5% churn vs ~73.5% non-churn, i.e. a notable imbalance.

# Bar chart of the raw class counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', ax=ax)
ax.set_title('Churn Distribution (0: No, 1: Yes)')
plt.show()


--- Generating YData Profile Report ---


Summarize dataset:  20%|█████▊                       | 5/25 [00:00<00:01, 19.17it/s, Describe variable: OnlineSecurity]
Summarize dataset:  52%|███████████████              | 13/25 [00:00<00:00, 24.91it/s, Describe variable: PaymentMethod][A
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 79.91it/s][A
Summarize dataset: 100%|████████████████████████████████████████████████████| 38/38 [00:02<00:00, 12.88it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.32it/s]
Export report to file: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Telco_Churn_Profile_Report.html file save ho gayi hai.

--- Churn Distribution ---
Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


In [8]:
# --- Feature Definition ---
# The encoded 'Churn' column is the target; every other column is a feature.
y = df['Churn']
X = df.drop(columns='Churn')

# Split the feature columns by how they are preprocessed.
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Everything that is not listed above is treated as categorical. That includes
# SeniorCitizen, which is stored as 0/1 but is one-hot encoded here.
categorical_features = X.drop(columns=numerical_features).columns.tolist()

# --- Preprocessing Pipeline ---
# Standardise the numerical columns.
scale_numeric = ('num', StandardScaler(), numerical_features)
# One-hot encode the categorical columns; categories unseen during fit are
# ignored at transform time, and the output is a dense array.
encode_categorical = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
# The two lists together cover every column of X, so remainder='passthrough'
# has no leftover columns to forward.
preprocessor = ColumnTransformer(
    transformers=[scale_numeric, encode_categorical],
    remainder='passthrough',
)

# --- Train-Test Split ---
# stratify=y keeps the churn proportion the same in the train and test sets.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print(f"\nTraining set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


Training set size: 5634, Test set size: 1409


In [9]:
# --- Model Definition with Class Weighting (imbalance handling) ---
# Churn is the minority class (~26.5% in the recorded run), so the estimators
# that accept it are given class_weight='balanced' to make errors on the churn
# class count for more during training.
lr = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
# GradientBoostingClassifier has no class_weight parameter, so it is left unweighted.
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Soft voting combines the members' predicted probabilities. The weights line
# up with the estimator order, so logistic regression gets 1.2 and the two tree
# ensembles 1.0 each (the author's stated rationale: LR copes well with the
# imbalance).
ensemble_members = [('lr', lr), ('rf', rf), ('gb', gb)]
voting_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[1.2, 1.0, 1.0],
)

# --- Final ML Pipeline ---
# Preprocessing and the ensemble are fitted together on the raw feature frame.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', voting_clf),
])

# --- Training ---
print("\n--- Training Voting Classifier Pipeline ---")
model_pipeline.fit(X_train, y_train)
print("Training finished.")


--- Training Voting Classifier Pipeline ---
Training finished.


In [10]:
# --- Prediction and Evaluation ---
# Predict hard class labels for the held-out test set.
y_pred = model_pipeline.predict(X_test)

print("\n--- Model Evaluation Results ---")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap: rows are actual labels, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['No Churn (0)', 'Churn (1)']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')
plt.show()

# Recall on the churn class (1) is treated as the key metric in this notebook;
# with class weighting the author's target is a value above 0.65.
churn_recall = recall_score(y_test, y_pred, pos_label=1)
print(f"\nCritical Metric: Churn Recall (Sensitivity) = {churn_recall:.4f}")


--- Model Evaluation Results ---
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1035
           1       0.60      0.63      0.62       374

    accuracy                           0.79      1409
   macro avg       0.73      0.74      0.74      1409
weighted avg       0.79      0.79      0.79      1409


Critical Metric: Churn Recall (Sensitivity) = 0.6337


In [11]:
# --- Save the Model Pipeline ---
# Bundle the fitted pipeline with the raw input column names; per the original
# author's note, a Streamlit app is meant to consume the feature names.
save_data = dict(
    model_pipeline=model_pipeline,
    feature_names=X.columns.tolist(),
)

# Serialise the bundle to MODEL_SAVE_PATH in binary mode.
with open(MODEL_SAVE_PATH, mode="wb") as model_file:
    pickle.dump(save_data, model_file)

print(f"\n✅ End-to-End ML Project Complete! Model saved as '{MODEL_SAVE_PATH}'.")
print("Ab aap is model ko use karke Streamlit UI (Telco Churn App) bana sakte hain.")


✅ End-to-End ML Project Complete! Model saved as 'telco_churn_voting_model.pkl'.
Ab aap is model ko use karke Streamlit UI (Telco Churn App) bana sakte hain.
