In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
import joblib

In [2]:
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("🔥 Dataset Shape:", df.shape)
print("\n📊 First 5 Rows:")
print(df.head())

print("\n🧾 Data Info (dtypes, missing values):")
print(df.info())

🔥 Dataset Shape: (7043, 21)

📊 First 5 Rows:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport Str

In [3]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors="coerce")
df.dropna(inplace=True)
df.drop('customerID',axis=1,inplace=True)
df['Churn']=df['Churn'].map({'Yes':1,'No':0})
print(df.shape)
print(df.head(3))

(7032, 20)
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes          No          No              No        One year   
2               No          No          No              No  Month-to-month   

  PaperlessBilling     PaymentMethod  MonthlyCharges  TotalCharges  Churn  
0              Yes  Electronic check           29.85      

In [4]:
x=df.drop('Churn',axis=1)
y=df['Churn']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
print(x_train.shape,x_test.shape)
print(y_train.value_counts(normalize=True).map(lambda x: f"{x:.1%}"))

(4922, 19) (2110, 19)
Churn
0    73.4%
1    26.6%
Name: proportion, dtype: object


In [9]:
numerical_features=x.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = x.select_dtypes(include=['object']).columns.tolist()
print(numerical_features)
print(categorical_features)

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [10]:
preprocessor=ColumnTransformer(
    transformers=[('num',StandardScaler(),numerical_features),
                  ('cat',OneHotEncoder(),categorical_features)],
                  remainder='passthrough'
)

In [19]:
model_pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',LogisticRegression(random_state=42,max_iter=1000))
])

#Training
model_pipeline.fit(x_train,y_train)

#Evaluate
y_pred_lr=model_pipeline.predict(x_test)
print("Random forest performance")
print(classification_report(y_test,y_pred_lr))

Random forest performance
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1549
           1       0.66      0.57      0.61       561

    accuracy                           0.81      2110
   macro avg       0.75      0.73      0.74      2110
weighted avg       0.80      0.81      0.80      2110



In [13]:
rf_pipeline=Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

#Training
rf_pipeline.fit(x_train,y_train)

#Evaluate
y_pred_rf=rf_pipeline.predict(x_test)
print("Random forest performance")
print(classification_report(y_test,y_pred_rf))

Random forest performance
              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1549
           1       0.61      0.47      0.53       561

    accuracy                           0.78      2110
   macro avg       0.72      0.68      0.69      2110
weighted avg       0.77      0.78      0.77      2110



In [20]:
#Logistic Regression
y_pred_lr=model_pipeline.predict(x_test)
y_pred_proba_lr=model_pipeline.predict_proba(x_test)[:,1]
# Random Forest
y_pred_rf = rf_pipeline.predict(x_test)
y_pred_proba_rf = rf_pipeline.predict_proba(x_test)[:, 1]

# AUC scores
auc_lr = roc_auc_score(y_test, y_pred_proba_lr)
auc_rf = roc_auc_score(y_test, y_pred_rf)

print("📊 Model Comparison")
print(f"Logistic Regression AUC: {auc_lr:.4f}, Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Random Forest AUC:      {auc_rf:.4f}, Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

📊 Model Comparison
Logistic Regression AUC: 0.8379, Accuracy: 0.8062
Random Forest AUC:      0.6796, Accuracy: 0.7791


In [23]:
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 15, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__class_weight': ['balanced', None]
}
grid = GridSearchCV(
    rf_pipeline,           # your Random Forest pipeline
    param_grid,
    cv=5,                  # 5-fold cross-validation
    scoring='roc_auc',     # optimize for AUC
    n_jobs=-1,
    verbose=1
)
grid.fit(x_train,y_train)
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation AUC:", grid.best_score_.round(4))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best Cross-Validation AUC: 0.8468


In [24]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_best = grid.best_estimator_.predict(x_test)
y_pred_proba_best = grid.best_estimator_.predict_proba(x_test)[:, 1]

print("Final Model - Classification Report:")
print(classification_report(y_test, y_pred_best))

print(f"Final Test AUC: {roc_auc_score(y_test, y_pred_proba_best):.4f}")

Final Model - Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      1549
           1       0.55      0.71      0.62       561

    accuracy                           0.77      2110
   macro avg       0.72      0.75      0.73      2110
weighted avg       0.79      0.77      0.78      2110

Final Test AUC: 0.8318


In [25]:
import joblib
joblib.dump(grid.best_estimator_, 'churn_prediction_pipeline.pkl')
print("💾 Model saved!")

💾 Model saved!
