In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
df = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
# 🧹 Clean Data
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(inplace=True)
df.drop("customerID", axis=1, inplace=True)
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

In [4]:
# 🔀 Split Features and Target
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [5]:
# 🔍 Feature Types
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [6]:
# ⚙️ Preprocessing Pipelines
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])


In [8]:
# 🔁 Pipeline Base
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))  # Placeholder
])

In [9]:
# 🔧 Grid Search Parameters
param_grid = [
    {"classifier": [LogisticRegression(max_iter=1000)],
     "classifier__C": [0.1, 1.0, 10.0]},

    {"classifier": [RandomForestClassifier(random_state=42)],
     "classifier__n_estimators": [100, 200],
     "classifier__max_depth": [5, 10]}
]


In [10]:
# 📊 Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 🔍 Grid Search
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring="f1", verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"[{'classifier': [LogisticRegre...max_iter=1000)], 'classifier__C': [0.1, 1.0, ...]}, {'classifier': [RandomForestC...ndom_state=42)], 'classifier__max_depth': [5, 10], 'classifier__n_estimators': [100, 200]}]"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
# ✅ Best Model
best_model = grid_search.best_estimator_
print("✅ Best Parameters:", grid_search.best_params_)

✅ Best Parameters: {'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 10.0}


In [13]:
# 📈 Evaluate
y_pred = best_model.predict(X_test)
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))


📊 Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.62      0.52      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [14]:
# 💾 Export Model
joblib.dump(best_model, "customer_churn_pipeline.pkl")
print("✅ Pipeline saved as 'customer_churn_pipeline.pkl'")

✅ Pipeline saved as 'customer_churn_pipeline.pkl'
