END TO END ML PIPLINE 

Libraries For Machine Learning

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


Read dataset

In [5]:
df=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

Dataset information

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [9]:
df.shape

(7043, 21)

Target

In [12]:
target = "Churn"

In [13]:
X = df.drop(columns=[target])
y = df[target].map({"Yes": 1, "No": 0}) 

 Train-test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Preprocessing

In [15]:
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ])

Pipeline with Models

In [17]:
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

In [18]:
# Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

 Hyperparameter Tuning with GridSearchCV

In [19]:
logreg_params = {
    "classifier__C": [0.01, 0.1, 1, 10]
}

In [20]:
rf_params = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None]
}

In [21]:
# Run GridSearchCV
logreg_grid = GridSearchCV(logreg_pipeline, logreg_params,cv=5, scoring="accuracy", n_jobs=-1)
rf_grid = GridSearchCV(rf_pipeline, rf_params,cv=5, scoring="accuracy", n_jobs=-1)

# Fit both models
logreg_grid.fit(X_train, y_train)
rf_grid.fit(X_train, y_train)

Evaluate Models

In [22]:
# Logistic Regression Results
print("Best Logistic Regression Params:", logreg_grid.best_params_)
y_pred_lr = logreg_grid.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# Random Forest Results
print("Best Random Forest Params:", rf_grid.best_params_)
y_pred_rf = rf_grid.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Best Logistic Regression Params: {'classifier__C': 0.01}
Logistic Regression Accuracy: 0.7977288857345636
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.66      0.50      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Best Random Forest Params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Random Forest Accuracy: 0.7906316536550745
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



Export Best Pipeline

In [23]:
best_pipeline = logreg_grid if logreg_grid.best_score_ >= rf_grid.best_score_ else rf_grid

joblib.dump(best_pipeline.best_estimator_, "customer_churn_pipeline.pkl")
print("Pipeline saved as customer_churn_pipeline.pkl")

Pipeline saved as customer_churn_pipeline.pkl


In [None]:
# preprocessor = joblib.load("preprocessor.pkl")
# from tensorflow.keras.models import load_model
# model = load_model("housing_model.h5")
