# 🛠️ End-to-End ML Pipeline with Scikit-learn
Predict customer churn using a production-ready ML pipeline with preprocessing, training, hyperparameter tuning, evaluation, and export.

In [None]:
# ✅ Step 0: Install Required Libraries
!pip install pandas scikit-learn joblib --quiet

In [None]:
# 📥 Step 1: Load Dataset
import pandas as pd
# Location of the Telco customer-churn CSV that the rest of the notebook uses.
url = "https://raw.githubusercontent.com/blastchar/telco-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# 'TotalCharges' can contain blank strings (e.g. ' ') instead of numbers.
# Convert on the full frame first, turning anything non-numeric into NaN, and
# only then drop incomplete rows. Doing the conversion before filtering avoids
# assigning into a filtered slice (SettingWithCopyWarning) and also catches
# blank values that are not exactly a single space.
# NOTE: other non-numeric values are dropped as well instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna()

# Last expression of the cell: displays the first rows in the notebook.
df.head()

In [None]:
# 🧹 Step 2: Define Preprocessing Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Features are every column except 'customerID' and the 'Churn' label;
# the label is encoded as 1 for 'Yes' and 0 for 'No'.
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Identify columns
# Split into "numeric" and "everything else" rather than listing exact dtypes.
# ColumnTransformer drops any column that is not assigned to a transformer, so
# a column with a dtype outside a hard-coded list (int32, float32, category,
# pandas string dtype, ...) would silently disappear from the model input.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

# Preprocessing steps
# Numeric columns: fill missing values with the column mean, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on a
# category that was not present when the encoder was fitted.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [None]:
# 🤖 Step 3: Train Models with Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps the
# proportion of churners the same in the train and test splits, so the reported
# metrics are not skewed by an unlucky split of the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Pipeline for Logistic Regression
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Pipeline for Random Forest
# A fixed random_state makes the fitted forest (and therefore the tuning and
# evaluation results) reproducible from run to run.
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# 🔍 Step 4: Hyperparameter Tuning with GridSearchCV
# Candidate hyperparameters per model. The 'classifier__' prefix addresses the
# pipeline step named 'classifier'.
param_grid_logreg = {'classifier__C': [0.1, 1.0, 10.0]}
param_grid_rf = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [None, 10, 20],
}

# Both searches share the same protocol: 5-fold cross-validation scored by accuracy.
search_settings = dict(cv=5, scoring='accuracy')
grid_logreg = GridSearchCV(logreg_pipeline, param_grid_logreg, **search_settings)
grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, **search_settings)

# Fit both models (logistic regression first, then random forest)
for search in (grid_logreg, grid_rf):
    search.fit(X_train, y_train)

# Report the winning parameter combination of each search.
print("Best Logistic Regression Parameters:", grid_logreg.best_params_)
print("Best Random Forest Parameters:", grid_rf.best_params_)

In [None]:
# 📊 Step 5: Evaluate Best Model
from sklearn.metrics import accuracy_score, f1_score

# Choose the search with the higher mean cross-validated score instead of
# hard-coding one model. Both searches are scored the same way, so their
# best_score_ values are directly comparable. The choice is made on the
# cross-validation results only, so the test set stays untouched until the
# final evaluation below. The random forest is listed first so that it is kept
# on an exact tie.
candidates = {
    "Random Forest": grid_rf,
    "Logistic Regression": grid_logreg,
}
best_name = max(candidates, key=lambda name: candidates[name].best_score_)
best_model = candidates[best_name].best_estimator_
print("Selected model:", best_name)

# Score the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# 💾 Step 6: Save Final Pipeline
import joblib
# Write the selected pipeline to disk in joblib format, then confirm the file name.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
output_path = "churn_model_pipeline.joblib"
joblib.dump(best_model, output_path)
print("Pipeline saved as 'churn_model_pipeline.joblib'")