<a href="https://colab.research.google.com/github/emaantech99/Auto-Tagging-Support-Tickets-Using-LLM/blob/main/Task_2_End_to_End_ML_Pipeline_with_Scikit_learn_Pipeline_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib

# --- 1. Data Loading and Initial Exploration ---

# Load the Telco customer-churn dataset from the working directory.
# To run against another copy of the data, change the path passed to pd.read_csv.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

print("--- Initial Data Overview ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\n--- First 5 Rows of the Dataset ---")
print(df.head())

# --- 2. Data Cleaning and Preprocessing ---

print("\n--- Starting Data Cleaning and Preprocessing ---")

# customerID only identifies a row; it carries no predictive signal, so drop it.
df = df.drop(columns='customerID')

# 'TotalCharges' arrives as an object column that contains spaces.
# Convert it to numbers; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many values the conversion turned into NaN.
n_missing_total_charges = df['TotalCharges'].isna().sum()
print(f"\nMissing values in TotalCharges after conversion: {n_missing_total_charges}")

# Target: 1 where Churn is 'Yes', 0 for every other value.
y = (df['Churn'] == 'Yes').astype(int)

# Features: everything except the target column.
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# --- 3. Pipeline Construction ---

print("\n--- Building Preprocessing Pipelines ---")

# Group the feature columns by dtype: object columns are handled as
# categorical, columns of any numeric dtype as numerical.
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=['object']).columns

print(f"\nNumerical features: {list(numerical_features)}")
print(f"Categorical features: {list(categorical_features)}")

# Numerical branch: replace missing values with the column median, then
# standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not present during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch inside one preprocessor.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- 4. Model Training and Hyperparameter Tuning ---


def _run_grid_search(pipeline, param_grid):
    """Fit a 5-fold, accuracy-scored grid search for *pipeline* on the training split.

    The search runs with n_jobs=-1 and the fitted GridSearchCV object is returned.
    """
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    search.fit(X_train, y_train)
    return search


# === Logistic Regression ===
print("\n--- Training and Tuning Logistic Regression Model ---")

# Full pipeline: shared preprocessing followed by the Logistic Regression classifier.
lr_classifier = LogisticRegression(solver='liblinear', random_state=42)
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier),
])

# Search over the regularization strength C and both the l1 and l2 penalties.
# The 'classifier__' prefix addresses the pipeline step named 'classifier'.
param_grid_lr = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
}

grid_search_lr = _run_grid_search(lr_pipeline, param_grid_lr)

print(f"Best parameters for Logistic Regression: {grid_search_lr.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_lr.best_score_:.4f}")

# === Random Forest ===
print("\n--- Training and Tuning Random Forest Model ---")

# Full pipeline: shared preprocessing followed by the Random Forest classifier.
rf_classifier = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# A deliberately small grid (tree count and depth limit) to keep the example quick.
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
}

grid_search_rf = _run_grid_search(rf_pipeline, param_grid_rf)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_rf.best_score_:.4f}")


# --- 5. Model Evaluation ---


def _test_metrics(model):
    """Predict on the held-out test split and return (predictions, accuracy, F1)."""
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    return predictions, accuracy, f1


print("\n--- Evaluating Models on the Test Set ---")

# Logistic Regression: score the best estimator found by its grid search.
lr_best_model = grid_search_lr.best_estimator_
y_pred_lr, accuracy_lr, f1_lr = _test_metrics(lr_best_model)

print(f"Logistic Regression Test Accuracy: {accuracy_lr:.4f}")
print(f"Logistic Regression Test F1-Score: {f1_lr:.4f}")

# Random Forest: score the best estimator found by its grid search.
rf_best_model = grid_search_rf.best_estimator_
y_pred_rf, accuracy_rf, f1_rf = _test_metrics(rf_best_model)

print(f"\nRandom Forest Test Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest Test F1-Score: {f1_rf:.4f}")


# --- 6. Exporting the Final Pipeline ---

# Pick the final model from the computed results instead of hard-coding it.
# The comparison uses each search's best mean cross-validated accuracy (the
# same criterion used for tuning), which keeps the test split out of model
# selection. Logistic Regression is kept when the two scores tie.
if grid_search_rf.best_score_ > grid_search_lr.best_score_:
    final_model_name = 'Random Forest'
    final_pipeline = grid_search_rf.best_estimator_
else:
    final_model_name = 'Logistic Regression'
    final_pipeline = grid_search_lr.best_estimator_

# Persist the whole fitted pipeline (preprocessing + classifier) so raw feature
# rows can be passed straight to predict() after loading.
pipeline_filename = 'churn_pipeline.joblib'
joblib.dump(final_pipeline, pipeline_filename)

print("\n--- Final Model Exported ---")
print(f"The trained {final_model_name} pipeline has been saved to '{pipeline_filename}'.")

# Example of how to load and use the pipeline
print("\n--- Example of Loading and Using the Pipeline ---")
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load pipeline files that you produced yourself or otherwise trust.
loaded_pipeline = joblib.load(pipeline_filename)

# Use the first test row as a sample; the 0:1 slice keeps it a one-row
# DataFrame, so the column names the preprocessor selects on are preserved.
sample_data = X_test.iloc[0:1]
prediction = loaded_pipeline.predict(sample_data)
prediction_proba = loaded_pipeline.predict_proba(sample_data)

print(f"Sample Data:\n{sample_data}")
print(f"\nPrediction (0=No Churn, 1=Churn): {prediction[0]}")
print(f"Prediction Probability ([No Churn, Churn]): {prediction_proba[0]}")

--- Initial Data Overview ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBil