## Telco Customer Churn - Part 4 :  Production-Ready Pipeline

In [1]:
!pip install -q -r requirements.txt

In [2]:
import imblearn
import sklearn

print(f"imblearn version: {imblearn.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

imblearn version: 0.14.0
scikit-learn version: 1.7.0


#### 0. Setup: imports, config, logger

In [4]:
import os
import pprint
import pandas as pd
from joblib import load
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from src.utils import load_config, get_logger, ensure_dir
from src.data_ingestion import DataIngestion
from src.handle_missing_values import MissingValueHandler
from src.outlier_detection import iqr_outliers, zscore_outliers, remove_outliers
from src.data_splitter import stratified_split
from src.data_pipeline import build_preprocessing_pipeline
from src.data_splitter import stratified_split
from src.model_building import build_model
from src.model_training import Trainer
from src.model_evaluation import evaluate, threshold_search, compute_business_cost
from src.model_inference import InferenceService

In [5]:
# set project root
PROJECT_ROOT = Path('.').resolve()
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'config.yaml'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Define trained models directory
TRAINED_MODELS_DIR = PROJECT_ROOT / 'models' / 'trained_models'
TRAINED_MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
# import modules from src
logger = get_logger('Part4_Maximum')
print('Imported src modules successfully')

Imported src modules successfully


In [7]:
# load config
if CONFIG_PATH.exists():
    config = load_config(str(CONFIG_PATH))
else:
    # fallback
    config = {
        'seed': 42,
        'data': {'raw_path': '/mnt/data/telco-customer-churn_cleaned.csv', 'output_dir': str(ARTIFACTS_DIR)},
        'processing': {'numeric_impute': 'median', 'categorical_impute': 'most_frequent', 'scaler': 'standard', 'encoder': 'onehot'},
        'model': {'rf': {'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 5}, 'xgb': {'n_estimators': 300, 'learning_rate': 0.05}, 'catboost': {'iterations': 300}},
        'training': {'cv_folds': 5, 'test_size': 0.2, 'scoring': 'roc_auc'},
        'business': {'cost_fp': 10.0, 'months_lost': 6}
    }
print('Using fallback config')

pprint.pprint(config)

Using fallback config
{'business': {'cost_fp': 10.0, 'months_lost': 6},
 'data': {'output_dir': 'artifacts',
          'raw_path': 'src/data/processed/telco-customer-churn_cleaned.csv'},
 'model': {'catboost': {'iterations': 300},
           'rf': {'max_depth': 12, 'min_samples_split': 5, 'n_estimators': 300},
           'xgb': {'learning_rate': 0.05, 'n_estimators': 300}},
 'processing': {'categorical_impute': 'most_frequent',
                'encoder': 'onehot',
                'numeric_impute': 'median',
                'scaler': 'standard'},
 'seed': 42,
 'training': {'cv_folds': 5, 'scoring': 'roc_auc', 'test_size': 0.2}}


### 1. Data Ingestion

In [9]:
data_path = config['data']['raw_path']
print('Loading data from', data_path)

ingestor = DataIngestion(data_path)
df = ingestor.load_csv()
print('\nLoaded DataFrame shape:', df.shape)
print('\nColumns:', df.columns.tolist()[:30])

2025-08-23 15:16:19,026 - data_ingestion - INFO - Loading data from src/data/processed/telco-customer-churn_cleaned.csv
2025-08-23 15:16:19,049 - data_ingestion - INFO - Loaded dataframe shape: (7043, 24)


Loading data from src/data/processed/telco-customer-churn_cleaned.csv

Loaded DataFrame shape: (7043, 24)

Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'tenure_group', 'services_count', 'avg_charge_per_month']


### 2. Missing Value Handling

In [10]:
print('\nMissing values before:')
print(df.isnull().sum()[lambda s: s>0])

mv_handler = MissingValueHandler(numeric_strategy=config['processing']['numeric_impute'], categorical_strategy=config['processing']['categorical_impute'])
# fit-transform on entire dataframe
mv_handler.fit(df)
df_imputed = mv_handler.transform(df)
print('\nMissing values after imputation:')
print(df_imputed.isnull().sum()[lambda s: s>0])


Missing values before:
TotalCharges            11
avg_charge_per_month    11
dtype: int64

Missing values after imputation:
Series([], dtype: int64)


### 3. Outlier detection

In [11]:
numeric_cols = df_imputed.select_dtypes(include=['number']).columns.tolist()
print('Numeric cols:', numeric_cols)

outlier_counts = {c: iqr_outliers(df_imputed[c]).sum() for c in ['tenure','MonthlyCharges','TotalCharges'] if c in df_imputed.columns}
print('\nIQR outlier counts (example):')
print(outlier_counts)

Numeric cols: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'services_count', 'avg_charge_per_month']

IQR outlier counts (example):
{'tenure': 0, 'MonthlyCharges': 0, 'TotalCharges': 0}


### 4. Feature Engineering

In [12]:
# Create engineered features
df_fe = df_imputed.copy()

In [13]:
# tenure_category
def tenure_category(t):
    if t <= 12:
        return 'New'
    elif t <= 48:
        return 'Established'
    else:
        return 'Loyal'


if 'tenure' in df_fe.columns:
    df_fe['tenure_category'] = df_fe['tenure'].apply(tenure_category)

In [14]:
# services_count
SERVICES = ['PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
service_map = {'Yes':1,'No':0,'No phone service':0,'No internet service':0}
for s in SERVICES:
    if s not in df_fe.columns:
        # if some services missing, skip
        pass
    
service_flags = df_fe[SERVICES].map(lambda x: service_map.get(x,0))
df_fe['services_count'] = service_flags.sum(axis=1)
df_fe['service_adoption_score'] = df_fe['services_count'] / len(SERVICES)

In [15]:
# avg charge per active service
if 'MonthlyCharges' in df_fe.columns:
    df_fe['avg_charge_per_service'] = df_fe['MonthlyCharges'] / df_fe['services_count'].replace(0,1)

In [16]:
# payment proxies
if 'PaymentMethod' in df_fe.columns:
    df_fe['is_electronic_check'] = df_fe['PaymentMethod'].str.contains('Electronic check', na=False).astype(int)
if 'PaperlessBilling' in df_fe.columns:
    df_fe['paperless_flag'] = df_fe['PaperlessBilling'].map({'Yes':1,'No':0}).fillna(0).astype(int)

In [17]:
# contract-tenure interaction
if 'Contract' in df_fe.columns and 'tenure_category' in df_fe.columns:
    df_fe['contract_tenure_interaction'] = df_fe['Contract'].astype(str) + '_' + df_fe['tenure_category'].astype(str)
    
print('\nFeature engineering completed. Sample new columns:')
print(df_fe[['tenure_category','services_count','service_adoption_score','avg_charge_per_service','is_electronic_check','paperless_flag']].head())


Feature engineering completed. Sample new columns:
  tenure_category  services_count  service_adoption_score  \
0             New               1                0.111111   
1     Established               3                0.333333   
2             New               3                0.333333   
3     Established               3                0.333333   
4             New               1                0.111111   

   avg_charge_per_service  is_electronic_check  paperless_flag  
0               29.850000                    1               1  
1               18.983333                    0               0  
2               17.950000                    0               1  
3               14.100000                    0               0  
4               70.700000                    1               1  


### 5. Train/Validation/Test Split (stratified)

In [18]:
# prepare X/y
if 'Churn' in df_fe.columns:
    df_fe['Churn_flag'] = df_fe['Churn'].map({'No':0,'Yes':1}).astype(int)
    y = df_fe['Churn_flag']
    X = df_fe.drop(columns=['Churn','Churn_flag'] if 'Churn' in df_fe.columns else ['Churn_flag'])
else:
    raise ValueError('Churn column missing — ensure cleaned dataset includes Churn')


X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(X, y, test_size=config['training'].get('test_size',0.2), val_size=0.2, seed=config['seed'])
print('\nShapes -> X_train:', X_train.shape, 'X_val:', X_val.shape, 'X_test:', X_test.shape)


Shapes -> X_train: (4225, 29) X_val: (1409, 29) X_test: (1409, 29)


### 6. Build Preprocessing Pipeline

In [19]:
# Identify numeric & categorical features from X_train
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object','category']).columns.tolist()
print('Numeric features:', numeric_features[:20])
print('Categorical features (sample):', categorical_features[:20])

# Create numeric transformer based on config
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy=config['processing']['numeric_impute'])),
    ('scaler', StandardScaler() if config['processing']['scaler'] == 'standard' else 
             (MinMaxScaler() if config['processing']['scaler'] == 'minmax' else RobustScaler()))
])

# Create categorical transformer with direct OneHotEncoder initialization
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy=config['processing']['categorical_impute'], fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

print('\nPreprocessor created:')
print(preprocessor)

# Demonstrate transform on a small sample
sample_transformed = preprocessor.fit_transform(X_train.iloc[:5])
print('\nTransformed sample shape:', getattr(sample_transformed, 'shape', 'unknown'))

Numeric features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'services_count', 'avg_charge_per_month', 'service_adoption_score', 'avg_charge_per_service', 'is_electronic_check', 'paperless_flag']
Categorical features (sample): ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group', 'tenure_category', 'contract_tenure_interaction']

Preprocessor created:
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['SeniorCitizen', 'tenure', 'MonthlyCharges',
                                  'TotalCharges', 'services_count

### 7. Build & Train Models

In [21]:
# Build models
rf = build_model('random_forest', config)

print('RandomForest model:', rf)
try:
    xgb = build_model('xgboost', config)
    print('XGB model:', xgb)
except Exception as e:
    print('XGBoost not available:', e)
try:
    cat = build_model('catboost', config)
    print('CatBoost model:', cat)
except Exception as e:
    print('CatBoost not available:', e)

RandomForest model: RandomForestClassifier(class_weight='balanced', max_depth=12,
                       min_samples_split=5, n_estimators=300, n_jobs=-1,
                       random_state=42)
XGB model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=-1,
              num_parallel_tree=None, ...)
CatBoost model: <

In [22]:
# Train RandomForest with Trainer (no hyperparam search for speed in demo)
artifacts_dir = str(TRAINED_MODELS_DIR)  # Use trained_models directory instead of artifacts
trainer = Trainer(preprocessor, rf, config, artifacts_dir)
rf_pipeline, _ = trainer.fit(X_train, y_train, param_distributions=None)
print('\nTrained and saved RandomForest pipeline to:', artifacts_dir)


Trained and saved RandomForest pipeline to: E:\Projects\mini-project\models\trained_models


### 8. Evaluate Models (imbalanced-aware)

In [23]:
# load trained pipeline artifact
rf_artifact_path = os.path.join(artifacts_dir, f"{type(rf).__name__}_pipeline.joblib")
print('Attempting to load', rf_artifact_path)
rf_loaded = load(rf_artifact_path)
print('Loaded pipeline:', rf_loaded)

Attempting to load E:\Projects\mini-project\models\trained_models\RandomForestClassifier_pipeline.joblib
Loaded pipeline: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['SeniorCitizen', 'tenure',
                                                   'MonthlyCharges',
                                                   'TotalCharges',
                                                   'services_count',
                                                   'avg_charge_per_month',
                                                   'service_adoption_score',
        

In [24]:
# get probs on test set
probs_rf = rf_loaded.predict_proba(X_test)[:,1]
metrics_rf = evaluate(y_test, probs_rf, threshold=0.5)
best_thresh, best_f1 = threshold_search(y_test, probs_rf)
metrics_rf['best_threshold'] = best_thresh
metrics_rf['best_f1'] = best_f1
print('\nRandomForest metrics:')
print(metrics_rf)


RandomForest metrics:
{'roc_auc': 0.8389198894313983, 'pr_auc': 0.6368315308530927, 'precision': 0.5207207207207207, 'recall': 0.7727272727272727, 'f1': 0.6221743810548978, 'confusion_matrix': array([[769, 266],
       [ 85, 289]], dtype=int64), 'best_threshold': 0.49144896231389945, 'best_f1': 0.6277836686624196}


In [25]:
# Initialize the InferenceService with the loaded model
inference_service = InferenceService(rf_artifact_path)
print("Initialized InferenceService with model from:", rf_artifact_path)

Initialized InferenceService with model from: E:\Projects\mini-project\models\trained_models\RandomForestClassifier_pipeline.joblib


In [26]:
# compute business cost at best threshold
cost_fp = config['business'].get('cost_fp',10.0)
months_lost = config['business'].get('months_lost',6)
avg_monthly = df_fe['MonthlyCharges'].mean()
cost_fn = avg_monthly * months_lost
business = compute_business_cost(y_test, probs_rf, best_thresh, cost_fp, cost_fn)
print('\nBusiness cost at best threshold:', business)


Business cost at best threshold: {'tn': 762, 'fp': 273, 'fn': 78, 'tp': 296, 'total_cost': 33038.47207156041}


### 9. Save results & Artifacts

In [27]:
perf_summary = pd.DataFrame([{
    'model':'RandomForest',
    'roc_auc':metrics_rf['roc_auc'],
    'pr_auc':metrics_rf['pr_auc'],
    'precision':metrics_rf['precision'],
    'recall':metrics_rf['recall'],
    'f1':metrics_rf['f1'],
    'best_threshold':metrics_rf['best_threshold'],
    'best_f1':metrics_rf['best_f1'],
    'artifact_path': rf_artifact_path
}])

# Create data/external directory if it doesn't exist
data_external_dir = PROJECT_ROOT / 'src' / 'data' / 'external'
os.makedirs(data_external_dir, exist_ok=True)

# Save to data/external directory
perf_summary.to_csv(data_external_dir / 'part4_performance_summary.csv', index=False)
print('Saved performance summary to', data_external_dir / 'part4_performance_summary.csv')
print('\nModel artifact saved in:', artifacts_dir)
print('File:', f"{type(rf).__name__}_pipeline.joblib")

Saved performance summary to E:\Projects\mini-project\src\data\external\part4_performance_summary.csv

Model artifact saved in: E:\Projects\mini-project\models\trained_models
File: RandomForestClassifier_pipeline.joblib


### 10. Inference Demo (single sample and batch)

In [28]:
# single-sample: take first row from X_test
single_row = X_test.iloc[[0]]
prob = inference_service.predict_proba(single_row)[0]
pred = inference_service.predict(single_row, threshold=metrics_rf['best_threshold'])[0]
print('\nSingle sample -> probability:', float(prob), 'prediction_at_best_threshold:', int(pred))

# small batch
batch_probs = inference_service.predict_proba(X_test.iloc[:10])
print('\nBatch probs (first 10):', batch_probs)


Single sample -> probability: 0.16105109932931244 prediction_at_best_threshold: 0

Batch probs (first 10): [0.1610511  0.72711156 0.27533644 0.57941758 0.17603401 0.6758816
 0.59797809 0.22466842 0.0588652  0.62657955]
