# Training Model

In [5]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [7]:
# Apply Settings
pd.set_option("display.max_column", None)

In [8]:
# Load Data 
data = pd.read_csv(r'cleaned_data.csv')
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# BAcking up the data
df = data.copy()
# Explore
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7024 entries, 0 to 7023
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7022 non-null   object 
 1   SeniorCitizen     7024 non-null   object 
 2   Partner           7021 non-null   object 
 3   Dependents        7024 non-null   object 
 4   tenure            7024 non-null   int64  
 5   PhoneService      7022 non-null   object 
 6   MultipleLines     7024 non-null   object 
 7   InternetService   7024 non-null   object 
 8   OnlineSecurity    7024 non-null   object 
 9   OnlineBackup      7024 non-null   object 
 10  DeviceProtection  7024 non-null   object 
 11  TechSupport       7024 non-null   object 
 12  StreamingTV       7024 non-null   object 
 13  StreamingMovies   7024 non-null   object 
 14  Contract          7020 non-null   object 
 15  PaperlessBilling  7024 non-null   object 
 16  PaymentMethod     7019 non-null   object 


In [10]:
# Encode Target Column
df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})
df['Churn'].value_counts()

Churn
0    5166
1    1858
Name: count, dtype: int64

In [11]:
# Seperate Target and Feature
X = df.drop(columns = ['Churn'], axis = 1)
y = df['Churn']

num_columns = X.select_dtypes(include= 'number').columns.to_list()
cat_columns = X.select_dtypes(include = 'object').columns.to_list()

print('The Numeric Columns are: \n', num_columns)
print('The Categorical Columns are: \n', cat_columns)

The Numeric Columns are: 
 ['tenure', 'MonthlyCharges', 'TotalCharges']
The Categorical Columns are: 
 ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [18]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
# Processing the data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy= 'median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown= 'ignore', sparse_output= False)) # sparse_ouput = False is to get dense matrix due to Machine Learning
])

# Combine Steps
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_columns),
    ('cat', cat_transformer, cat_columns)
])

# Model Training

In [25]:
# Models
model_dict = {
    'Logistic_Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')
}

# Hyperparameter
# Generic search space
search_space ={
    'C': [0.1,1,10],
    'kernal': ['linear', 'rgf'],
    'n_estimators': [50,100,200],
    'max_depth': [None,5,10],
    'learning_rate': [0.5,1]
}

# Function to filter hyperparameter
def filter_hyperparameter(model, space):
    valid_keys = model.get_params().keys()
    return {k:v for k, v in space.items() if k in valid_keys}

In [36]:
# Grid Search for each model
result = []
best_pipeline = {}

for name, model in model_dict.items():
    print(f'Tuning {model}...')
    pipe = Pipeline(steps = [
        ('processor', preprocessor),
        ('model', model)
    ])
    hyperparameter = filter_hyperparameter(model, search_space)
    # Prefix model name
    param_grid = {f'model__{k}':v for k,v in hyperparameter.items()}
    grid = GridSearchCV(estimator= pipe, param_grid=param_grid, cv =5, scoring= 'accuracy', n_jobs=-1)
    grid.fit(X_train,y_train)

    y_pred = grid.predict(X_test)
    report = metrics.classification_report(y_test,y_pred, output_dict= True) # output_dict = True because the result will come in dictionary
    
    result.append({
        'model_name': name, 
        'best_parameters': param_grid,
        'accuracy': round(metrics.accuracy_score(y_test,y_pred), 4),
        'f1-score': round(report['weighted avg']['f1-score'], 4)
    })

    best_pipeline[name] = grid.best_estimator_

Tuning LogisticRegression(max_iter=1000)...
Tuning SVC(probability=True)...
Tuning RandomForestClassifier()...
Tuning AdaBoostClassifier()...
Tuning XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)...


In [37]:
print(result)

[{'model_name': 'Logistic_Regression', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.7886, 'f1-score': 0.7826}, {'model_name': 'SVM', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.7893, 'f1-score': 0.7771}, {'model_name': 'RandomForestClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10]}, 'accuracy': 0.7872, 'f1-score': 0.7783}, {'model_name': 'AdaBoostClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.7915, 'f1-score': 0.7853}, {'model_name': 'XGBoost', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.7779, 'f1-score': 0.7717}]


In [38]:
# Comapare the models
result_df = pd.DataFrame(result)
sorted_result_df = result_df.sort_values(by = 'accuracy', ascending = False)
print('\nModel Comparisons: \n', sorted_result_df)

# Best Model
best_row = sorted_result_df.iloc[0]
best_model = best_row['model_name']
print('\nBest Model:', best_model)
print('\nBest Hyperparameter:', best_row['best_parameters'])


Model Comparisons: 
                model_name                                    best_parameters  \
3      AdaBoostClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
1                     SVM                         {'model__C': [0.1, 1, 10]}   
0     Logistic_Regression                         {'model__C': [0.1, 1, 10]}   
2  RandomForestClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
4                 XGBoost  {'model__n_estimators': [50, 100, 200], 'model...   

   accuracy  f1-score  
3    0.7915    0.7853  
1    0.7893    0.7771  
0    0.7886    0.7826  
2    0.7872    0.7783  
4    0.7779    0.7717  

Best Model: AdaBoostClassifier

Best Hyperparameter: {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}


In [41]:
# Retraining best model on full data set
final_pipeline= best_pipeline[best_model]
final_pipeline.fit(X,y)

# Save pipeline
joblib.dump(final_pipeline, "churn_pipeline.pkl")
print("The deployment model is saved as: churn_pipeline.pkl")

The deployment model is saved as: churn_pipeline.pkl
