# Training Model

In [1]:
!pip install scikit-learn



In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [2]:
# Apply Settings
pd.set_option("display.max_column", None)

In [3]:
data = pd.read_csv(r'clean_data.csv')
data.head()

Unnamed: 0,Gender,Geography,Tenure,Contract,MonthlyCharges,TotalCharges,PaymentMethod,IsActiveMember,Churn
0,Male,France,14,Two-year,21.58,7933.34,Bank transfer,Yes,No
1,Female,Spain,14,Month-to-month,27.71,5869.34,Credit card,No,Yes
2,Male,Germany,57,Two-year,111.12,6321.2,Bank transfer,Yes,No
3,Male,Spain,34,Month-to-month,55.49,7956.44,Bank transfer,No,Yes
4,Male,Spain,53,Two-year,62.48,4922.75,Direct debit,Yes,No


In [4]:
# Backing up the data
df = data.copy()
# Explore data 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          1000 non-null   object 
 1   Geography       1000 non-null   object 
 2   Tenure          1000 non-null   int64  
 3   Contract        1000 non-null   object 
 4   MonthlyCharges  1000 non-null   float64
 5   TotalCharges    1000 non-null   float64
 6   PaymentMethod   1000 non-null   object 
 7   IsActiveMember  1000 non-null   object 
 8   Churn           1000 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 70.4+ KB


In [5]:
# Encode Target Columns 
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df['Churn'].value_counts()

Churn
1    502
0    498
Name: count, dtype: int64

In [6]:
# Seperate Target
X = df.drop(columns= ['Churn'], axis =1)
y = df['Churn']

num_columns = X.select_dtypes(include= 'number').columns.to_list()
cat_columns = X.select_dtypes(include= 'object').columns.to_list()

print('The Numeric columns are \n', num_columns)
print('The Categoric columns are \n', cat_columns)

The Numeric columns are 
 ['Tenure', 'MonthlyCharges', 'TotalCharges']
The Categoric columns are 
 ['Gender', 'Geography', 'Contract', 'PaymentMethod', 'IsActiveMember']


In [7]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
# Processing the data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy= 'median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown= 'ignore', sparse_output= False)) # sparse_ouput = False is to get dense matrix due to Machine Learning
])

# Combine Steps
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_columns),
    ('cat', cat_transformer, cat_columns)
])

## Model Training

In [9]:
# Models
model_dict = {
    'Logistic_Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')
}

# Hyperparameter
# Generic search space
search_space ={
    'C': [0.1,1,10],
    'kernal': ['linear', 'rgf'],
    'n_estimators': [50,100,200],
    'max_depth': [None,5,10],
    'learning_rate': [0.5,1]
}

# Function to filter hyperparameter
def filter_hyperparameter(model, space):
    valid_keys = model.get_params().keys()
    return {k:v for k, v in space.items() if k in valid_keys}

In [10]:
# Grid Search for each model
result = []
best_pipeline = {}

for name, model in model_dict.items():
    print(f'Tuning {model}...')
    pipe = Pipeline(steps = [
        ('processor', preprocessor),
        ('model', model)
    ])
    hyperparameter = filter_hyperparameter(model, search_space)
    # Prefix model name
    param_grid = {f'model__{k}':v for k,v in hyperparameter.items()}
    grid = GridSearchCV(estimator= pipe, param_grid=param_grid, cv =5, scoring= 'accuracy', n_jobs=-1)
    grid.fit(X_train,y_train)

    y_pred = grid.predict(X_test)
    report = metrics.classification_report(y_test,y_pred, output_dict= True) # output_dict = True because the result will come in dictionary
    
    result.append({
        'model_name': name, 
        'best_parameters': param_grid,
        'accuracy': round(metrics.accuracy_score(y_test,y_pred), 4),
        'f1-score': round(report['weighted avg']['f1-score'], 4)
    })

    best_pipeline[name] = grid.best_estimator_

Tuning LogisticRegression(max_iter=1000)...
Tuning SVC(probability=True)...
Tuning RandomForestClassifier()...
Tuning AdaBoostClassifier()...
Tuning XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)...


In [11]:
print(result)

[{'model_name': 'Logistic_Regression', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.49, 'f1-score': 0.49}, {'model_name': 'SVM', 'best_parameters': {'model__C': [0.1, 1, 10]}, 'accuracy': 0.53, 'f1-score': 0.5283}, {'model_name': 'RandomForestClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10]}, 'accuracy': 0.555, 'f1-score': 0.5549}, {'model_name': 'AdaBoostClassifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.525, 'f1-score': 0.5215}, {'model_name': 'XGBoost', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.505, 'f1-score': 0.5044}]


In [12]:
# Comapare the models
result_df = pd.DataFrame(result)
sorted_result_df = result_df.sort_values(by = 'accuracy', ascending = False)
print('\nModel Comparisons: \n', sorted_result_df)

# Best Model
best_row = sorted_result_df.iloc[0]
best_model = best_row['model_name']
print('\nBest Model:', best_model)
print('\nBest Hyperparameter:', best_row['best_parameters'])


Model Comparisons: 
                model_name                                    best_parameters  \
2  RandomForestClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
1                     SVM                         {'model__C': [0.1, 1, 10]}   
3      AdaBoostClassifier  {'model__n_estimators': [50, 100, 200], 'model...   
4                 XGBoost  {'model__n_estimators': [50, 100, 200], 'model...   
0     Logistic_Regression                         {'model__C': [0.1, 1, 10]}   

   accuracy  f1-score  
2     0.555    0.5549  
1     0.530    0.5283  
3     0.525    0.5215  
4     0.505    0.5044  
0     0.490    0.4900  

Best Model: RandomForestClassifier

Best Hyperparameter: {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10]}


In [13]:
# Retraining best model on full data set
final_pipeline= best_pipeline[best_model]
final_pipeline.fit(X,y)

# Save pipeline
joblib.dump(final_pipeline, "churn_pipeline.pkl")
print("The deployment model is saved as: churn_pipeline.pkl")

The deployment model is saved as: churn_pipeline.pkl
