In [41]:
import pandas as pd
import numpy as np

churn = pd.read_csv('output/churn_cleaned.csv')
churn.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Male
0,619,42,2,0.0,1,1,1,101348.88,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0
3,699,39,1,0.0,2,0,0,93826.63,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0


# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

X = churn.drop('Exited', axis = 1)
y = churn['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=200)

In [7]:
print('Training sample size:')
print('\t X:', X_train.shape)
print('\t y:', y_train.shape)
print('Testing sample size:')
print('\t X:', X_test.shape)
print('\t y:', y_test.shape)

Training sample size:
	 X: (6700, 9)
	 y: (6700,)
Testing sample size:
	 X: (3300, 9)
	 y: (3300,)


# Models

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

lr = LogisticRegression()
ls_res = lr.fit(X_train, y_train)
lr_score = ls_res.score(X_test, y_test)

forest = RandomForestClassifier()
forest_res = forest.fit(X_train, y_train)
forest_score = forest_res.score(X_test, y_test)

gb = GradientBoostingClassifier()
gb_res = gb.fit(X_train, y_train)
gb_score = gb_res.score(X_test, y_test)

print('Scores')
print('  Logistic Regression: \t', lr_score)
print('  Random Forest: \t', forest_score)
print('  Gradient Boosting: \t', gb_score)

Scores
  Logistic Regression: 	 0.7863636363636364
  Random Forest: 	 0.8484848484848485
  Gradient Boosting: 	 0.853030303030303


## Random Forest

### Grid Search

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=300, max_features='sqrt')

param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_features': [0.5, 1, 5, 10, 'sqrt']
}

grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Retrieve the best parameter combination
best_params = grid_search.best_params_
print(best_params)

{'max_features': 'sqrt', 'n_estimators': 100}


In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
np.random.seed(100)

forest = RandomForestClassifier(n_estimators=100, max_features='sqrt')
forest_res = forest.fit(X_train, y_train)
forest_score = forest_res.score(X_test, y_test)

print('Random Forest\n  Score: \t', forest_score)

Random Forest
  Score: 	 0.8506060606060606


## Cross Validation

### K-Fold

In [50]:
np.random.seed(30)
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest, X_train, y_train, 
                         cv = 20)

def display_scores(scores):
    print("Scores:", scores)
    print("\nMean:", scores.mean(), f"({scores.mean():.2%})")
    print("\nStandard deviation:", scores.std(), f"({scores.std():.2%})")

display_scores(scores)

Scores: [0.85970149 0.88059701 0.85373134 0.85373134 0.86567164 0.85074627
 0.84477612 0.86268657 0.83880597 0.84477612 0.85074627 0.86268657
 0.85373134 0.84776119 0.8358209  0.8358209  0.87164179 0.83283582
 0.84477612 0.85373134]

Mean: 0.8522388059701493 (85.22%)

Standard deviation: 0.012107044651148213 (1.21%)


### Repreated K-Fold

In [51]:
np.random.seed(30)
from sklearn.model_selection import cross_val_score, RepeatedKFold

cv = RepeatedKFold(n_splits=20, n_repeats=5, random_state=2)
scores = cross_val_score(forest, X_train, y_train, cv=cv)

display_scores(scores)

Scores: [0.88358209 0.85074627 0.85074627 0.86567164 0.85074627 0.85074627
 0.85074627 0.87761194 0.82686567 0.83283582 0.85671642 0.84477612
 0.88059701 0.84179104 0.85074627 0.84776119 0.86268657 0.83880597
 0.86567164 0.85373134 0.85970149 0.85970149 0.88059701 0.84179104
 0.84776119 0.84179104 0.87462687 0.84179104 0.83283582 0.84477612
 0.85970149 0.85373134 0.88059701 0.8358209  0.84776119 0.88656716
 0.87761194 0.83283582 0.80597015 0.84179104 0.83880597 0.87462687
 0.8358209  0.80895522 0.87164179 0.84179104 0.84776119 0.87164179
 0.85373134 0.85373134 0.84179104 0.86865672 0.86865672 0.88656716
 0.84776119 0.85373134 0.84477612 0.87761194 0.84477612 0.84179104
 0.86268657 0.85074627 0.81791045 0.85671642 0.85074627 0.88059701
 0.85373134 0.81492537 0.84477612 0.82686567 0.88955224 0.85970149
 0.87761194 0.86567164 0.84477612 0.85074627 0.84477612 0.85373134
 0.85373134 0.87761194 0.86567164 0.85373134 0.87462687 0.87164179
 0.83880597 0.85373134 0.83880597 0.8358209  0.8656716

## Gradient Boosting

### Grid Search

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [1, 3, 6, 10], 
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Retrieve the best parameter combination
best_params = grid_search.best_params_
print(best_params)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [54]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
np.random.seed(100)

gb = GradientBoostingClassifier(n_estimators=100, max_depth = 3, learning_rate = 0.1)
gb_res = gb.fit(X_train, y_train)
gb_score = gb_res.score(X_test, y_test)

print('Gradient Boosting \n  Score: \t', gb_score)

Gradient Boosting 
  Score: 	 0.853030303030303


## Cross Validation

### K-Fold

In [55]:
np.random.seed(30)
from sklearn.model_selection import cross_val_score

scores = cross_val_score(gb, X_train, y_train, 
                         cv = 20)

def display_scores(scores):
    print("Scores:", scores)
    print("\nMean:", scores.mean(), f"({scores.mean():.2%})")
    print("\nStandard deviation:", scores.std(), f"({scores.std():.2%})")

display_scores(scores)

Scores: [0.87462687 0.87462687 0.86865672 0.85671642 0.86567164 0.86865672
 0.87164179 0.86268657 0.85671642 0.83283582 0.85373134 0.86268657
 0.84776119 0.86567164 0.83283582 0.86865672 0.87164179 0.84776119
 0.86268657 0.87164179]

Mean: 0.8608955223880598 (86.09%)

Standard deviation: 0.012176763017535 (1.22%)


### Repreated K-Fold

In [56]:
np.random.seed(30)
from sklearn.model_selection import cross_val_score, RepeatedKFold

cv = RepeatedKFold(n_splits=20, n_repeats=5, random_state=2)
scores = cross_val_score(gb, X_train, y_train, cv=cv)

display_scores(scores)

Scores: [0.90149254 0.84477612 0.85074627 0.89253731 0.85970149 0.86567164
 0.85373134 0.87164179 0.84179104 0.85074627 0.85074627 0.85970149
 0.87761194 0.86567164 0.85671642 0.85970149 0.85373134 0.85671642
 0.86567164 0.84477612 0.87164179 0.88955224 0.88358209 0.85373134
 0.85074627 0.86268657 0.86567164 0.85671642 0.82985075 0.85671642
 0.85373134 0.88059701 0.88656716 0.85074627 0.83880597 0.88059701
 0.87761194 0.84179104 0.8119403  0.85970149 0.83283582 0.88059701
 0.83283582 0.82985075 0.88955224 0.85671642 0.86268657 0.87462687
 0.86865672 0.85671642 0.84776119 0.88059701 0.87761194 0.87761194
 0.86268657 0.87164179 0.85373134 0.85970149 0.84776119 0.83880597
 0.85373134 0.86865672 0.84179104 0.85373134 0.85671642 0.87761194
 0.86567164 0.80597015 0.85671642 0.84776119 0.89253731 0.85970149
 0.88656716 0.87462687 0.84179104 0.85074627 0.86567164 0.84776119
 0.88059701 0.88656716 0.87761194 0.84776119 0.86567164 0.87462687
 0.85074627 0.85074627 0.85074627 0.84776119 0.8746268