In [1]:
from sklearn.externals import joblib
df=joblib.load('churn_train_dataset')

In [2]:
y=df['churn'].values
df.drop('churn', inplace=True, axis=1)
X=df.values

In [3]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(ratio='auto', random_state=12345)#here ratio='auto' means the same as 'not majority' which in turn means that resample all classes but the majority class;
X_smt, y_smt = smt.fit_sample(X, y)

In [4]:
len(X_smt)

49848

In [5]:
from sklearn.model_selection import train_test_split
X_train_smotetomek, X_valid_smotetomek, y_train_smotetomek, y_valid_smotetomek = train_test_split(X_smt, y_smt, test_size=0.20, random_state=42)

In [6]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

In [55]:


def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    estimator = RFC(n_estimators=n_estimators,min_samples_split=min_samples_split, max_features=max_features,random_state=2)
    cval = cross_val_score(estimator, data, targets,scoring='neg_log_loss', cv=10)
    return cval.mean()



In [56]:
def optimize_rfc(data, targets):
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        return rfc_cv(n_estimators=int(n_estimators),min_samples_split=int(min_samples_split), max_features=int(max_features),data=data,targets=targets)

    optimizer = BayesianOptimization(f=rfc_crossval,pbounds={
            "n_estimators": (200, 1500),
            "min_samples_split": (2, 25),
            "max_features": (3,7)
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(init_points=0, n_iter=5)

    return("Final result:", optimizer.max)



In [57]:

optimize_rfc(X_smt, y_smt)

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-0.009795[0m | [0m 3.766   [0m | [0m 16.31   [0m | [0m 769.0   [0m |
| [95m 2       [0m | [95m-0.000303[0m | [95m 6.069   [0m | [95m 2.351   [0m | [95m 552.8   [0m |
| [0m 3       [0m | [0m-0.004463[0m | [0m 4.279   [0m | [0m 24.76   [0m | [0m 200.1   [0m |
| [0m 4       [0m | [0m-0.000315[0m | [0m 6.221   [0m | [0m 2.059   [0m | [0m 1.499e+0[0m |
| [0m 5       [0m | [0m-0.001037[0m | [0m 5.415   [0m | [0m 2.023   [0m | [0m 200.8   [0m |
| [0m 6       [0m | [0m-0.006965[0m | [0m 3.018   [0m | [0m 2.151   [0m | [0m 394.4   [0m |


('Final result:',
 {'params': {'max_features': 6.069008407167784,
   'min_samples_split': 2.351185095311341,
   'n_estimators': 552.76937566267},
  'target': -0.00030370503790383934})

In [None]:


#'max_depth': [20, 40, 60, 80, 90, 100, 120],'max_features':[3,5,2,6,7],'min_samples_leaf': [1, 2, 3, 4],'min_samples_split': [2, 3, 4, 5, 7, 10, 15, 20],'n_estimators': [250, 300, 750, 1500, 2500, 3200]


In [None]:
#It turns out that we cannot include the following in RFC using Bayesian Optimization
#"min_samples_leaf", max_depth"

In [58]:
def random_search(m, p):
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import KFold
    kf = KFold(10, shuffle=True, random_state=42)
    #models1 = {'Lasso': Lasso(),'Elastic Net Regression': ElasticNet(),'Kernel Ridge Regression': KernelRidge(),'Gradient Boosting Regression': GradientBoostingRegressor()}

    #params1 = {'Lasso': { 'alpha': [0.0005, 0.001, 0.0003] },'Elastic Net Regression': {'alpha': [0.0005, 0.003, 0.00045] , 'l1_ratio':[.9, .5, .75]},'Kernel Ridge Regression':{ 'alpha':[0.6, 0.5, 0.35], 'kernel':'polynomial', 'degree':[2, 3], 'coef0':[0, 1.5, 2.5]},'Gradient Boosting Regression': { 'n_estimators':[3000, 2500, 1200], 'learning_rate':[0.05, 0.1, 0.004], 'max_depth':4, 'max_features':['sqrt', 'log2'],'min_samples_leaf':[15, 5, 10, 4], 'min_samples_split':[10, 5, 8], 'loss':'huber'}}
 
    rs = RandomizedSearchCV(m, p, n_iter=10, cv=kf, scoring="neg_log_loss", refit=True)
    rs.fit(X_smt, y_smt)
    return rs.best_params_

In [59]:
#Trying to optimize max_depth and min_samples_leaf of the RandomForestClassifier as I couldn't use the Bayesian Optimization
#to optimize those attributes of the classifier
#9.13 - 
from sklearn.ensemble import RandomForestClassifier
models_rf = RandomForestClassifier(random_state = 42, n_jobs=-1)
param_rf={'max_depth': [20, 40, 80, 90, 100],'min_samples_leaf': [1, 2, 3, 4]}
random_search(models_rf, param_rf)







{'max_depth': 40, 'min_samples_leaf': 1}

In [49]:
#I am going to finish optimizing the values for the list of classifiers I have and then cross-validate to verify which one
#of those is doing better

#So in that regard moving on to LogisticRegression which also seemed to be performing well while building the baseline models


from sklearn.linear_model import LogisticRegression
models_logr=LogisticRegression(tol=0.0001, random_state = 42, n_jobs=-1)
param_logr={'C': [0.001, 0.001, 0.1, 1, 100, 1000], 'solver':['newton-cg', 'liblinear']}








In [50]:
#For logistic regression the grid search should work fast, let's see

def random_search(m, p):
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import KFold
    kf = KFold(10, shuffle=True, random_state=42)
    #models1 = {'Lasso': Lasso(),'Elastic Net Regression': ElasticNet(),'Kernel Ridge Regression': KernelRidge(),'Gradient Boosting Regression': GradientBoostingRegressor()}

    #params1 = {'Lasso': { 'alpha': [0.0005, 0.001, 0.0003] },'Elastic Net Regression': {'alpha': [0.0005, 0.003, 0.00045] , 'l1_ratio':[.9, .5, .75]},'Kernel Ridge Regression':{ 'alpha':[0.6, 0.5, 0.35], 'kernel':'polynomial', 'degree':[2, 3], 'coef0':[0, 1.5, 2.5]},'Gradient Boosting Regression': { 'n_estimators':[3000, 2500, 1200], 'learning_rate':[0.05, 0.1, 0.004], 'max_depth':4, 'max_features':['sqrt', 'log2'],'min_samples_leaf':[15, 5, 10, 4], 'min_samples_split':[10, 5, 8], 'loss':'huber'}}
 
    rs = RandomizedSearchCV(m, p, n_iter=10, cv=kf, scoring="neg_log_loss", refit=True)
    rs.fit(X_smt, y_smt)
    return rs.best_params_



random_search(models_logr, param_logr)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


{'C': 1000, 'solver': 'newton-cg'}

In [8]:
#For SVC(probability=True) and MLPClassifier(), we have to normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_smt)
X_smt_transform=scaler.transform(X_smt)



In [6]:
len(X_smt_transform)

49848

In [6]:
#After scaling splitting X, y into training and validation set to verify the cross-validation result
from sklearn.model_selection import train_test_split
X_train_st, X_valid_st, y_train_st, y_valid_st = train_test_split(X_smt_transform, y_smt, test_size=0.20, random_state=42)

In [7]:
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_train_st, y_train_st, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [64]:
#svc
import numpy as np
np.logspace(-3, 2, 6)

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [None]:
{'C': np.logspace(-1, 1), 'gamma': np.logspace(-1, 2, 6)}

In [19]:
def grid_search(m, p):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold, cross_val_score, train_test_split
    kf = KFold(10, shuffle=True, random_state=42)
    #models1 = {'Lasso': Lasso(),'Elastic Net Regression': ElasticNet(),'Kernel Ridge Regression': KernelRidge(),'Gradient Boosting Regression': GradientBoostingRegressor()}

    #params1 = {'Lasso': { 'alpha': [0.0005, 0.001, 0.0003] },'Elastic Net Regression': {'alpha': [0.0005, 0.003, 0.00045] , 'l1_ratio':[.9, .5, .75]},'Kernel Ridge Regression':{ 'alpha':[0.6, 0.5, 0.35], 'kernel':'polynomial', 'degree':[2, 3], 'coef0':[0, 1.5, 2.5]},'Gradient Boosting Regression': { 'n_estimators':[3000, 2500, 1200], 'learning_rate':[0.05, 0.1, 0.004], 'max_depth':4, 'max_features':['sqrt', 'log2'],'min_samples_leaf':[15, 5, 10, 4], 'min_samples_split':[10, 5, 8], 'loss':'huber'}}
 
    gs = GridSearchCV(m, p, cv=kf, verbose=10, scoring="neg_log_loss", refit=False)
    gs.fit(X_smt_transform, y_smt)
    return gs.best_params_

In [None]:
# 7.44-7.57
#second: 10.37
from sklearn.svm import SVC
import numpy as np

model_svc=SVC(kernel='rbf', probability=True)
param_svc=({'C': np.logspace(-1, 1), 'gamma': np.logspace(-1, 1)})
grid_search(model_svc, param_svc)

Fitting 10 folds for each of 2500 candidates, totalling 25000 fits
[CV] gamma=0.001, C=0.001 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [9]:
def svc_cv(C, gamma, data, targets):
    from sklearn.svm import SVC
    estimator = SVC(C=C, gamma=gamma, random_state=2, probability=True)
    cval = cross_val_score(estimator, data, targets,scoring='neg_log_loss', cv=10)
    return cval.mean()



In [10]:
def optimize_svc(data, targets):
    def svc_crossval(expC, expGamma):
        C = 10 ** expC
        gamma = 10 ** expGamma
        return svc_cv(C=C, gamma=gamma, data=data, targets=targets)

    optimizer = BayesianOptimization(f=svc_crossval,pbounds={"expC": (-3, 2), "expGamma": (-3, 2)},random_state=1234,verbose=2) 
    optimizer.maximize(init_points=0, n_iter=5)
    return("Final result:", optimizer.max)




In [None]:
#8.13 - 8.54
#second: 9.09 - 
#third: 4:11 pm - 4.54
optimize_svc(X_smt_transform, y_smt)

|   iter    |  target   |   expC    | expGamma  |
-------------------------------------------------


In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(random_state = 42, n_jobs=-1, max_depth=40, min_samples_leaf=1, max_features=6, min_samples_split=2, n_estimators=553)

In [12]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression(tol=0.0001, random_state = 42, n_jobs=-1, C=1000, solver="newton-cg")

In [10]:
#Cross validation to check which classifier is performing better
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_train_smotetomek, y_train_smotetomek, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [11]:
#Let's check the cross validation score for RandomForestClassifier
#started: 9.41 - 9.42
classifier_performance(rfc)

(-0.0004286352697759248,
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=40, max_features=6, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=553, n_jobs=-1,
             oob_score=False, random_state=42, verbose=0, warm_start=False))

In [13]:
#Let's check the cross validation score for LogisticRegression
#started at: 9.42 - 9.43
#LogisticRegression outperformed RandomForestClassifier
classifier_performance(lr)

(-0.00038751015078396277,
 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
           penalty='l2', random_state=42, solver='newton-cg', tol=0.0001,
           verbose=0, warm_start=False))