In [1]:
from sklearn.externals import joblib
df=joblib.load('churn_train_dataset')

In [2]:
#Our target variable is imbalanced
import pandas as pd
df['churn'].value_counts()

1    24968
0    15032
Name: churn, dtype: int64

In [3]:
#Let's balance the imbalanced target variable using random oversampling 

# Class count
count_class_1, count_class_0 = df['churn'].value_counts()

# Divide by class
df_class_0 = df[df['churn'] == 0]
df_class_1 = df[df['churn'] == 1]

In [10]:
count_class_1

24968

In [12]:
#Random oversampling
import pandas as pd
df_class_0_over = df_class_0.sample(count_class_1, replace=True)


df_over=pd.concat([df_class_1, df_class_0_over], ignore_index=True)




In [14]:
len(df_class_1)

24968

In [16]:
len(df_over)

49936

In [17]:
#Splitting X and y values
y=df_over['churn'].values
df_over.drop('churn', inplace=True, axis=1)
X=df_over.values

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_train, y_train, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [21]:
def make_pipeline(m):
    from sklearn.pipeline import make_pipeline
    res=[]
    for i in m:
        res.append(make_pipeline(StandardScaler(), i))
    return res

In [22]:
#GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()
for m in make_pipeline(m=[GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()]):
    print(classifier_performance(m))

(-0.2689804196097928, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))]))




(-0.009754211043693266, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]))




(-0.02496751275360399, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]))


In [23]:
#KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()
for m1 in make_pipeline(m=[KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()]):
    print(classifier_performance(m1))

(-0.5227548358470867, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]))
(-0.006664482859105922, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criteri

In [24]:
clf_ada=AdaBoostClassifier()
clf_ada.fit(X_train, y_train)
y_pred_ada=clf_ada.predict(X_valid)

In [25]:
#AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import confusion_matrix
recallscore_ada=recall_score(y_valid, y_pred_ada, average=None)
precisionscore_ada=precision_score(y_valid, y_pred_ada, average=None)
recallscore_ada.mean(), precisionscore_ada.mean()

(1.0, 1.0)

In [26]:
confusion_matrix(y_valid, y_pred_ada)

array([[8207,    0],
       [   0, 8272]])

In [27]:
#Decision Tree Clasifier
clf_dc=DecisionTreeClassifier()
clf_dc.fit(X_train, y_train)
y_pred_dc=clf_dc.predict(X_valid)

In [28]:
#Decision Tree Clasifier
recallscore_dc=recall_score(y_valid, y_pred_dc, average=None)
precisionscore_dc=precision_score(y_valid, y_pred_dc, average=None)
recallscore_dc.mean(), precisionscore_dc.mean()

(1.0, 1.0)

In [29]:
confusion_matrix(y_valid, y_pred_dc)

array([[8207,    0],
       [   0, 8272]])