In [17]:
from sklearn.externals import joblib
df=joblib.load('churn_train_dataset')

In [18]:
#Our target variable is imbalanced
import pandas as pd
df['churn'].value_counts()

1    24968
0    15032
Name: churn, dtype: int64

In [4]:
#Let's balance the imbalanced target variable using random oversampling 

# Class count
count_class_1, count_class_0 = df['churn'].value_counts()

# Divide by class
df_class_0 = df[df['churn'] == 0]
df_class_1 = df[df['churn'] == 1]

In [10]:
df_class_0.values

array([[ 5.42,  4.7 ,  5.  , ...,  0.  ,  0.  ,  1.  ],
       [ 4.28,  5.  ,  4.  , ...,  1.  ,  1.  ,  0.  ],
       [11.44,  4.6 ,  4.7 , ...,  0.  ,  0.  ,  1.  ],
       ...,
       [ 3.88,  4.8 ,  5.  , ...,  0.  ,  1.  ,  0.  ],
       [ 3.04,  5.  ,  5.  , ...,  1.  ,  1.  ,  0.  ],
       [ 4.25,  4.7 ,  5.  , ...,  0.  ,  0.  ,  1.  ]])

In [19]:
y=df['churn'].values
df.drop('churn', inplace=True, axis=1)
X=df.values

In [25]:
len(X)

40000

In [26]:
len(y)

40000

In [21]:
#Random under-sampling and over-sampling with imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_sample(X,y)


In [24]:
len(X_resampled)

30064

In [23]:
len(y_resampled)

30064

In [34]:
len(y_resampled[y_resampled==0])

15032

In [35]:
len(y_resampled[y_resampled==1])

15032

In [36]:
#The idea is to check how intentionally removing the values perform compared to random sampling
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.33, random_state=42)

In [37]:
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_train, y_train, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [39]:
def make_pipeline(m):
    from sklearn.pipeline import make_pipeline
    res=[]
    for i in m:
        res.append(make_pipeline(StandardScaler(), i))
    return res

In [40]:
#GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()
for m in make_pipeline(m=[GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()]):
    print(classifier_performance(m))

(-0.2746884498122989, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))]))




(-0.016862486504493797, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]))




(-0.029411006357432556, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]))


In [41]:
#KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()
for m1 in make_pipeline(m=[KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()]):
    print(classifier_performance(m1))

(-0.557998751504131, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]))
(-0.010156989099579636, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterio