In [1]:
from sklearn.externals import joblib
df=joblib.load('churn_train_dataset')

In [2]:
y=df['churn'].values
df.drop('churn', inplace=True, axis=1)
X=df.values

In [3]:
#Under-sampling: Tomek links
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(ratio='majority')
X_tl, y_tl= tl.fit_sample(X, y)

In [4]:
from sklearn.model_selection import train_test_split
X_train_tomek, X_valid_tomek, y_train_tomek, y_valid_tomek = train_test_split(X_tl, y_tl, test_size=0.20, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler

In [20]:
#Cross_validating using Tomek-under_sampling
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_train_tomek, y_train_tomek, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [25]:
#Checking the classifiers' performance using Tomek-under_sampling (there is slight dip in scores compared to that of the base models)
#P.S.: but the data for these classifiers are not scaled. But the data was standardized to build the base models (regardless of which classifier was used)
for m in [GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression(), DecisionTreeClassifier()]:
    print(classifier_performance(m))

(-0.23287632803624536, GaussianNB(priors=None, var_smoothing=1e-09))




(-0.012304858967679142, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))
(-9.992007221626413e-16, AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))




(-0.02861903599071492, LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))
(-9.992007221626413e-16, DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))


In [26]:
#Now let's check the performance of the below classifiers after standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_tomek)
X_transform=scaler.transform(X_train_tomek)

In [27]:
X_transform

array([[-0.54094641, -0.61329484,  0.22956733, ..., -0.9339596 ,
        -0.67049207,  0.67049207],
       [-0.10135145,  0.27356851, -1.48420335, ...,  1.07071012,
        -0.67049207,  0.67049207],
       [-0.48686926,  0.49528435, -1.14144921, ...,  1.07071012,
        -0.67049207,  0.67049207],
       ...,
       [-0.28974929,  0.49528435, -0.1131868 , ...,  1.07071012,
         1.49144196, -1.49144196],
       [ 2.67402784,  0.49528435, -6.28276127, ...,  1.07071012,
        -0.67049207,  0.67049207],
       [ 0.6016516 ,  0.49528435,  0.57232147, ..., -0.9339596 ,
        -0.67049207,  0.67049207]])

In [28]:
def classifier_performance_knn_svc(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_transform, y_train_tomek, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [None]:
#check this link re why do we have to normalize the data for MLP classifier
#https://stackoverflow.com/questions/4674623/why-do-we-have-to-normalize-the-input-for-an-artificial-neural-network

In [29]:
for m1 in make_pipeline([KNeighborsClassifier(), SVC(probability=True), MLPClassifier()]):
    print(classifier_performance_knn_svc(m1))

(-0.4230032364841435, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]))




(-0.0285558455411545, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]))
(-0.008010566390271744, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]))


In [36]:
#Looks like there is no 'log_loss' in sklearn.metrics
from sklearn import metrics
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [30]:
#Now on to evaluating the classifiers' performance using Over-sampling: SMOTE
from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)

In [31]:
from sklearn.model_selection import train_test_split
X_train_smote, X_valid_smote, y_train_smote, y_valid_smote = train_test_split(X_sm, y_sm, test_size=0.20, random_state=42)

In [38]:
#Now let's check the performance of the below classifiers after standardizing the data
from sklearn.preprocessing import StandardScaler
scaler2 = StandardScaler()
scaler2.fit(X_train_smote)
X_smote_transform=scaler2.transform(X_train_smote)

In [43]:
#Cross_validating using Over-sampling: SMOTE and X_smote_transform for the classifiers below
def classifier_performance_smote_scaled(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_smote_transform, y_train_smote, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [44]:
#So far I have understood that we want our loss to be as close to 0 as possible. So, for example,
#if negative_log_loss=-0.9 is farther from 0 compared to neg_log_loss=-0.005. In that case, we choose the classifier with
#neg_log_loss=-0.005, meaning the classifier's predicted values are pretty close to the true values, so its log loss is
#minimal
for m1 in make_pipeline([KNeighborsClassifier(), SVC(probability=True), MLPClassifier()]):
    print(classifier_performance_smote_scaled(m1))

(-0.40994588128571047, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]))




(-0.025070306216586763, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]))
(-0.005619180252968529, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]))


In [None]:
#Now going to cross_validate using Over-sampling: SMOTE but without normalizing the data for the classifiers below:
#I stopped here cause I realized that I might have to try any sampling technique only on the training data so that way I 
#wouldn't touch the validation set. So the idea is that split the training set into train and validation sets and then try
#the sampling techniques on the training set. Here the validation set is split from the training set.
def classifier_performance_knn_svc(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X_transform, y_train_tomek, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)