In [1]:
## @see https://www.kaggle.com/toorkp/churn-wsdm/data

In [10]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import numpy as np
import pandas as pd
import time
import gc; gc.enable()
import time
import sys

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, f1_score, log_loss, confusion_matrix

from collections import Counter
from numpy.random import RandomState
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

pd.options.display.float_format = "{:.2f}".format
np.set_printoptions(precision=4)

import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('modules')

from shared_functions import *
import pickle
from datetime import datetime

In [4]:
# Load the pre-split train/test frames from the HDF5 store.
# NOTE(review): the absolute path is machine-specific — consider a configurable data directory.
# Opened read-only inside a context manager so the store is closed even when a
# key is missing, and so a wrong path fails instead of creating an empty file.
with pd.HDFStore('/home/dissertation/data/dfs_abt_split.h5', mode='r') as store:
    X_train, X_test, y_train, y_test = store['X_train'], store['X_test'], store['y_train'], store['y_test']

# Drop the member id and the registration-time columns (silently skipped when absent)
drop_cols = ['msno', 'registration_init_time', 'registration_init_time_dt']
X_train = X_train.drop(columns=drop_cols, errors='ignore')
X_test = X_test.drop(columns=drop_cols, errors='ignore')

print("Train Shape:", X_train.shape, y_train.shape)
# The original printed "Train Shape:" for this line as well
print("Test Shape:", X_test.shape, y_test.shape)
X_train.head()

Train Shape: (600803, 164) (600803,)
Train Shape: (257487, 164) (257487,)


Unnamed: 0,bd,city,gender,registered_via,SUM(transactions.payment_plan_days),SUM(transactions.plan_list_price),SUM(transactions.actual_amount_paid),SUM(transactions.transaction_date),SUM(transactions.membership_expire_date),SUM(transactions.price_difference),...,DIFF(MAX(transactions.planned_daily_price)),DIFF(MAX(transactions.daily_price)),DIFF(STD(transactions.payment_plan_days)),DIFF(STD(transactions.plan_list_price)),DIFF(STD(transactions.actual_amount_paid)),DIFF(STD(transactions.transaction_date)),DIFF(STD(transactions.membership_expire_date)),DIFF(STD(transactions.price_difference)),DIFF(STD(transactions.planned_daily_price)),DIFF(STD(transactions.daily_price))
551174,28,5,male,3,180,298.0,894.0,120923653,120924153,428.0,...,1.67,1.67,0.0,76.94,0.0,3890.64,3353.08,55.25,2.56,0.0
403333,20,13,male,3,127,627.0,627.0,100812365,100812869,0.0,...,1.03,1.03,9.96,42.15,71.37,-1967.03,-2345.67,-20.98,1.41,2.33
594889,18,13,male,3,540,2682.0,2682.0,362862440,362881912,0.0,...,0.0,0.0,0.0,-29.22,0.0,-371.25,-554.29,-20.98,-0.97,0.0
100379,0,1,unknown,7,547,2384.0,2682.0,382888820,383039868,214.0,...,0.0,0.0,4.9,21.64,34.18,-2480.05,344.36,9.19,0.73,1.08
207602,35,13,female,7,720,2529.0,2956.0,483713706,483807163,341.0,...,0.0,0.0,0.0,44.2,19.09,-1495.13,-544.06,38.55,1.47,0.64


In [5]:
RANDOM_STATE = 42
CV_ITER = None
# NOTE(review): `autosklearn` is not imported in the notebook's import cell;
# presumably it comes in through `from shared_functions import *` — confirm.
SCORING_METRIC = autosklearn.metrics.recall
BASE_NAME = "askbasic_{}cv_{}".format(str(CV_ITER), str(SCORING_METRIC))
N_JOBS = 2

## Training budgets, in minutes (converted to seconds when the model is built)
# TIME_PERIODS = [0.25, 0.5, 1, 2, 3, 4]
TIME_PERIODS = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

## Categorical columns of the dataset
cat_col = ['gender', 'city', 'registered_via']

## One 'Categorical' / 'Numerical' tag per column of X_train, in column order
df_cols = X_train.columns
feat_types = ['Numerical' if col not in cat_col else 'Categorical' for col in df_cols]

## One entry per time budget; each entry is the tuple
##   (ModelName, Model, HyperParams, ScoringMetric)
classifiers = []
for period in TIME_PERIODS:
    model_label = 'ASKLEARN_{}_'.format(str(period)) + BASE_NAME
    automl_model = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=int(60*period),
        n_jobs=N_JOBS,
        include_estimators=["random_forest", "decision_tree", "adaboost", "gaussian_nb",
                            "liblinear_svc", "xgradient_boosting"],
        exclude_estimators=None,
        include_preprocessors=["no_preprocessing", ],
        exclude_preprocessors=None,
        ml_memory_limit=3072*9,
    )
    classifiers.append((model_label, automl_model, {}, SCORING_METRIC))

In [6]:
# Accumulators filled in by the experiment cells below
all_results = []
all_metrics = pd.DataFrame()

In [7]:
# with open('/tmp/dfsask_all_metrics.pickle', 'rb') as f:
#     # The protocol version used is detected automatically, so we do not
#     # have to specify it.
#     all_metrics = pickle.load(f)

# with open('/tmp/dfsask_all_results.pickle', 'rb') as f:
#     # The protocol version used is detected automatically, so we do not
#     # have to specify it.
#     all_results = pickle.load(f)

In [None]:
# # Write (overwrite) the file to store the experiment results
# with open('/tmp/dfsask_all_metrics.pickle', 'wb') as f:
#     # Pickle the 'data' dictionary using the highest protocol available.
#     print("Writing results to", f.name)
#     pickle.dump(all_metrics, f, pickle.HIGHEST_PROTOCOL)
    
# # Write (overwrite) the file to store the experiment results
# with open('/tmp/dfsask_all_results.pickle', 'wb') as f:
#     # Pickle the 'data' dictionary using the highest protocol available.
#     print("Writing results to", f.name)
#     pickle.dump(all_results, f, pickle.HIGHEST_PROTOCOL)

### Experiment 1.1 Baseline - Default Settings - No sampling

In [None]:
%%time
# Experiment 1.1: run every configured classifier with no resampling
# (DummySampler) and record metrics, results and wall-clock time.
start = time.time()

sampler = ('None', DummySampler())

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

In [8]:
# Distinct sampling methods recorded so far (first occurrence of each is kept)
all_metrics.drop_duplicates(subset='sampling_method')[['sampling_method']]

Unnamed: 0,sampling_method
0,RUS 1:1
0,
0,ROS 3:1
0,ROS 2:1
0,ROS 1:1
0,ROS 3:2


In [None]:
# Number of non-null `label` entries recorded per sampling method
all_metrics.groupby(['sampling_method'])[['label']].count()

In [None]:
# Total train_time per sampling method, divided by 60 twice
# (presumably seconds -> hours; confirm the unit of train_time)
all_metrics.groupby('sampling_method')['train_time'].sum().div(60).div(60)

### Experiment 1.2 Baseline - Default Settings - Oversampled training set 100%

In [None]:
%%time
# Experiment 1.2: run every configured classifier with random over-sampling at
# the sampler's default strategy (labelled 'ROS 1:1').
start = time.time()

sampler = ('ROS 1:1', RandomOverSampler(random_state=RANDOM_STATE))

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

### Experiment 1.3 Baseline - Default Settings - Undersampled training set

In [None]:
%%time
# Experiment 1.3: run every configured classifier with random under-sampling at
# the sampler's default strategy (labelled 'RUS 1:1').
start = time.time()

sampler = ('RUS 1:1', RandomUnderSampler(random_state=RANDOM_STATE))

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

In [None]:
# Display every metric row accumulated so far.
# NOTE(review): this renders the whole frame; a .head() or a sorted slice may be easier to read.
all_metrics

In [None]:
# sys.getsizeof(model)
# model = all_results[0][2][0][-1]
# print(model.sprint_statistics())
# print(model.show_models())

### Experiment 1.4 Baseline - Default Settings - Over sampling - 33% of majority size

In [None]:
%%time
# Experiment 1.4: run every configured classifier with random over-sampling at
# sampling_strategy = 1/3 (labelled 'ROS 3:1').
start = time.time()

sampler = ('ROS 3:1', RandomOverSampler(random_state=RANDOM_STATE, sampling_strategy=1/3))

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

In [None]:
# How many metric rows each sampling method has contributed
all_metrics['sampling_method'].value_counts()

In [None]:
# Best runs first: highest balanced accuracy, ties broken by highest recall
all_metrics.sort_values(by=['balanced_accuracy', 'recall'], ascending=False).head()

### Experiment 1.5 Baseline - Default Settings - Over sampling - 66% of majority size

In [None]:
%%time
# Experiment 1.5: run every configured classifier with random over-sampling at
# sampling_strategy = 2/3 (labelled 'ROS 3:2').
start = time.time()

sampler = ('ROS 3:2', RandomOverSampler(random_state=RANDOM_STATE, sampling_strategy=2/3))

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

### Experiment 1.6 Baseline - Default Settings - Over sampling - 50% of majority size

In [None]:
%%time
# Experiment 1.6: run every configured classifier with random over-sampling at
# sampling_strategy = 1/2 (labelled 'ROS 2:1').
start = time.time()

sampler = ('ROS 2:1', RandomOverSampler(random_state=RANDOM_STATE, sampling_strategy=1/2))

exp_metrics, model_results = perform_experiment(
    X_train, X_test, y_train, y_test, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=cat_col, auto_ml=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

### Experiment 1.7 Baseline - Default Settings - SMOTE-NC

In [None]:
# Resample the training data once, up front, with SMOTE-NC.
#
# The original built the SMOTENC sampler but then passed a RandomOverSampler to
# prepare_train_test_data, so the run labelled 'SMOTE_NC' was trained on
# randomly over-sampled data. The sampler defined here is now the one used.
#
# The original categorical_features=[0,1,2,3,4,5,6,7,8,13,14] does not match the
# 164-column frame loaded above (position 0 is `bd`, positions 4+ are numeric
# aggregates), so the positions are derived from cat_col instead.
# NOTE(review): this assumes prepare_train_test_data hands the sampler a matrix
# with the same column order as X_train — confirm against shared_functions.
smote_cat_idx = [X_train.columns.get_loc(col) for col in cat_col]
sampler = ('SMOTE_NC', SMOTENC(random_state=RANDOM_STATE, categorical_features=smote_cat_idx, n_jobs=8))

X_train_t, X_test_t, y_train_t, y_test_t = \
    prepare_train_test_data(X_train, X_test, y_train, y_test,
                            sampler=sampler[1],
                            cat_col=cat_col)

PRE-SAMPLING: (600803, 164) (600803,) Counter({0: 544661, 1: 56142})


In [None]:
%%time
# Experiment 1.7: run every configured classifier on the already-prepared
# *_t frames. A DummySampler is passed together with prepare_data=False
# (presumably so perform_experiment does not resample again — confirm);
# the 'SMOTE_NC' label is only used to tag the results.
start = time.time()
sampler = ('SMOTE_NC', DummySampler())

## NOTE(review): `cols` is not referenced by any visible cell, and these names
## do not match the columns of the frame loaded above — looks like a leftover.
cols = ['is_churn', 'city', 'bd', 'registered_via', 'total_order',
       'payment_method_id_mode', 'payment_method_id_count',
       'payment_plan_days_mode', 'payment_plan_days_mean',
       'plan_list_price_mean', 'plan_lifetime_value', 'actual_amount_mean',
       'total_actual_amount', 'is_auto_renew_mode', 'cancel_times']

exp_metrics, model_results = perform_experiment(
    X_train_t, X_test_t, y_train_t, y_test_t, classifiers, sampler, 1,
    cv_iter=CV_ITER, cat_col=[], auto_ml=False,
    n_jobs=4, prepare_data=False)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the incoming row
# index just as append did. Assumes exp_metrics is a DataFrame — TODO confirm.
all_metrics = pd.concat([all_metrics, exp_metrics])
all_results.append((sampler[0] + "_" + BASE_NAME, exp_metrics, model_results))

log("basic, autosklearn, {0}, {1}".format(sampler[0], time.time() - start), 'Basic_Autosklearn.log')

In [None]:
# The fifteen runs with the highest recall
all_metrics.sort_values(by='recall', ascending=False).iloc[:15]

In [None]:
#store(all_results, 'Asklearn_Default_Undersample.pickle')

# Persist the accumulated experiment results, replacing any existing file
with open('Manual_Asklearn_20190813.pickle', 'wb') as out_file:
    print("Writing results to", out_file.name)
    # Highest protocol available in this interpreter
    pickle.dump(all_results, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## Just try an SVM 
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [None]:
%%time
# Train a single SGDClassifier through the train_model helper, requesting the 'under' sampling method.
# NOTE(review): `base_dataset` and `train_model` are not defined in any visible cell —
# presumably they come from shared_functions or an earlier session; confirm before re-running.
# NOTE(review): loss='log' was renamed to 'log_loss' in newer scikit-learn releases — check the installed version.
model_train_results = train_model(base_dataset, sampling_method = 'under', classifiers = [('SGDClassifier', SGDClassifier(loss='log'))])

In [None]:
## Print the ten largest feature importances of `model`
## NOTE(review): `model` is only assigned by a loop in a later cell — confirm execution order.

# Feature positions ranked from most to least important
feature_index = np.argsort(model.feature_importances_)[::-1]
column_names = X_test.columns
ordered_features = []

for idx in feature_index[:10]:
    feature_name = column_names[idx]
    print(np.round(model.feature_importances_[idx], 3), ' --> ', feature_name)
    ordered_features.append(feature_name)

In [None]:
## Print the permutation importance of `model` on a held-out split

import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): this rebinds X_test / y_test, replacing the frames loaded from the HDF5 store;
# `model_dataset` and `prepare_train_test_split` are not defined in any visible cell — confirm their origin.
_, X_test, _, y_test = prepare_train_test_split(model_dataset, 0)

# Fit the permutation-importance wrapper around the already-trained model (fixed seed)
perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)

# Render the importances, labelled with the test frame's column names
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

In [None]:
#model_name, model = model_train_results[1][2]

# One precision-recall curve per (name, model) pair in model_train_results[1].
# `model_name`, `model`, `probs` and `pr_data` keep the last iteration's values.
for model_name, model in model_train_results[1]:
    # Second column of predict_proba
    probs = model.predict_proba(X_test)[:, 1]
    pr_data = plot_precision_recall(y_test, probs,
                                    title='PR Curve for {0}'.format(model_name))

In [None]:
# Precision-recall curve for whichever `model` / `model_name` is currently bound
probs = model.predict_proba(X_test)[:, 1]
pr_data = plot_precision_recall(y_test, probs,
                                title='PR Curve for {0}'.format(model_name))

In [None]:
# ROC curve for the current model's predicted probabilities.
# The result gets its own name: the original assigned it to `pr_data`, which
# overwrote the precision-recall data that a later cell filters by 'precision'.
roc_data = plot_roc(
    y_test, probs, title='ROC Curve for {0}'.format(model_name))

In [None]:
# List the matplotlib style sheets available in this environment
print(plt.style.available)

In [None]:
# First five rows of the modelling dataset
model_dataset.iloc[:5]

In [None]:
# Class counts of the target before any resampling
print("Original Data distribution", model_dataset['is_churn'].value_counts(), sep="\n")

In [None]:
# Class counts after under-sampling, printed and drawn as a bar chart
print('Sci-Kit Learn : resample : Down Sampled data set')
train_downsample = undersampled_dataset(model_dataset, 'is_churn')

downsample_counts = train_downsample['is_churn'].value_counts()
print(downsample_counts)
print("Num records = ", len(train_downsample))
downsample_counts.plot(kind='bar', title='Count (target)')

In [None]:
# Class counts after over-sampling, printed and drawn as a bar chart
print('Sci-Kit Learn : resample : Up Sampled data set')
train_upsample = oversampled_dataset(model_dataset, 'is_churn')

upsample_counts = train_upsample['is_churn'].value_counts()
print(upsample_counts)
print("Num records = ", len(train_upsample))
upsample_counts.plot(kind='bar', title='Count (target)')

In [None]:
# Precision-recall curve for the currently bound `model`, titled as Random Forest
probs = model.predict_proba(X_test)[:, 1]
pr_data = plot_precision_recall(y_test, probs,
                                title='Precision-Recall Curve for Random Forest')

In [None]:
# Rows whose precision is at least 0.25, ordered by descending recall
precision_above = (
    pr_data.loc[pr_data['precision'] >= 0.25]
    .sort_values(by='recall', ascending=False)
)
precision_above.head()

In [None]:
threshold_required = 0.5
# 1.0 where the predicted probability reaches the threshold, 0.0 elsewhere
preds = np.where(probs >= threshold_required, 1.0, 0.0)

# Build the confusion matrix and plot it
cm = confusion_matrix(y_test, preds)
plot_confusion_matrix(cm,
                      classes=['No Churn', 'Churn'],
                      title='Churn Confusion Matrix')

In [None]:
# First five rows of the modelling dataset
model_dataset.iloc[:5]

In [None]:
# Importance of each feature, indexed by every column of model_dataset except
# the first, sorted from most to least important
fi = (
    pd.DataFrame({'importance': model.feature_importances_},
                 index=model_dataset.columns[1:])
    .sort_values(by='importance', ascending=False)
)
fi.head(10)