# KKBox Customer Churn Prediction
### w/ BigQuery and Apache Spark

---

# Part III: <font color=green>*Model Creation and Evaluation*</font>

---

In [1]:
# General Imports
from __future__ import absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Imports for PySpark
import findspark
findspark.init('C:\spark\spark-2.4.4-bin-hadoop2.7')
# import pyspark
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext

# # Imports for BigQuery connection
# import json
# import pprint
# import subprocess

# # Imports for GCP
# from google.cloud import bigquery
import time 
# import gcsfs

# Imports for Spark ML
from pyspark.ml.feature import (VectorAssembler,StandardScaler, OneHotEncoderEstimator, OneHotEncoder)
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, Evaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
## Dataproc Specs

# Jupyter Initialization: gs://srcd-dataproc/jupyter.sh 
# Components Installed: Anaconda and Jupyter
# Master Node:   x1 - 4 vCPU w/ 15 GB RAM each
# Workers Nodes: x5 - 4 vCPU w/ 15 GB RAM each
# Disk: 100GB

## Create Spark Session and Import Data

In [4]:
# Specify Google Credentials
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='D:\OneDrive\J-5\GitHub\Google Credentials.json'

In [5]:
from pyspark.sql import SparkSession

# spark = SparkSession.Builder().config(conf=SparkConf().setMaster("local[*]")).getOrCreate()

# Instantiate BigQuery magic
# %load_ext google.cloud.bigquery

In [6]:
# If Working Locally on Computer, Importing Data Locally#

# Import DRV_Jan2016 (Train Set) 
DRV_Jan2016_1to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_1to1_clust',inferSchema=True,header=True)
DRV_Jan2016_3to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_3to1_clust',inferSchema=True,header=True)
DRV_Jan2016_5to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_5to1_clust',inferSchema=True,header=True)
DRV_Jan2016_7to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_7to1_clust',inferSchema=True,header=True)
DRV_Jan2016_9to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_9to1_clust',inferSchema=True,header=True)
DRV_Jan2016_11to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_11to1_clust',inferSchema=True,header=True)
DRV_Jan2016_13to1 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_13to1_clust',inferSchema=True,header=True)

DRV_Jan2016 = spark.read.csv('D:\J-5 Local\DRV_Jan2016_With_Cluster',inferSchema=True,header=True)

# Import DRV_Feb2016 (Validation Set) 
DRV_Feb2016 = spark.read.csv('D:\J-5 Local\DRV_Feb2016_With_Cluster',inferSchema=True,header=True)


In [None]:
# ## If Working Locally on Computer, Importing Data from GCS ##

# # Import DRV_Jan2016 (Train Set) from Google Cloud Storage via Pandas
# DRV_Jan2016_Balanced_1 = spark.createDataFrame(pd.read_csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_Balanced_1'))
# DRV_Jan2016_Balanced_2 = spark.createDataFrame(pd.read_csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_Balanced_2'))
# DRV_Jan2016_Balanced_3 = spark.createDataFrame(pd.read_csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_Balanced_3'))

# # Import DRV_Feb2016 (Validation Set) from Google Cloud Storage via Pandas
# DRV_Feb2016 = spark.createDataFrame(pd.read_csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016'))

# # Import DRV_Mar2016 (Test Set) from Google Cloud Storage via Pandas
# DRV_Mar2016 = spark.createDataFrame(pd.read_csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Mar2016'))

In [None]:
# If Working on Dataproc Cloud ##

# Import DRV_Jan2016 (Train Set) 
DRV_Jan2016_1to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_1to1',inferSchema=True,header=True)
DRV_Jan2016_3to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_3to1',inferSchema=True,header=True)
DRV_Jan2016_5to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_5to1',inferSchema=True,header=True)
DRV_Jan2016_7to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_7to1',inferSchema=True,header=True)
DRV_Jan2016_9to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_9to1',inferSchema=True,header=True)
DRV_Jan2016_11to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_11to1',inferSchema=True,header=True)
DRV_Jan2016_13to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_13to1',inferSchema=True,header=True)

DRV_Jan20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000000',inferSchema=True,header=True)
DRV_Jan20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000001',inferSchema=True,header=True)
DRV_Jan20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000002',inferSchema=True,header=True)

DRV_Jan2016 = DRV_Jan20160.union(DRV_Jan20161)
DRV_Jan2016 = DRV_Jan2016.union(DRV_Jan20162)

DRV_Jan20160 = None
DRV_Jan20161 = None
DRV_Jan20162 = None

# Import DRV_Feb2016 (Validation Set) 
DRV_Feb20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000000',inferSchema=True,header=True)
DRV_Feb20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000001',inferSchema=True,header=True)
DRV_Feb20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000002',inferSchema=True,header=True)

DRV_Feb2016 = DRV_Feb20160.union(DRV_Feb20161)
DRV_Feb2016 = DRV_Feb2016.union(DRV_Feb20162)

DRV_Feb20160 = None
DRV_Feb20161 = None
DRV_Feb20162 = None

***Cast Correct Column Types on All Sets***

In [None]:
column_types_pd = [('msno', 'STRING'),
 ('membership_expire_date', 'DATE'),
 ('payment_method_id', 'INT64'),
 ('payment_plan_days', 'INT64'),
 ('plan_list_price', 'INT64'),
 ('net_paid_amount', 'INT64'),
 ('is_net_paid_amount', 'STRING'),
 ('is_auto_renew', 'INT64'),
 ('city', 'INT64'),
 ('bd', 'INT64'),
 ('registered_via', 'INT64'),
 ('registration_init_time', 'DATE'),
 ('membership_length', 'INT64'),
 ('is_churn', 'FLOAT64'),
 ('total_songs', 'INT64'),
 ('total_logins', 'INT64'),
 ('total_secs', 'FLOAT64'),
 ('sum_num_unq', 'INT64'),
 ('sum_num_repeat', 'INT64'),
 ('sum_over_50pec', 'INT64'),
 ('sum_over_75pec', 'INT64'),
 ('sum_over_985pec', 'INT64'),
 ('total_transactions', 'INT64'),
 ('total_spent', 'FLOAT64'),
 ('avg_spent_trans', 'FLOAT64'),
 ('spent_per_logins', 'FLOAT64'),
 ('spent_per_secs', 'FLOAT64'),
 ('spent_per_song', 'FLOAT64'),
 ('spent_per_num_unq', 'FLOAT64'),
 ('spent_per_num_repeats', 'FLOAT64'),
 ('never_active_subscriber', 'FLOAT64'),
 ('total_spent_zero', 'FLOAT64'),
 ('city_agg', 'INT64'),
 ('payment_method_agg', 'INT64'),
 ('songs_last_7', 'FLOAT64'),
 ('songs_last_7_AVG', 'FLOAT64'),
 ('logins_last_7', 'FLOAT64'),
 ('logins_last_7_AVG', 'FLOAT64'),
 ('total_secs_last_7', 'FLOAT64'),
 ('total_secs_last_7_AVG', 'FLOAT64'),
 ('num_unq_last_7', 'FLOAT64'),
 ('num_unq_last_7_AVG', 'FLOAT64'),
 ('num_repeat_last_7', 'FLOAT64'),
 ('num_repeat_last_7_AVG', 'FLOAT64'),
 ('over_50perc_last_7', 'FLOAT64'),
 ('over_50perc_last_7_AVG', 'FLOAT64'),
 ('over_75perc_last_7', 'FLOAT64'),
 ('over_75perc_last_7_AVG', 'FLOAT64'),
 ('over_985perc_last_7', 'FLOAT64'),
 ('over_985perc_last_7_AVG', 'FLOAT64'),
 ('songs_last_15', 'FLOAT64'),
 ('songs_last_15_AVG', 'FLOAT64'),
 ('logins_last_15', 'FLOAT64'),
 ('logins_last_15_AVG', 'FLOAT64'),
 ('total_secs_last_15', 'FLOAT64'),
 ('total_secs_last_15_AVG', 'FLOAT64'),
 ('num_unq_last_15', 'FLOAT64'),
 ('num_unq_last_15_AVG', 'FLOAT64'),
 ('num_repeat_last_15', 'FLOAT64'),
 ('num_repeat_last_15_AVG', 'FLOAT64'),
 ('over_50perc_last_15', 'FLOAT64'),
 ('over_50perc_last_15_AVG', 'FLOAT64'),
 ('over_75perc_last_15', 'FLOAT64'),
 ('over_75perc_last_15_AVG', 'FLOAT64'),
 ('over_985perc_last_15', 'FLOAT64'),
 ('over_985perc_last_15_AVG', 'FLOAT64'),
 ('songs_last_30', 'FLOAT64'),
 ('songs_last_30_AVG', 'FLOAT64'),
 ('logins_last_30', 'FLOAT64'),
 ('logins_last_30_AVG', 'FLOAT64'),
 ('total_secs_last_30', 'FLOAT64'),
 ('total_secs_last_30_AVG', 'FLOAT64'),
 ('num_unq_last_30', 'FLOAT64'),
 ('num_unq_last_30_AVG', 'FLOAT64'),
 ('num_repeat_last_30', 'FLOAT64'),
 ('num_repeat_last_30_AVG', 'FLOAT64'),
 ('over_50perc_last_30', 'FLOAT64'),
 ('over_50perc_last_30_AVG', 'FLOAT64'),
 ('over_75perc_last_30', 'FLOAT64'),
 ('over_75perc_last_30_AVG', 'FLOAT64'),
 ('over_985perc_last_30', 'FLOAT64'),
 ('over_985perc_last_30_AVG', 'FLOAT64'),
 ('songs_last_60', 'FLOAT64'),
 ('songs_last_60_AVG', 'FLOAT64'),
 ('logins_last_60', 'FLOAT64'),
 ('logins_last_60_AVG', 'FLOAT64'),
 ('total_secs_last_60', 'FLOAT64'),
 ('total_secs_last_60_AVG', 'FLOAT64'),
 ('num_unq_last_60', 'FLOAT64'),
 ('num_unq_last_60_AVG', 'FLOAT64'),
 ('num_repeat_last_60', 'FLOAT64'),
 ('num_repeat_last_60_AVG', 'FLOAT64'),
 ('over_50perc_last_60', 'FLOAT64'),
 ('over_50perc_last_60_AVG', 'FLOAT64'),
 ('over_75perc_last_60', 'FLOAT64'),
 ('over_75perc_last_60_AVG', 'FLOAT64'),
 ('over_985perc_last_60', 'FLOAT64'),
 ('over_985perc_last_60_AVG', 'FLOAT64'),
 ('songs_last_120', 'FLOAT64'),
 ('songs_last_120_AVG', 'FLOAT64'),
 ('logins_last_120', 'FLOAT64'),
 ('logins_last_120_AVG', 'FLOAT64'),
 ('total_secs_last_120', 'FLOAT64'),
 ('total_secs_last_120_AVG', 'FLOAT64'),
 ('num_unq_last_120', 'FLOAT64'),
 ('num_unq_last_120_AVG', 'FLOAT64'),
 ('num_repeat_last_120', 'FLOAT64'),
 ('num_repeat_last_120_AVG', 'FLOAT64'),
 ('over_50perc_last_120', 'FLOAT64'),
 ('over_50perc_last_120_AVG', 'FLOAT64'),
 ('over_75perc_last_120', 'FLOAT64'),
 ('over_75perc_last_120_AVG', 'FLOAT64'),
 ('over_985perc_last_120', 'FLOAT64'),
 ('over_985perc_last_120_AVG', 'FLOAT64'),
 ('SUM_unq_songs_0_15', 'FLOAT64'),
 ('AVG_unq_songs_0_15', 'FLOAT64'),
 ('SUM_songs_0_15', 'FLOAT64'),
 ('AVG_songs_0_15', 'FLOAT64'),
 ('SUM_secs_0_15', 'FLOAT64'),
 ('AVG_secs_0_15', 'FLOAT64'),
 ('SUM_songs50_0_15', 'FLOAT64'),
 ('AVG_songs50_0_15', 'FLOAT64'),
 ('SUM_logins_0_15', 'FLOAT64'),
 ('AVG_logins_0_15', 'FLOAT64'),
 ('SUM_repeats_0_15', 'FLOAT64'),
 ('AVG_repeats_0_15', 'FLOAT64'),
 ('SUM_unq_songs_15_30', 'FLOAT64'),
 ('AVG_unq_songs_15_30', 'FLOAT64'),
 ('SUM_songs_15_30', 'FLOAT64'),
 ('AVG_songs_15_30', 'FLOAT64'),
 ('SUM_secs_15_30', 'FLOAT64'),
 ('AVG_secs_15_30', 'FLOAT64'),
 ('SUM_songs50_15_30', 'FLOAT64'),
 ('AVG_songs50_15_30', 'FLOAT64'),
 ('SUM_logins_15_30', 'FLOAT64'),
 ('AVG_logins_15_30', 'FLOAT64'),
 ('SUM_repeats_15_30', 'FLOAT64'),
 ('AVG_repeats_15_30', 'FLOAT64'),
 ('SUM_unq_songs_30_45', 'FLOAT64'),
 ('AVG_unq_songs_30_45', 'FLOAT64'),
 ('SUM_songs_30_45', 'FLOAT64'),
 ('AVG_songs_30_45', 'FLOAT64'),
 ('SUM_secs_30_45', 'FLOAT64'),
 ('AVG_secs_30_45', 'FLOAT64'),
 ('SUM_songs50_30_45', 'FLOAT64'),
 ('AVG_songs50_30_45', 'FLOAT64'),
 ('SUM_logins_30_45', 'FLOAT64'),
 ('AVG_logins_30_45', 'FLOAT64'),
 ('SUM_repeats_30_45', 'FLOAT64'),
 ('AVG_repeats_30_45', 'FLOAT64'),
 ('SUM_unq_songs_45_60', 'FLOAT64'),
 ('AVG_unq_songs_45_60', 'FLOAT64'),
 ('SUM_songs_45_60', 'FLOAT64'),
 ('AVG_songs_45_60', 'FLOAT64'),
 ('SUM_secs_45_60', 'FLOAT64'),
 ('AVG_secs_45_60', 'FLOAT64'),
 ('SUM_songs50_45_60', 'FLOAT64'),
 ('AVG_songs50_45_60', 'FLOAT64'),
 ('SUM_logins_45_60', 'FLOAT64'),
 ('AVG_logins_45_60', 'FLOAT64'),
 ('SUM_repeats_45_60', 'FLOAT64'),
 ('AVG_repeats_45_60', 'FLOAT64'),
 ('DIFSUM_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_secs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_logins_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_logins_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_secs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_logins_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_logins_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_secs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_logins_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_logins_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_repeats_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_repeats_30_45_45_60', 'FLOAT64'),
 ('expire_last_login', 'INT64'),
 ('total_cancelations', 'INT64'),
 ('login_after_expire_10', 'INT64'),
 ('login_after_expire_20', 'INT64'),
 ('login_after_expire_30', 'INT64'),
 ('STD_unq_songs_0_15', 'FLOAT64'),
 ('STD_songs_0_15', 'FLOAT64'),
 ('STD_secs_0_15', 'FLOAT64'),
 ('STD_songs50_0_15', 'FLOAT64'),
 ('STD_repeats_0_15', 'FLOAT64'),
 ('STD_unq_songs_15_30', 'FLOAT64'),
 ('STD_songs_15_30', 'FLOAT64'),
 ('STD_secs_15_30', 'FLOAT64'),
 ('STD_songs50_15_30', 'FLOAT64'),
 ('STD_repeats_15_30', 'FLOAT64'),
 ('STD_unq_songs_30_45', 'FLOAT64'),
 ('STD_songs_30_45', 'FLOAT64'),
 ('STD_secs_30_45', 'FLOAT64'),
 ('STD_songs50_30_45', 'FLOAT64'),
 ('STD_repeats_30_45', 'FLOAT64'),
 ('STD_unq_songs_45_60', 'FLOAT64'),
 ('STD_songs_45_60', 'FLOAT64'),
 ('STD_secs_45_60', 'FLOAT64'),
 ('STD_songs50_45_60', 'FLOAT64'),
 ('STD_repeats_45_60', 'FLOAT64'),
 ('DIFSTD_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_repeats_30_45_45_60', 'FLOAT64'),
 ('is_cancel', 'INT64')]

In [None]:
from pyspark.sql.functions import expr

# Correctly Cast DRV_Feb2016
for feature, datatype in column_types_pd:
    if datatype == 'STRING':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS string)"))')
    if datatype == 'DATE':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS timestamp)"))')
    if datatype == 'INT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS integer)"))')
    if datatype == 'FLOAT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS double)"))')

## Model Pre-Processing
https://medium.com/@dhiraj.p.rai/essentials-of-feature-engineering-in-pyspark-part-i-76a57680a85

#### - <font color=blue>Split Feautres by Categorical or Continuous</font> -

In [7]:
# Create list of Categorical feature names
cat_feats = ['is_auto_renew', 'total_spent_zero', 'city_agg', 'payment_method_agg', 'never_active_subscriber', 'Cluster']

In [8]:
# Create list of Continuous feature names
cont_feats = [x for x in DRV_Jan2016_1to1.columns if x not in cat_feats]
cont_feats.remove('msno')
cont_feats.remove('is_churn')
cont_feats.remove('membership_expire_date')
cont_feats.remove('registration_init_time')
cont_feats.remove('city')
cont_feats.remove('bd')
cont_feats.remove('payment_method_id')
cont_feats.remove('is_net_paid_amount')
cont_feats.remove('registered_via')
cont_feats.remove('_c0')

### - <font color=blue>Data Pre-Processing</font> -

#### <font color=purple>*Encode Categorical Variables*</font>

In [9]:
# Create a list of categorical, 'Vector' feature names
cat_feats_vec = ['is_auto_renew_vec', 'total_spent_zero_vec', 'city_agg_vec', 
                 'payment_method_agg_vec', 'never_active_subscriber_vec', 'Cluster_vec']

In [10]:
# One Hot Encode

is_auto_renew_encoder = OneHotEncoder(inputCol='is_auto_renew',outputCol='is_auto_renew_vec')
total_spent_zero_encoder = OneHotEncoder(inputCol='total_spent_zero',outputCol='total_spent_zero_vec')
city_agg_encoder = OneHotEncoder(inputCol='city_agg',outputCol='city_agg_vec')
payment_method_agg_encoder = OneHotEncoder(inputCol='payment_method_agg',outputCol='payment_method_agg_vec')
never_active_subscriber_encoder = OneHotEncoder(inputCol='never_active_subscriber',outputCol='never_active_subscriber_vec')
clueter_encoder = OneHotEncoder(inputCol='Cluster',outputCol='Cluster_vec')
# is_net_paid_amount_encoder = OneHotEncoder(inputCol='is_net_paid_amount',outputCol='is_net_paid_amount_vec')

# registered_via_encoder = OneHotEncoder(inputCol='registered_via',outputCol='registered_via_vec')


#### <font color=purple>*Vector Assembler*</font>

In [11]:
# Create master list of feature names for model
final_features = cont_feats + cat_feats_vec

In [12]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

#### <font color=purple>*Feature Scaling*</font>

In [13]:
# Scale all features into our final output features
scaler = StandardScaler(inputCol='features', 
                        outputCol='features_scaled',
                        withStd=True, withMean=False)

## Model Creation: Pipeline and Tuning

### - <font color=blue>Create Pipeline Object</font> -
https://spark.apache.org/docs/2.4.3/ml-pipeline.html

In [14]:
# Instatiate Model Estimators and Parameters
gbt = GBTClassifier(featuresCol='features_scaled',
                    labelCol='is_churn')

In [15]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            clueter_encoder,assembler,scaler,gbt])

### - <font color=blue>Model Tuning</font> -
https://spark.apache.org/docs/2.4.3/ml-tuning.html

In [16]:
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_churn')

#### <font color=purple>Gradient Boosted Trees Parameter Tuning</font>

In [17]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

## Model Execution and Evaluation

### - <font color=blue>Train Model: All Features, All Splits</font> -

***Gradient Boosted Trees***

In [19]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to1 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 285


In [20]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to1 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 333


In [21]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to1 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 384


In [22]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to1 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 455


In [23]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to1 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 613


In [24]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to1 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 801


In [25]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to1 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 1015


#### <font color=purple>Evaluate Trained Model</font>

***Create Custom Evaluator***

In [26]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluator:

    def __init__(self, resultname, resultdata, model):
        from sklearn.metrics import confusion_matrix
        # Initialize variables
        self.resultPandas = resultdata[['is_churn', 'prediction']].toPandas()
        self.resultdata = resultdata 
        self.resultname = resultname
        self.model = model
        
        self.cm = confusion_matrix(self.resultPandas['is_churn'],self.resultPandas['prediction'])
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fp = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fn = self.cm[1][0]
        
    def evaluate(self):
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                labelCol='is_churn')
        AUC = my_eval.evaluate(self.resultdata)
        
        
        return pd.DataFrame(data=[[self.resultname, AUC, f1, precision, recall, error]], 
                            columns=['resultname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Evaluate Train Model: All Features, All Splits</font> -

***Transform Train Data on Trained Models***

In [27]:
# Dictionary of Models that were created
models_created = {
                  'gbt_model_1to1' : (gbt_model_1to1, DRV_Jan2016_1to1),
                  'gbt_model_3to1' : (gbt_model_3to1, DRV_Jan2016_3to1),
                  'gbt_model_5to1' : (gbt_model_5to1, DRV_Jan2016_5to1),
                  'gbt_model_7to1' : (gbt_model_7to1, DRV_Jan2016_7to1),
                  'gbt_model_9to1' : (gbt_model_9to1, DRV_Jan2016_9to1),
                  'gbt_model_11to1' : (gbt_model_11to1, DRV_Jan2016_11to1),
                  'gbt_model_13to1' : (gbt_model_13to1, DRV_Jan2016_13to1)}

In [28]:
## Create empty dataframe and populate with Train Set transformation results.
train_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_resultsall = train_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to1
[[589237  83514]
 [  1943  17205]]
 
gbt_model_3to1
[[634127  38624]
 [  4063  15085]]
 
gbt_model_5to1
[[648451  24300]
 [  5649  13499]]
 
gbt_model_7to1
[[655955  16796]
 [  7050  12098]]
 
gbt_model_9to1
[[659533  13218]
 [  7639  11509]]
 
gbt_model_11to1
[[662619  10132]
 [  8402  10746]]
 
gbt_model_13to1
[[664652   8099]
 [  9080  10068]]
 


In [29]:
# Evaluate Training Model
train_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1,0.887195,0.932383,0.875862,0.996713,0.123511
0,gbt_model_3to1,0.865199,0.967433,0.942588,0.993634,0.061695
0,gbt_model_5to1,0.834431,0.977424,0.96388,0.991364,0.043285
0,gbt_model_7to1,0.803425,0.982143,0.975034,0.989367,0.034465
0,gbt_model_9to1,0.790704,0.984429,0.980352,0.98855,0.030145
0,gbt_model_11to1,0.773073,0.986202,0.984939,0.987479,0.026787
0,gbt_model_13to1,0.75688,0.987237,0.987961,0.986523,0.024829


### - <font color=blue>Evaluate Validation Model: All Features, All Splits</font> -

In [30]:
# Create empty dataframe and populate with Train Set transformation results.
validation_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_resultsall = validation_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to1
[[450285  77919]
 [  2476  17447]]
 
gbt_model_3to1
[[493859  34345]
 [  5259  14664]]
 
gbt_model_5to1
[[507065  21139]
 [  7330  12593]]
 
gbt_model_7to1
[[513140  15064]
 [  9005  10918]]
 
gbt_model_9to1
[[516137  12067]
 [  9909  10014]]
 
gbt_model_11to1
[[518683   9521]
 [ 10788   9135]]
 
gbt_model_13to1
[[520483   7721]
 [ 11560   8363]]
 


In [31]:
# Evaluate Validation Model
validation_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1,0.864102,0.91804,0.852483,0.994531,0.146672
0,gbt_model_3to1,0.835506,0.961444,0.934978,0.989463,0.072253
0,gbt_model_5to1,0.796031,0.972689,0.959979,0.98575,0.051939
0,gbt_model_7to1,0.759745,0.97708,0.971481,0.982754,0.043911
0,gbt_model_9to1,0.739895,0.97915,0.977155,0.981163,0.040093
0,gbt_model_11to1,0.720245,0.980793,0.981975,0.979625,0.037052
0,gbt_model_13to1,0.702574,0.98181,0.985383,0.978272,0.035176


We now currently have ~230 Features with the inclusion of our *Bi-Weekly Activity Block* and *Comparison of Bi-Weekly Activity Block Features*. Looking at the results above we can see that we have made some improvements on AUC. We also notice how Recall increased significantly over the Ratio'd subsets. The ratio'd subsets seemed to have helped address the issue of having a high amount of False Positives.

We can see that our models are still overfitting with the higher ratio'd models being the worst. However our Precision scores are better over our higher ratio'd models along with an improvement in model error and a slight decrease in Recall. Each of these models have there pros and cons and we might benefit from some sort of ensemble of these models. Before we do such a thing let's play around a bit with feature selection to see if we improve on the overall generalization between our Train and Validation models.

#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [85]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

Unnamed: 0,maxIter,maxDepth,minInstancesPerNode,AUC,Model
0,20,5,20,0.954903,gbt_model_1to1
0,20,5,20,0.954625,gbt_model_3to1
0,20,5,20,0.954089,gbt_model_5to1
0,20,5,20,0.954205,gbt_model_7to1
0,20,5,20,0.954931,gbt_model_9to1
0,20,5,20,0.954439,gbt_model_11to1
0,20,5,20,0.954333,gbt_model_13to1


#### <font color=purple>Model Evaluation: Feature Importance</font>

We will derive average feature importance scores for all features. Then we will produce a 5 Number Summary on these scores and group our features based on their scores against the following thresholds: Mean, 75th Percentile, 50th Percentile, and 25th Percentile.

In [32]:
# Display feature importances
importances = gbt_model_1to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column1 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_3to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column3 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_5to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column5 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_7to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column7 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_9to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column9 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_11to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column11 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_13to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column13 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

In [33]:
feature_imp = pd.merge(column1, column3 , on='Features')
feature_imp = pd.merge(feature_imp, column5, on='Features')
feature_imp = pd.merge(feature_imp, column7, on='Features')
feature_imp = pd.merge(feature_imp, column9, on='Features')
feature_imp = pd.merge(feature_imp, column11, on='Features')
feature_imp = pd.merge(feature_imp, column13, on='Features')

feature_imp['avg'] = feature_imp[list(feature_imp.columns[1:-1])].mean(axis=1)

In [34]:
# Show 5-Number Sumamry
feature_imp['avg'].describe()

count    231.000000
mean       0.004126
std        0.018269
min        0.000000
25%        0.000000
50%        0.000331
75%        0.001119
max        0.167198
Name: avg, dtype: float64

In [35]:
feature_imp[['Features','avg']].head(30)

Unnamed: 0,Features,avg
0,login_after_expire_30,0.136184
1,total_transactions,0.140791
2,is_auto_renew_vec,0.167198
3,login_after_expire_20,0.017738
4,is_cancel,0.038142
5,plan_list_price,0.072796
6,avg_spent_trans,0.036482
7,total_spent,0.034143
8,net_paid_amount,0.034785
9,expire_last_login,0.036996


In [36]:
mean_feats = feature_imp[feature_imp['avg'] > .004158]['Features'].tolist()
quart25_feats = feature_imp[feature_imp['avg'] > .000215]['Features'].tolist()
quart50_feats = feature_imp[feature_imp['avg'] > .000455]['Features'].tolist()
quart75_feats = feature_imp[feature_imp['avg'] > .001476]['Features'].tolist()

In [22]:
mean_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'plan_list_price',
 'avg_spent_trans',
 'is_cancel',
 'total_spent',
 'expire_last_login',
 'net_paid_amount',
 'membership_length',
 'spent_per_logins',
 'logins_last_60',
 'total_logins',
 'SUM_logins_30_45',
 'payment_plan_days',
 'DIFAVG_logins_15_30_30_45',
 'payment_method_agg_vec',
 'SUM_songs50_0_15',
 'num_unq_last_60']

In [23]:
quart25_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'plan_list_price',
 'avg_spent_trans',
 'is_cancel',
 'total_spent',
 'expire_last_login',
 'net_paid_amount',
 'membership_length',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'total_logins',
 'SUM_logins_30_45',
 'payment_plan_days',
 'logins_last_120',
 'DIFAVG_logins_15_30_30_45',
 'payment_method_agg_vec',
 'sum_over_50pec',
 'SUM_songs50_0_15',
 'SUM_logins_45_60',
 'songs_last_60',
 'songs_last_120',
 'spent_per_song',
 'DIFSUM_songs50_0_15_15_30',
 'total_secs_last_15',
 'SUM_songs_30_45',
 'num_repeat_last_120',
 'STD_songs50_30_45',
 'sum_over_985pec',
 'SUM_songs50_30_45',
 'sum_num_unq',
 'spent_per_num_repeats',
 'spent_per_num_unq',
 'DIFSUM_logins_30_45_45_60',
 'SUM_unq_songs_30_45',
 'sum_over_75pec',
 'over_50perc_last_7',
 'DIFSUM_secs_0_15_15_30',
 'DIFSUM_unq_songs_30_45_45_60',
 'total_secs_last_7',
 'DIFSUM_songs50_15_30_30_45',
 'STD_songs_45_60',
 'DIFSUM_secs_30_45_45_60',
 'songs_last_7',
 'DIFSTD_repeats_15_30_30_45',
 'STD_unq_songs_30_45',
 'sum_num_repeat',
 'over_75perc_last_7',
 'num_unq_last_7',
 'DIFAVG_logins_30_45_45_60',
 'over_985perc_last_120',
 'STD_repeats_30_45',
 'STD_songs_15_30',
 'STD_unq_songs_15_30',
 'DIFSUM_logins_0_15_15_30',
 'total_secs_last_60',
 'SUM_repeats_15_30',
 'num_unq_last_15',
 'logins_last_15',
 'num_unq_last_30',
 'num_repeat_last_60',
 'DIFSTD_repeats_0_15_15_30',
 'over_75perc_last_15',
 'STD_secs_15_30',
 'SUM_songs_45_60',
 'songs_last_15',
 'SUM_songs50_45_60',
 'total_secs',
 'SUM_repeats_45_60',
 'DIFSUM_logins_15_30_30_45',
 'DIFSUM_songs50_30_45_45_60',
 'over_75perc_last_60',
 'logins_last_30',
 'over_75perc_last_120',
 'num_unq_last_120',
 'DIFSUM_unq_songs_15_30_30_45',
 'total_secs_last_30',
 'over_985perc_last_7',
 'num_unq_last_60',
 'SUM_secs_30_45',
 'over_985perc_last_30',
 'num_repeat_last_7',
 'DIFAVG_songs50_15_30_30_45',
 'SUM_songs50_15_30',
 'DIFSTD_songs_0_15_15_30',
 'DIFSTD_songs50_0_15_15_30',
 'DIFSUM_songs_0_15_15_30',
 'DIFAVG_logins_0_15_15_30',
 'DIFAVG_songs_0_15_15_30',
 'DIFSUM_secs_15_30_30_45',
 'songs_last_30',
 'STD_songs50_45_60',
 'SUM_secs_45_60',
 'STD_secs_45_60',
 'STD_unq_songs_45_60',
 'SUM_unq_songs_45_60',
 'DIFSUM_repeats_15_30_30_45',
 'total_songs',
 'DIFSTD_repeats_30_45_45_60',
 'DIFSTD_secs_30_45_45_60',
 'DIFSTD_songs_30_45_45_60',
 'DIFSTD_songs50_15_30_30_45',
 'STD_repeats_45_60',
 'DIFSTD_songs_15_30_30_45',
 'DIFSTD_unq_songs_15_30_30_45',
 'DIFSUM_repeats_0_15_15_30',
 'DIFAVG_songs50_0_15_15_30',
 'DIFSTD_unq_songs_0_15_15_30',
 'DIFAVG_unq_songs_0_15_15_30',
 'DIFSUM_unq_songs_0_15_15_30',
 'SUM_songs_15_30',
 'over_985perc_last_60',
 'over_50perc_last_30',
 'over_75perc_last_30',
 'num_unq_last_60_AVG',
 'over_50perc_last_60',
 'logins_last_7',
 'num_repeat_last_15',
 'over_50perc_last_15',
 'SUM_secs_15_30',
 'STD_songs50_15_30',
 'STD_repeats_15_30',
 'STD_songs50_0_15',
 'AVG_songs50_0_15',
 'total_secs_last_120',
 'over_50perc_last_120',
 'spent_per_secs',
 'STD_unq_songs_0_15']

In [24]:
quart50_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'plan_list_price',
 'avg_spent_trans',
 'is_cancel',
 'total_spent',
 'expire_last_login',
 'net_paid_amount',
 'membership_length',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'total_logins',
 'SUM_logins_30_45',
 'payment_plan_days',
 'logins_last_120',
 'DIFAVG_logins_15_30_30_45',
 'payment_method_agg_vec',
 'sum_over_50pec',
 'SUM_songs50_0_15',
 'SUM_logins_45_60',
 'songs_last_60',
 'songs_last_120',
 'spent_per_song',
 'DIFSUM_songs50_0_15_15_30',
 'total_secs_last_15',
 'SUM_songs_30_45',
 'num_repeat_last_120',
 'STD_songs50_30_45',
 'sum_over_985pec',
 'SUM_songs50_30_45',
 'sum_num_unq',
 'spent_per_num_repeats',
 'spent_per_num_unq',
 'DIFSUM_logins_30_45_45_60',
 'SUM_unq_songs_30_45',
 'sum_over_75pec',
 'over_50perc_last_7',
 'DIFSUM_secs_0_15_15_30',
 'DIFSUM_unq_songs_30_45_45_60',
 'total_secs_last_7',
 'DIFSUM_secs_30_45_45_60',
 'songs_last_7',
 'STD_unq_songs_30_45',
 'sum_num_repeat',
 'over_75perc_last_7',
 'num_unq_last_7',
 'DIFAVG_logins_30_45_45_60',
 'over_985perc_last_120',
 'STD_repeats_30_45',
 'STD_songs_15_30',
 'DIFSUM_logins_0_15_15_30',
 'total_secs_last_60',
 'SUM_repeats_15_30',
 'num_unq_last_15',
 'logins_last_15',
 'num_unq_last_30',
 'num_repeat_last_60',
 'SUM_songs_45_60',
 'songs_last_15',
 'SUM_songs50_45_60',
 'total_secs',
 'SUM_repeats_45_60',
 'DIFSUM_logins_15_30_30_45',
 'DIFSUM_songs50_30_45_45_60',
 'over_75perc_last_60',
 'logins_last_30',
 'num_unq_last_120',
 'total_secs_last_30',
 'num_unq_last_60',
 'SUM_secs_30_45',
 'num_repeat_last_7',
 'SUM_songs50_15_30',
 'DIFSUM_songs_0_15_15_30',
 'DIFAVG_logins_0_15_15_30',
 'DIFSUM_secs_15_30_30_45',
 'songs_last_30',
 'STD_songs50_45_60',
 'STD_secs_45_60',
 'STD_unq_songs_45_60',
 'SUM_unq_songs_45_60',
 'DIFSUM_repeats_15_30_30_45',
 'total_songs',
 'DIFSTD_songs_30_45_45_60',
 'DIFSTD_songs50_15_30_30_45',
 'STD_repeats_45_60',
 'DIFSTD_unq_songs_0_15_15_30',
 'DIFAVG_unq_songs_0_15_15_30',
 'over_985perc_last_60',
 'over_50perc_last_30',
 'over_75perc_last_30',
 'num_unq_last_60_AVG',
 'over_50perc_last_60',
 'logins_last_7',
 'num_repeat_last_15',
 'STD_songs50_15_30',
 'STD_repeats_15_30',
 'STD_songs50_0_15',
 'total_secs_last_120',
 'spent_per_secs',
 'STD_unq_songs_0_15']



In [25]:
quart75_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'plan_list_price',
 'avg_spent_trans',
 'is_cancel',
 'total_spent',
 'expire_last_login',
 'net_paid_amount',
 'membership_length',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'total_logins',
 'SUM_logins_30_45',
 'payment_plan_days',
 'logins_last_120',
 'DIFAVG_logins_15_30_30_45',
 'payment_method_agg_vec',
 'sum_over_50pec',
 'SUM_songs50_0_15',
 'SUM_logins_45_60',
 'songs_last_60',
 'songs_last_120',
 'SUM_songs50_30_45',
 'sum_num_unq',
 'songs_last_7',
 'sum_num_repeat',
 'num_unq_last_7',
 'DIFAVG_logins_30_45_45_60',
 'over_985perc_last_120',
 'DIFSUM_logins_0_15_15_30',
 'total_secs_last_60',
 'num_unq_last_15',
 'logins_last_15',
 'num_unq_last_30',
 'songs_last_15',
 'DIFSUM_logins_15_30_30_45',
 'over_75perc_last_60',
 'logins_last_30',
 'total_secs_last_30',
 'num_unq_last_60',
 'SUM_secs_30_45',
 'num_repeat_last_7',
 'songs_last_30',
 'STD_songs50_45_60',
 'over_50perc_last_60',
 'total_secs_last_120']

--------------

### - <font color=blue>Train Model: Mean Features, All Splits</font> -

In [107]:
# Create master list of feature names for model
final_features = mean_feats

In [108]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [109]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [110]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [41]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to1mean = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 147


In [42]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to1mean = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 175


In [43]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to1mean = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 205


In [44]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to1mean = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 228


In [45]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to1mean = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 254


In [46]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to1mean = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 292


In [47]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to1mean = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 309


In [111]:
# Dictionary of Models that were created
models_createdmean = {
                  'gbt_model_1to1mean' : (gbt_model_1to1mean, DRV_Jan2016_1to1),
                  'gbt_model_3to1mean' : (gbt_model_3to1mean, DRV_Jan2016_3to1),
                  'gbt_model_5to1mean' : (gbt_model_5to1mean, DRV_Jan2016_5to1),
                  'gbt_model_7to1mean' : (gbt_model_7to1mean, DRV_Jan2016_7to1),
                  'gbt_model_9to1mean' : (gbt_model_9to1mean, DRV_Jan2016_9to1),
                  'gbt_model_11to1mean' : (gbt_model_11to1mean, DRV_Jan2016_11to1),
                  'gbt_model_13to1mean' : (gbt_model_13to1mean, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: Mean Features, All Splits</font> -

In [112]:
## Create empty dataframe and populate with Train Set transformation results.
train_resultsmean = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_createdmean.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_resultsmean = train_resultsmean.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to1mean
[[589479  83272]
 [  1948  17200]]
 
gbt_model_3to1mean
[[635623  37128]
 [  4069  15079]]
 
gbt_model_5to1mean
[[648524  24227]
 [  5524  13624]]
 
gbt_model_7to1mean
[[655800  16951]
 [  6998  12150]]
 
gbt_model_9to1mean
[[659279  13472]
 [  7548  11600]]
 
gbt_model_11to1mean
[[662423  10328]
 [  8351  10797]]
 
gbt_model_13to1mean
[[664784   7967]
 [  9221   9927]]
 


In [113]:
# Evaluate Training Model
train_resultsmean

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1mean,0.887244,0.932584,0.876222,0.996706,0.123168
0,gbt_model_3to1mean,0.866155,0.968605,0.944812,0.993639,0.059542
0,gbt_model_5to1mean,0.837749,0.977572,0.963988,0.991554,0.042999
0,gbt_model_7to1mean,0.804667,0.982063,0.974803,0.989442,0.034613
0,gbt_model_9to1mean,0.792891,0.984303,0.979975,0.988681,0.03038
0,gbt_model_11to1mean,0.77426,0.986092,0.984648,0.98755,0.026997
0,gbt_model_13to1mean,0.753296,0.987232,0.988158,0.986319,0.024842


### - <font color=blue>Evaluate Validation Model: Mean Features, All Splits</font> -

In [114]:
# Create empty dataframe and populate with Train Set transformation results.
validation_resultsmean = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_createdmean.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_resultsmean = validation_resultsmean.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to1mean
[[451279  76925]
 [  2407  17516]]
 
gbt_model_3to1mean
[[491516  36688]
 [  5241  14682]]
 
gbt_model_5to1mean
[[505446  22758]
 [  7500  12423]]
 
gbt_model_7to1mean
[[512838  15366]
 [  9112  10811]]
 
gbt_model_9to1mean
[[515809  12395]
 [  9734  10189]]
 
gbt_model_11to1mean
[[518516   9688]
 [ 10708   9215]]
 
gbt_model_13to1mean
[[520712   7492]
 [ 11699   8224]]
 


In [115]:
# Evaluate Validation Model
validation_resultsmean

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1mean,0.866775,0.9192,0.854365,0.994695,0.144733
0,gbt_model_3to1mean,0.83374,0.959087,0.930542,0.98945,0.076495
0,gbt_model_5to1mean,0.790233,0.970933,0.956914,0.985379,0.055203
0,gbt_model_7to1mean,0.756774,0.976686,0.970909,0.982542,0.044658
0,gbt_model_9to1mean,0.743976,0.978995,0.976534,0.981478,0.040372
0,gbt_model_11to1mean,0.722095,0.980707,0.981659,0.979767,0.03721
0,gbt_model_13to1mean,0.699303,0.981901,0.985816,0.978026,0.035012


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_createdmean.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 75th Percentile, All Splits</font> -

In [53]:
# Create master list of feature names for model
final_features = quart75_feats

In [54]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [55]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [56]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [57]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to175 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 162


In [58]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to175 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 196


In [59]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to175 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 228


In [60]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to175 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 248


In [61]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to175 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 279


In [62]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to175 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 313


In [63]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to175 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 336


In [116]:
# Dictionary of Models that were created
models_created75 = {
                  'gbt_model_1to175' : (gbt_model_1to175, DRV_Jan2016_1to1),
                  'gbt_model_3to175' : (gbt_model_3to175, DRV_Jan2016_3to1),
                  'gbt_model_5to175' : (gbt_model_5to175, DRV_Jan2016_5to1),
                  'gbt_model_7to175' : (gbt_model_7to175, DRV_Jan2016_7to1),
                  'gbt_model_9to175' : (gbt_model_9to175, DRV_Jan2016_9to1),
                  'gbt_model_11to175' : (gbt_model_11to175, DRV_Jan2016_11to1),
                  'gbt_model_13to175' : (gbt_model_13to175, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 75th Percentile, All Splits</font> -

In [117]:
## Create empty dataframe and populate with Train Set transformation results.
train_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created75.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results75 = train_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to175
[[588069  84682]
 [  1921  17227]]
 
gbt_model_3to175
[[633400  39351]
 [  4007  15141]]
 
gbt_model_5to175
[[648476  24275]
 [  5583  13565]]
 
gbt_model_7to175
[[655759  16992]
 [  6999  12149]]
 
gbt_model_9to175
[[659640  13111]
 [  7684  11464]]
 
gbt_model_11to175
[[662484  10267]
 [  8384  10764]]
 
gbt_model_13to175
[[664918   7833]
 [  9257   9891]]
 


In [118]:
# Evaluate Training Model
train_results75

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to175,0.886901,0.931412,0.874126,0.996744,0.125167
0,gbt_model_3to175,0.866121,0.966901,0.941507,0.993714,0.062665
0,gbt_model_5to175,0.836173,0.977491,0.963917,0.991464,0.043154
0,gbt_model_7to175,0.804611,0.982031,0.974743,0.98944,0.034674
0,gbt_model_9to175,0.789608,0.984477,0.980511,0.988485,0.030055
0,gbt_model_11to175,0.773443,0.986114,0.984739,0.987503,0.026956
0,gbt_model_13to175,0.752456,0.987307,0.988357,0.986269,0.0247


### - <font color=blue>Evaluate Validation Model: 75th Percentile, All Splits</font> -

In [119]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created75.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results75 = validation_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to175
[[451959  76245]
 [  2403  17520]]
 
gbt_model_3to175
[[493954  34250]
 [  5248  14675]]
 
gbt_model_5to175
[[507006  21198]
 [  7582  12341]]
 
gbt_model_7to175
[[513176  15028]
 [  9121  10802]]
 
gbt_model_9to175
[[516314  11890]
 [ 10002   9921]]
 
gbt_model_11to175
[[518654   9550]
 [ 10804   9119]]
 
gbt_model_13to175
[[520836   7368]
 [ 11695   8228]]
 


In [120]:
# Evaluate Validation Model
validation_results75

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to175,0.867519,0.919952,0.855652,0.994711,0.143485
0,gbt_model_3to175,0.835872,0.961551,0.935158,0.989487,0.07206
0,gbt_model_5to175,0.789651,0.972396,0.959868,0.985266,0.052506
0,gbt_model_7to175,0.756868,0.977007,0.971549,0.982537,0.044057
0,gbt_model_9to175,0.737728,0.979235,0.97749,0.980996,0.03994
0,gbt_model_11to175,0.719816,0.980751,0.98192,0.979594,0.037134
0,gbt_model_13to175,0.69952,0.982023,0.986051,0.978039,0.034778


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created75.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 50th Percentile, All Splits</font> -

In [69]:
# Create master list of feature names for model
final_features = quart50_feats

In [70]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [71]:
# # Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [72]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [73]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to150 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 196


In [74]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to150 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 238


In [75]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to150 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 273


In [76]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to150 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 306


In [77]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to150 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 343


In [78]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to150 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 374


In [79]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to150 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 412


In [121]:
# Dictionary of Models that were created
models_created50 = {
                  'gbt_model_1to150' : (gbt_model_1to150, DRV_Jan2016_1to1),
                  'gbt_model_3to150' : (gbt_model_3to150, DRV_Jan2016_3to1),
                  'gbt_model_5to150' : (gbt_model_5to150, DRV_Jan2016_5to1),
                  'gbt_model_7to150' : (gbt_model_7to150, DRV_Jan2016_7to1),
                  'gbt_model_9to150' : (gbt_model_9to150, DRV_Jan2016_9to1),
                  'gbt_model_11to150' : (gbt_model_11to150, DRV_Jan2016_11to1),
                  'gbt_model_13to150' : (gbt_model_13to150, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 50th Percentile, All Splits</font> -

In [122]:
## Create empty dataframe and populate with Train Set transformation results.
train_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created50.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results50 = train_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to150
[[589029  83722]
 [  1949  17199]]
 
gbt_model_3to150
[[633860  38891]
 [  3965  15183]]
 
gbt_model_5to150
[[648310  24441]
 [  5574  13574]]
 
gbt_model_7to150
[[655771  16980]
 [  6992  12156]]
 
gbt_model_9to150
[[659713  13038]
 [  7721  11427]]
 
gbt_model_11to150
[[662457  10294]
 [  8389  10759]]
 
gbt_model_13to150
[[664466   8285]
 [  9047  10101]]
 


In [123]:
# Evaluate Training Model
train_results50

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to150,0.886883,0.932203,0.875553,0.996702,0.12382
0,gbt_model_3to150,0.86756,0.967295,0.942191,0.993784,0.06194
0,gbt_model_5to150,0.836285,0.97737,0.96367,0.991476,0.043381
0,gbt_model_7to150,0.804802,0.982045,0.97476,0.98945,0.034647
0,gbt_model_9to150,0.788696,0.984505,0.98062,0.988432,0.030003
0,gbt_model_11to150,0.773293,0.98609,0.984699,0.987495,0.027002
0,gbt_model_13to150,0.757604,0.987121,0.987685,0.986567,0.02505


### - <font color=blue>Evaluate Validation Model: 50th Percentile, All Splits</font> -

In [124]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created50.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results50 = validation_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to150
[[451132  77072]
 [  2388  17535]]
 
gbt_model_3to150
[[492185  36019]
 [  5534  14389]]
 
gbt_model_5to150
[[506962  21242]
 [  7434  12489]]
 
gbt_model_7to150
[[513024  15180]
 [  9042  10881]]
 
gbt_model_9to150
[[516376  11828]
 [  9993   9930]]
 
gbt_model_11to150
[[518543   9661]
 [ 10768   9155]]
 
gbt_model_13to150
[[520476   7728]
 [ 11603   8320]]
 


In [125]:
# Evaluate Validation Model
validation_results50

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to150,0.867113,0.919056,0.854087,0.994735,0.144966
0,gbt_model_3to150,0.82702,0.959492,0.931809,0.988881,0.075809
0,gbt_model_5to150,0.793324,0.972491,0.959784,0.985548,0.052316
0,gbt_model_7to150,0.758707,0.976932,0.971261,0.98268,0.04419
0,gbt_model_9to150,0.738013,0.979303,0.977607,0.981015,0.03981
0,gbt_model_11to150,0.720614,0.980677,0.98171,0.979657,0.037271
0,gbt_model_13to150,0.701489,0.981763,0.985369,0.978193,0.035267


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created50.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 25th Percentile, All Splits</font> -

In [126]:
# Create master list of feature names for model
final_features = quart25_feats

In [127]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [128]:
# # Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [129]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [89]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to125 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 219


In [90]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to125 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 266


In [91]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to125 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 299


In [92]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to125 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 335


In [93]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to125 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 379


In [94]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to125 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 416


In [95]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to125 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 463


In [130]:
# Dictionary of Models that were created
models_created25 = {
                  'gbt_model_1to125' : (gbt_model_1to125, DRV_Jan2016_1to1),
                  'gbt_model_3to125' : (gbt_model_3to125, DRV_Jan2016_3to1),
                  'gbt_model_5to125' : (gbt_model_5to125, DRV_Jan2016_5to1),
                  'gbt_model_7to125' : (gbt_model_7to125, DRV_Jan2016_7to1),
                  'gbt_model_9to125' : (gbt_model_9to125, DRV_Jan2016_9to1),
                  'gbt_model_11to125' : (gbt_model_11to125, DRV_Jan2016_11to1),
                  'gbt_model_13to125' : (gbt_model_13to125, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 25th Percentile, All Splits</font> -

In [131]:
## Create empty dataframe and populate with Train Set transformation results.
train_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created25.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results25 = train_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to125
[[589180  83571]
 [  1936  17212]]
 
gbt_model_3to125
[[633316  39435]
 [  3955  15193]]
 
gbt_model_5to125
[[648190  24561]
 [  5552  13596]]
 
gbt_model_7to125
[[655981  16770]
 [  7017  12131]]
 
gbt_model_9to125
[[659603  13148]
 [  7674  11474]]
 
gbt_model_11to125
[[662721  10030]
 [  8395  10753]]
 
gbt_model_13to125
[[664744   8007]
 [  9121  10027]]
 


In [132]:
# Evaluate Training Model
train_results25

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to125,0.887335,0.93234,0.875777,0.996725,0.123583
0,gbt_model_3to125,0.867417,0.966873,0.941382,0.993794,0.062711
0,gbt_model_5to125,0.83677,0.977294,0.963492,0.991507,0.043522
0,gbt_model_7to125,0.804306,0.982187,0.975073,0.989416,0.034379
0,gbt_model_9to125,0.789842,0.984457,0.980456,0.9885,0.030094
0,gbt_model_11to125,0.773332,0.986285,0.985091,0.987491,0.02663
0,gbt_model_13to125,0.755878,0.987276,0.988098,0.986465,0.024755


### - <font color=blue>Evaluate Validation Model: 25th Percentile, All Splits</font> -

In [133]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created25.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results25 = validation_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_1to125
[[452018  76186]
 [  2444  17479]]
 
gbt_model_3to125
[[492364  35840]
 [  5469  14454]]
 
gbt_model_5to125
[[506779  21425]
 [  7267  12656]]
 
gbt_model_7to125
[[513179  15025]
 [  9151  10772]]
 
gbt_model_9to125
[[516249  11955]
 [ 10001   9922]]
 
gbt_model_11to125
[[518813   9391]
 [ 10854   9069]]
 
gbt_model_13to125
[[520566   7638]
 [ 11603   8320]]
 


In [134]:
validation_results25

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to125,0.866546,0.919978,0.855764,0.994622,0.143452
0,gbt_model_3to125,0.82882,0.959734,0.932147,0.989014,0.075364
0,gbt_model_5to125,0.797342,0.972466,0.959438,0.985863,0.052346
0,gbt_model_7to125,0.756118,0.976982,0.971555,0.98248,0.044107
0,gbt_model_9to125,0.737692,0.979173,0.977367,0.980996,0.040056
0,gbt_model_11to125,0.718712,0.980858,0.982221,0.979508,0.036935
0,gbt_model_13to125,0.701574,0.981849,0.98554,0.978197,0.035103


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created50.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

## <font color=black>Evaluation of Generalization over All Models</font>

In [135]:
results_all = train_resultsall[train_resultsall.columns[1:]] - validation_resultsall[validation_resultsall.columns[1:]]
results_all['resultname'] = train_resultsall['resultname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.023092,0.014343,0.023379,0.002182,-0.023161,gbt_model_1to1
0,0.029694,0.005989,0.00761,0.00417,-0.010558,gbt_model_3to1
0,0.038399,0.004734,0.0039,0.005613,-0.008653,gbt_model_5to1
0,0.043679,0.005063,0.003553,0.006613,-0.009447,gbt_model_7to1
0,0.050809,0.005279,0.003198,0.007387,-0.009948,gbt_model_9to1
0,0.052828,0.005409,0.002965,0.007854,-0.010264,gbt_model_11to1
0,0.054306,0.005427,0.002579,0.00825,-0.010347,gbt_model_13to1


In [136]:
results_mean = train_resultsmean[train_resultsmean.columns[1:]] - validation_resultsmean[validation_resultsmean.columns[1:]]
results_mean['resultname'] = train_resultsmean['resultname']
results_mean.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.020469,0.013384,0.021857,0.002012,-0.021565,gbt_model_1to1mean
0,0.032415,0.009518,0.01427,0.00419,-0.016953,gbt_model_3to1mean
0,0.047517,0.006639,0.007074,0.006176,-0.012203,gbt_model_5to1mean
0,0.047893,0.005377,0.003894,0.006899,-0.010044,gbt_model_7to1mean
0,0.048915,0.005309,0.003441,0.007203,-0.009992,gbt_model_9to1mean
0,0.052165,0.005385,0.00299,0.007784,-0.010214,gbt_model_11to1mean
0,0.053994,0.005332,0.002341,0.008293,-0.01017,gbt_model_13to1mean


In [137]:
results_75 = train_results75[validation_results25.columns[1:]] - validation_results75[validation_results25.columns[1:]]
results_75['resultname'] = train_results75['resultname']
results_75.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.019382,0.01146,0.018473,0.002033,-0.018318,gbt_model_1to175
0,0.03025,0.005351,0.00635,0.004226,-0.009395,gbt_model_3to175
0,0.046522,0.005095,0.004049,0.006198,-0.009352,gbt_model_5to175
0,0.047743,0.005024,0.003194,0.006903,-0.009383,gbt_model_7to175
0,0.05188,0.005242,0.003022,0.007489,-0.009885,gbt_model_9to175
0,0.052936,0.005283,0.002306,0.00823,-0.010078,gbt_model_13to175
0,0.053627,0.005363,0.002819,0.007909,-0.010177,gbt_model_11to175


In [138]:
results_50 = train_results50[train_results50.columns[1:]] - validation_results50[validation_results50.columns[1:]]
results_50['resultname'] = train_results50['resultname']
results_50.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.019771,0.013147,0.021466,0.001968,-0.021146,gbt_model_1to150
0,0.04054,0.007803,0.010383,0.004902,-0.013869,gbt_model_3to150
0,0.042961,0.004879,0.003886,0.005927,-0.008936,gbt_model_5to150
0,0.046095,0.005113,0.003499,0.00677,-0.009544,gbt_model_7to150
0,0.050683,0.005202,0.003013,0.007417,-0.009807,gbt_model_9to150
0,0.052678,0.005413,0.002989,0.007838,-0.010268,gbt_model_11to150
0,0.056115,0.005358,0.002316,0.008374,-0.010217,gbt_model_13to150


In [139]:
results_25 = train_results25[validation_results25.columns[1:]] - validation_results25[validation_results25.columns[1:]]
results_25['resultname'] = train_results25['resultname']
results_25.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.020789,0.012362,0.020013,0.002103,-0.019869,gbt_model_1to125
0,0.038596,0.007139,0.009235,0.004779,-0.012652,gbt_model_3to125
0,0.039428,0.004828,0.004054,0.005644,-0.008823,gbt_model_5to125
0,0.048188,0.005205,0.003518,0.006936,-0.009727,gbt_model_7to125
0,0.05215,0.005284,0.00309,0.007504,-0.009962,gbt_model_9to125
0,0.054304,0.005426,0.002558,0.008268,-0.010348,gbt_model_13to125
0,0.05462,0.005427,0.00287,0.007983,-0.010305,gbt_model_11to125


Based on generalization, the best in class features were as follows.
- **gbt_model_1to150** - 0.019771
- **gbt_model_3to1** - 0.029694
- **gbt_model_5to1** - 0.038399
- **gbt_model_7to1** - 0.043679
- **gbt_model_9to1mean** - 0.048915
- **gbt_model_11to1mean** - 0.052165
- **gbt_model_13to175** - 0.052936

Based on generalization, the best in class features were as follows.
- **gbt_model_1to175** - 0.024326
- **gbt_model_3to125** - 0.028995
- **gbt_model_5to175** - 0.040094
- **gbt_model_7to150** - 0.041693
- **gbt_model_9to175** - 0.049611
- **gbt_model_11to1mean** - 0.050464
- **gbt_model_13to1mean** - 0.052361

-------------

-------------

## Recursive Feature Elimination - Ensemble of Models

In [49]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

***Create Custom Classifier***

In [50]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluatorPandas:

    def __init__(self, modelname, model, y_pred, y_true):
        
        # Initialize variables
        self.modelname = modelname
        self.y_pred = y_pred 
        self.y_true = y_true
        self.model = model
        
        # Calculate confusion matrix
        from sklearn.metrics import confusion_matrix
        self.cm = confusion_matrix(y_true,y_pred)
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fp = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fn = self.cm[1][0]
        
    def evaluate(self):
        
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        from sklearn.metrics import confusion_matrix, roc_curve, auc
        false_positive_rate, true_positive_rate, thresholds = roc_curve(self.y_true, self.y_pred)
        AUC = round(auc(false_positive_rate, true_positive_rate), ndigits=5)
        
        return pd.DataFrame(data=[[self.modelname, AUC, f1, precision, recall, error]], 
                            columns=['modelname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Import Data</font> -

#### <font color=purple>Build Ensemble - Train Set</font>

In [None]:
# Return Probability Values as Spark DF
gbt_model_1to1_train = gbt_model_1to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_3to1_train = gbt_model_3to125.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_5to1_train = gbt_model_5to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_7to1_train = gbt_model_7to150.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_9to1_train = gbt_model_9to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_11to1_train = gbt_model_11to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_13to1_train = gbt_model_13to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()

In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Jan2016_predictsgbt = pd.merge(gbt_model_1to1_train[['msno','prediction']], gbt_model_3to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_5to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_7to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_9to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_11to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_13to1_train[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Jan2016_predictsgbt.columns = ['msno', 'gbt_model_1to175', 'gbt_model_3to150', 'gbt_model_5to175', 'gbt_model_7to150', 'gbt_model_9to175', 'gbt_model_11to1mean', 'gbt_model_13to175', 'is_churn']

In [None]:
Jan2016_predictsgbt.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Jan2016_predictsgbt)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Jan2016_predictsgbt')

#### <font color=purple>Build Ensemble - Validation Set</font>

In [None]:
# Return Prediction Values as Spark DF
gbt_model_1to1_valid = gbt_model_1to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_3to1_valid = gbt_model_3to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_5to1_valid = gbt_model_5to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_7to1_valid = gbt_model_7to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_9to1_valid = gbt_model_9to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_11to1_valid = gbt_model_11to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_13to1_valid = gbt_model_13to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()


In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Feb2016_predictsgbt = pd.merge(gbt_model_1to1_valid[['msno','prediction']], gbt_model_3to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_5to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_7to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_9to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_11to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_13to1_valid[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Feb2016_predictsgbt.columns = ['msno', 'gbt_model_1to175', 'gbt_model_3to150', 'gbt_model_5to175', 'gbt_model_7to150', 'gbt_model_9to175', 'gbt_model_11to1mean', 'gbt_model_13to175', 'is_churn']

In [None]:
Feb2016_predictsgbt.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Feb2016_predictsgbt)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Feb2016_predictsgbt')

#### <font color=purple>Import Ensemble Sets (if already built)</font>

In [34]:
Jan2016_predictsgbt = pd.read_csv('D:\J-5 Local\Jan2016_predictsgbt.csv')
Feb2016_predictsgbt = pd.read_csv('D:\J-5 Local\Feb2016_predictsgbt.csv')

In [35]:
# Instantiate Train x and y
train_x = Jan2016_predictsgbt[Jan2016_predictsgbt.columns[1:-1]]
train_y = Jan2016_predictsgbt['is_churn']

In [36]:
# Instantiate Validation x and y
valid_x = Feb2016_predictsgbt[Feb2016_predictsgbt.columns[1:-1]]
valid_y = Feb2016_predictsgbt['is_churn']

### <font color=blue>Train Model: All Splits, All Splits, XGB + RFECV</font>

In [38]:
# Instantiate Estimators
rfc = RandomForestClassifier()
gbm = GradientBoostingClassifier()
xgb = XGBClassifier()

In [11]:
start = time.time()
gbtrfc1 = RFECV(rfc, min_features_to_select=1, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 78


In [12]:
start = time.time()
gbtrfc2 = RFECV(rfc, min_features_to_select=2, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 70


In [13]:
start = time.time()
gbtrfc3 = RFECV(rfc, min_features_to_select=3, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 60


In [14]:
start = time.time()
gbtrfc4 = RFECV(rfc, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 50


In [15]:
start = time.time()
gbtrfc5 = RFECV(rfc, min_features_to_select=5, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 39


In [18]:
start = time.time()
gbtgmb4 = RFECV(gbm, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 1043


In [19]:
start = time.time()
gbtxgb4 = RFECV(xgb, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 718


In [20]:
# Dictionary of Models that were createdgg
ensembles_created = {
                  'GBT_RFC1' : gbtrfc1,
                  'GBT_RFC2' : gbtrfc2,
                  'GBT_RFC3' : gbtrfc3,
                  'GBT_RFC4' : gbtrfc4,
                  'GBT_RFC5' : gbtrfc5,
                  'GBT_GBM4' : gbtgmb4,
                  'GBT_XGB4' : gbtxgb4}

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + RFECV </font> -

In [22]:
# Train Model Results
train_ensemble_results = pd.DataFrame()

for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_ensemble_results = train_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

GBT_RFC1
[[665630   7121]
 [  9385   9763]]
Time spent for training: 736

GBT_RFC2
[[665625   7126]
 [  9382   9766]]
Time spent for training: 737

GBT_RFC3
[[665613   7138]
 [  9371   9777]]
Time spent for training: 738

GBT_RFC4
[[665632   7119]
 [  9391   9757]]
Time spent for training: 738

GBT_RFC5
[[665630   7121]
 [  9385   9763]]
Time spent for training: 739

GBT_GBM4
[[665629   7122]
 [  9381   9767]]
Time spent for training: 741

GBT_XGB4
[[665632   7119]
 [  9391   9757]]
Time spent for training: 742



In [23]:
train_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBT_RFC1,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBT_RFC2,0.74972,0.987747,0.989408,0.986101,0.023859
0,GBT_RFC3,0.75,0.987746,0.98939,0.986117,0.02386
0,GBT_RFC4,0.74949,0.987745,0.989418,0.986088,0.023862
0,GBT_RFC5,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBT_GBM4,0.74975,0.98775,0.989414,0.986102,0.023852
0,GBT_XGB4,0.74949,0.987745,0.989418,0.986088,0.023862


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + RFECV </font> -

In [24]:
# Validation Model Results
valid_ensemble_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    valid_ensemble_results = valid_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

GBT_RFC1
[[521510   6694]
 [ 11828   8095]]
Time spent for training: 743

GBT_RFC2
[[521507   6697]
 [ 11823   8100]]
Time spent for training: 743

GBT_RFC3
[[521480   6724]
 [ 11821   8102]]
Time spent for training: 744

GBT_RFC4
[[521515   6689]
 [ 11840   8083]]
Time spent for training: 745

GBT_RFC5
[[521510   6694]
 [ 11828   8095]]
Time spent for training: 745

GBT_GBM4
[[521506   6698]
 [ 11828   8095]]
Time spent for training: 747

GBT_XGB4
[[521515   6689]
 [ 11840   8083]]
Time spent for training: 747



In [25]:
valid_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBT_RFC1,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBT_RFC2,0.69694,0.982549,0.987321,0.977832,0.033788
0,GBT_RFC3,0.69697,0.982525,0.98727,0.977834,0.033833
0,GBT_RFC4,0.69652,0.98254,0.987336,0.977801,0.033804
0,GBT_RFC5,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBT_GBM4,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBT_XGB4,0.69652,0.98254,0.987336,0.977801,0.033804


#### <font color=purple>Generalization Between Train and Validation</font>

In [39]:
results_all = train_ensemble_results[train_ensemble_results.columns[1:]] - valid_ensemble_results[valid_ensemble_results.columns[1:]]
results_all['modelname'] = train_ensemble_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05278,0.005198,0.002086,0.008269,-0.009929,GBT_RFC2
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBT_RFC1
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBT_RFC5
0,0.05293,0.005207,0.002094,0.00828,-0.009947,GBT_GBM4
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBT_RFC4
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBT_XGB4
0,0.05303,0.005221,0.00212,0.008282,-0.009973,GBT_RFC3


### <font color=blue>Train Model: All Splits, XGB + GridCV </font>

#### <font color=purple>XGBOOST Parameter Tuning</font>

In [67]:
# Create Param Grid

param_rfc = {
         'bootstrap': [True, False],
         'max_depth': [3, 5, 7],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [100, 500, 1000]
        }

param_gbm = {
        'learning_rate': [.1, .5, .01],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

param_xgb = {
        'learning_rate': [.1, .5, .01],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

# Instatiate Esitmator Object
rfc = RandomForestClassifier()
gbm = GradientBoostingClassifier()
xgb = XGBClassifier()

# # Instatiate StratKFold Object
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=5, shuffle = True)

# Instatiate Random Search CV Object
rscv_rfc = RandomizedSearchCV(rfc, param_distributions=param_rfc, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

rscv_gbm = RandomizedSearchCV(gbm, param_distributions=param_gbm, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

rscv_xgb = RandomizedSearchCV(xgb, param_distributions=param_xgb, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

In [68]:
start = time.time()
GBMrfc = rscv_rfc.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 13.3min finished


Time spent for training: 913


In [69]:
start = time.time()
GBMgmb = rscv_gbm.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 38.5min finished


Time spent for training: 2584


In [70]:
start = time.time()
GBMxgb = rscv_xgb.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 16.4min finished


Time spent for training: 1312


In [71]:
# Dictionary of Models that were created
ensembles_created1 = {
                  'GBM_RFC_rscv' : GBMrfc,
                  'GBM_GBM_rscv' : GBMgmb,
                  'GBM_XGB_rscv' : GBMxgb}

## Optimized for AUC

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [72]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665629   7122]
 [  9382   9766]]

GBM_GBM_rscv
[[665630   7121]
 [  9385   9763]]

GBM_XGB_rscv
[[665630   7121]
 [  9385   9763]]



In [73]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74972,0.98775,0.989414,0.986101,0.023853
0,GBM_GBM_rscv,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBM_XGB_rscv,0.74964,0.987748,0.989415,0.986097,0.023856


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [74]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521506   6698]
 [ 11828   8095]]

GBM_GBM_rscv
[[521510   6694]
 [ 11828   8095]]

GBM_XGB_rscv
[[521510   6694]
 [ 11828   8095]]



In [75]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBM_GBM_rscv,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBM_XGB_rscv,0.69682,0.982547,0.987327,0.977823,0.033791


#### <font color=purple>Generalization Between Train and Validation</font>

In [76]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_GBM_rscv
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_XGB_rscv
0,0.0529,0.005207,0.002094,0.008278,-0.009946,GBM_RFC_rscv


## Optimized for Precision

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [56]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665630   7121]
 [  9385   9763]]

GBM_GBM_rscv
[[665615   7136]
 [  9377   9771]]

GBM_XGB_rscv
[[665632   7119]
 [  9391   9757]]



In [57]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBM_GBM_rscv,0.74984,0.987743,0.989393,0.986108,0.023866
0,GBM_XGB_rscv,0.74949,0.987745,0.989418,0.986088,0.023862


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [58]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521510   6694]
 [ 11828   8095]]

GBM_GBM_rscv
[[521485   6719]
 [ 11833   8090]]

GBM_XGB_rscv
[[521515   6689]
 [ 11840   8083]]



In [59]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBM_GBM_rscv,0.69667,0.982518,0.98728,0.977812,0.033846
0,GBM_XGB_rscv,0.69652,0.98254,0.987336,0.977801,0.033804


#### <font color=purple>Generalization Between Train and Validation</font>

In [60]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_RFC_rscv
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBM_XGB_rscv
0,0.05317,0.005224,0.002113,0.008295,-0.00998,GBM_GBM_rscv


## Optimized for Recall

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [56]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665629   7122]
 [  9382   9766]]

GBM_GBM_rscv
[[665526   7225]
 [  9342   9806]]

GBM_XGB_rscv
[[665630   7121]
 [  9385   9763]]



In [57]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74972,0.98775,0.989414,0.986101,0.023853
0,GBM_GBM_rscv,0.75069,0.987701,0.989261,0.986157,0.023944
0,GBM_XGB_rscv,0.74964,0.987748,0.989415,0.986097,0.023856


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [58]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521506   6698]
 [ 11828   8095]]

GBM_GBM_rscv
[[521372   6832]
 [ 11765   8158]]

GBM_XGB_rscv
[[521510   6694]
 [ 11828   8095]]



In [59]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBM_GBM_rscv,0.69827,0.982473,0.987066,0.977933,0.033928
0,GBM_XGB_rscv,0.69682,0.982547,0.987327,0.977823,0.033791


#### <font color=purple>Generalization Between Train and Validation</font>

In [60]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05242,0.005229,0.002195,0.008225,-0.009984,GBM_GBM_rscv
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_XGB_rscv
0,0.0529,0.005207,0.002094,0.008278,-0.009946,GBM_RFC_rscv
