# KKBox Customer Churn Prediction
### w/ BigQuery and Apache Spark

---

# Part III: <font color=green>*Model Creation and Evaluation*</font>
Please refer to the following article for a comprehensive review of the project: XXXXXX

---

In [1]:
# General Imports
from __future__ import absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Imports for PySpark
import findspark
findspark.init()
# import pyspark
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext

# # Imports for BigQuery connection
# import json
# import pprint
# import subprocess

# # Imports for GCP
# from google.cloud import bigquery
import time 
# import gcsfs

# Imports for Spark ML
from pyspark.ml.feature import (VectorAssembler,StandardScaler, OneHotEncoderEstimator, OneHotEncoder)
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, Evaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [3]:
## Dataproc Specs

# Jupyter Initialization: gs://srcd-dataproc/jupyter.sh 
# Components Installed: Anaconda and Jupyter
# Master Node:   x1 - 4 vCPU w/ 15 GB RAM each
# Workers Nodes: x5 - 4 vCPU w/ 15 GB RAM each
# Disk: 100GB

## Create Spark Session and Import Data

In [4]:
# Specify Google Credentials
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='D:\OneDrive\J-5\GitHub\Google Credentials.json'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.Builder().config(conf=SparkConf().setMaster("local[*]")).getOrCreate()

# Instantiate BigQuery magic
# %load_ext google.cloud.bigquery

In [3]:
# # If Working Locally on Computer, Importing Data Locally#

# # Import DRV_Jan2016 (Train Set) 
# DRV_Jan2016_1to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_1to1',inferSchema=True,header=True)
# DRV_Jan2016_3to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_3to1',inferSchema=True,header=True)
# DRV_Jan2016_5to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_5to1',inferSchema=True,header=True)
# DRV_Jan2016_7to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_7to1',inferSchema=True,header=True)
# DRV_Jan2016_9to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_9to1',inferSchema=True,header=True)
# DRV_Jan2016_11to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_11to1',inferSchema=True,header=True)
# DRV_Jan2016_13to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_13to1',inferSchema=True,header=True)

# DRV_Jan20160 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000000',inferSchema=True,header=True)
# DRV_Jan20161 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000001',inferSchema=True,header=True)
# DRV_Jan20162 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000002',inferSchema=True,header=True)

# DRV_Jan2016 = DRV_Jan20160.union(DRV_Jan20161)
# DRV_Jan2016 = DRV_Jan2016.union(DRV_Jan20162)

# DRV_Jan20160.unpersist()
# DRV_Jan20161.unpersist()
# DRV_Jan20162.unpersist()

# # Import DRV_Feb2016 (Validation Set) 
# DRV_Feb20160 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000000',inferSchema=True,header=True)
# DRV_Feb20161 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000001',inferSchema=True,header=True)
# DRV_Feb20162 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000002',inferSchema=True,header=True)

# DRV_Feb2016 = DRV_Feb20160.union(DRV_Feb20161)
# DRV_Feb2016 = DRV_Feb2016.union(DRV_Feb20162)

# DRV_Feb20160.unpersist()
# DRV_Feb20161.unpersist()
# DRV_Feb20162.unpersist()

DataFrame[msno: string, membership_expire_date: timestamp, payment_method_id: int, payment_plan_days: int, plan_list_price: int, net_paid_amount: int, is_net_paid_amount: string, is_auto_renew: int, is_cancel: int, city: int, bd: int, registered_via: int, registration_init_time: timestamp, membership_length: int, is_churn: int, total_songs: int, total_logins: int, total_secs: double, sum_num_unq: int, sum_num_repeat: int, sum_over_50pec: int, sum_over_75pec: int, sum_over_985pec: int, total_transactions: int, total_spent: int, avg_spent_trans: double, spent_per_logins: double, spent_per_secs: double, spent_per_song: double, spent_per_num_unq: double, spent_per_num_repeats: double, never_active_subscriber: int, total_spent_zero: int, city_agg: int, payment_method_agg: int, expire_last_login: int, total_cancelations: int, songs_last_7: int, songs_last_7_AVG: double, logins_last_7: int, logins_last_7_AVG: double, total_secs_last_7: double, total_secs_last_7_AVG: double, num_unq_last_7: in

In [None]:
# If Working on Dataproc Cloud ##

# Import DRV_Jan2016 (Train Set) 
DRV_Jan2016_1to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_1to1',inferSchema=True,header=True)
DRV_Jan2016_3to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_3to1',inferSchema=True,header=True)
DRV_Jan2016_5to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_5to1',inferSchema=True,header=True)
DRV_Jan2016_7to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_7to1',inferSchema=True,header=True)
DRV_Jan2016_9to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_9to1',inferSchema=True,header=True)
DRV_Jan2016_11to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_11to1',inferSchema=True,header=True)
DRV_Jan2016_13to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_13to1',inferSchema=True,header=True)

DRV_Jan20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000000',inferSchema=True,header=True)
DRV_Jan20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000001',inferSchema=True,header=True)
DRV_Jan20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000002',inferSchema=True,header=True)

DRV_Jan2016 = DRV_Jan20160.union(DRV_Jan20161)
DRV_Jan2016 = DRV_Jan2016.union(DRV_Jan20162)

DRV_Jan20160 = None
DRV_Jan20161 = None
DRV_Jan20162 = None

# Import DRV_Feb2016 (Validation Set) 
DRV_Feb20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000000',inferSchema=True,header=True)
DRV_Feb20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000001',inferSchema=True,header=True)
DRV_Feb20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000002',inferSchema=True,header=True)

DRV_Feb2016 = DRV_Feb20160.union(DRV_Feb20161)
DRV_Feb2016 = DRV_Feb2016.union(DRV_Feb20162)

DRV_Feb20160 = None
DRV_Feb20161 = None
DRV_Feb20162 = None

***Cast Correct Column Types on All Sets***

In [None]:
column_types_pd = [('msno', 'STRING'),
 ('membership_expire_date', 'DATE'),
 ('payment_method_id', 'INT64'),
 ('payment_plan_days', 'INT64'),
 ('plan_list_price', 'INT64'),
 ('net_paid_amount', 'INT64'),
 ('is_net_paid_amount', 'STRING'),
 ('is_auto_renew', 'INT64'),
 ('city', 'INT64'),
 ('bd', 'INT64'),
 ('registered_via', 'INT64'),
 ('registration_init_time', 'DATE'),
 ('membership_length', 'INT64'),
 ('is_churn', 'FLOAT64'),
 ('total_songs', 'INT64'),
 ('total_logins', 'INT64'),
 ('total_secs', 'FLOAT64'),
 ('sum_num_unq', 'INT64'),
 ('sum_num_repeat', 'INT64'),
 ('sum_over_50pec', 'INT64'),
 ('sum_over_75pec', 'INT64'),
 ('sum_over_985pec', 'INT64'),
 ('total_transactions', 'INT64'),
 ('total_spent', 'FLOAT64'),
 ('avg_spent_trans', 'FLOAT64'),
 ('spent_per_logins', 'FLOAT64'),
 ('spent_per_secs', 'FLOAT64'),
 ('spent_per_song', 'FLOAT64'),
 ('spent_per_num_unq', 'FLOAT64'),
 ('spent_per_num_repeats', 'FLOAT64'),
 ('never_active_subscriber', 'FLOAT64'),
 ('total_spent_zero', 'FLOAT64'),
 ('city_agg', 'INT64'),
 ('payment_method_agg', 'INT64'),
 ('songs_last_7', 'FLOAT64'),
 ('songs_last_7_AVG', 'FLOAT64'),
 ('logins_last_7', 'FLOAT64'),
 ('logins_last_7_AVG', 'FLOAT64'),
 ('total_secs_last_7', 'FLOAT64'),
 ('total_secs_last_7_AVG', 'FLOAT64'),
 ('num_unq_last_7', 'FLOAT64'),
 ('num_unq_last_7_AVG', 'FLOAT64'),
 ('num_repeat_last_7', 'FLOAT64'),
 ('num_repeat_last_7_AVG', 'FLOAT64'),
 ('over_50perc_last_7', 'FLOAT64'),
 ('over_50perc_last_7_AVG', 'FLOAT64'),
 ('over_75perc_last_7', 'FLOAT64'),
 ('over_75perc_last_7_AVG', 'FLOAT64'),
 ('over_985perc_last_7', 'FLOAT64'),
 ('over_985perc_last_7_AVG', 'FLOAT64'),
 ('songs_last_15', 'FLOAT64'),
 ('songs_last_15_AVG', 'FLOAT64'),
 ('logins_last_15', 'FLOAT64'),
 ('logins_last_15_AVG', 'FLOAT64'),
 ('total_secs_last_15', 'FLOAT64'),
 ('total_secs_last_15_AVG', 'FLOAT64'),
 ('num_unq_last_15', 'FLOAT64'),
 ('num_unq_last_15_AVG', 'FLOAT64'),
 ('num_repeat_last_15', 'FLOAT64'),
 ('num_repeat_last_15_AVG', 'FLOAT64'),
 ('over_50perc_last_15', 'FLOAT64'),
 ('over_50perc_last_15_AVG', 'FLOAT64'),
 ('over_75perc_last_15', 'FLOAT64'),
 ('over_75perc_last_15_AVG', 'FLOAT64'),
 ('over_985perc_last_15', 'FLOAT64'),
 ('over_985perc_last_15_AVG', 'FLOAT64'),
 ('songs_last_30', 'FLOAT64'),
 ('songs_last_30_AVG', 'FLOAT64'),
 ('logins_last_30', 'FLOAT64'),
 ('logins_last_30_AVG', 'FLOAT64'),
 ('total_secs_last_30', 'FLOAT64'),
 ('total_secs_last_30_AVG', 'FLOAT64'),
 ('num_unq_last_30', 'FLOAT64'),
 ('num_unq_last_30_AVG', 'FLOAT64'),
 ('num_repeat_last_30', 'FLOAT64'),
 ('num_repeat_last_30_AVG', 'FLOAT64'),
 ('over_50perc_last_30', 'FLOAT64'),
 ('over_50perc_last_30_AVG', 'FLOAT64'),
 ('over_75perc_last_30', 'FLOAT64'),
 ('over_75perc_last_30_AVG', 'FLOAT64'),
 ('over_985perc_last_30', 'FLOAT64'),
 ('over_985perc_last_30_AVG', 'FLOAT64'),
 ('songs_last_60', 'FLOAT64'),
 ('songs_last_60_AVG', 'FLOAT64'),
 ('logins_last_60', 'FLOAT64'),
 ('logins_last_60_AVG', 'FLOAT64'),
 ('total_secs_last_60', 'FLOAT64'),
 ('total_secs_last_60_AVG', 'FLOAT64'),
 ('num_unq_last_60', 'FLOAT64'),
 ('num_unq_last_60_AVG', 'FLOAT64'),
 ('num_repeat_last_60', 'FLOAT64'),
 ('num_repeat_last_60_AVG', 'FLOAT64'),
 ('over_50perc_last_60', 'FLOAT64'),
 ('over_50perc_last_60_AVG', 'FLOAT64'),
 ('over_75perc_last_60', 'FLOAT64'),
 ('over_75perc_last_60_AVG', 'FLOAT64'),
 ('over_985perc_last_60', 'FLOAT64'),
 ('over_985perc_last_60_AVG', 'FLOAT64'),
 ('songs_last_120', 'FLOAT64'),
 ('songs_last_120_AVG', 'FLOAT64'),
 ('logins_last_120', 'FLOAT64'),
 ('logins_last_120_AVG', 'FLOAT64'),
 ('total_secs_last_120', 'FLOAT64'),
 ('total_secs_last_120_AVG', 'FLOAT64'),
 ('num_unq_last_120', 'FLOAT64'),
 ('num_unq_last_120_AVG', 'FLOAT64'),
 ('num_repeat_last_120', 'FLOAT64'),
 ('num_repeat_last_120_AVG', 'FLOAT64'),
 ('over_50perc_last_120', 'FLOAT64'),
 ('over_50perc_last_120_AVG', 'FLOAT64'),
 ('over_75perc_last_120', 'FLOAT64'),
 ('over_75perc_last_120_AVG', 'FLOAT64'),
 ('over_985perc_last_120', 'FLOAT64'),
 ('over_985perc_last_120_AVG', 'FLOAT64'),
 ('SUM_unq_songs_0_15', 'FLOAT64'),
 ('AVG_unq_songs_0_15', 'FLOAT64'),
 ('SUM_songs_0_15', 'FLOAT64'),
 ('AVG_songs_0_15', 'FLOAT64'),
 ('SUM_secs_0_15', 'FLOAT64'),
 ('AVG_secs_0_15', 'FLOAT64'),
 ('SUM_songs50_0_15', 'FLOAT64'),
 ('AVG_songs50_0_15', 'FLOAT64'),
 ('SUM_logins_0_15', 'FLOAT64'),
 ('AVG_logins_0_15', 'FLOAT64'),
 ('SUM_repeats_0_15', 'FLOAT64'),
 ('AVG_repeats_0_15', 'FLOAT64'),
 ('SUM_unq_songs_15_30', 'FLOAT64'),
 ('AVG_unq_songs_15_30', 'FLOAT64'),
 ('SUM_songs_15_30', 'FLOAT64'),
 ('AVG_songs_15_30', 'FLOAT64'),
 ('SUM_secs_15_30', 'FLOAT64'),
 ('AVG_secs_15_30', 'FLOAT64'),
 ('SUM_songs50_15_30', 'FLOAT64'),
 ('AVG_songs50_15_30', 'FLOAT64'),
 ('SUM_logins_15_30', 'FLOAT64'),
 ('AVG_logins_15_30', 'FLOAT64'),
 ('SUM_repeats_15_30', 'FLOAT64'),
 ('AVG_repeats_15_30', 'FLOAT64'),
 ('SUM_unq_songs_30_45', 'FLOAT64'),
 ('AVG_unq_songs_30_45', 'FLOAT64'),
 ('SUM_songs_30_45', 'FLOAT64'),
 ('AVG_songs_30_45', 'FLOAT64'),
 ('SUM_secs_30_45', 'FLOAT64'),
 ('AVG_secs_30_45', 'FLOAT64'),
 ('SUM_songs50_30_45', 'FLOAT64'),
 ('AVG_songs50_30_45', 'FLOAT64'),
 ('SUM_logins_30_45', 'FLOAT64'),
 ('AVG_logins_30_45', 'FLOAT64'),
 ('SUM_repeats_30_45', 'FLOAT64'),
 ('AVG_repeats_30_45', 'FLOAT64'),
 ('SUM_unq_songs_45_60', 'FLOAT64'),
 ('AVG_unq_songs_45_60', 'FLOAT64'),
 ('SUM_songs_45_60', 'FLOAT64'),
 ('AVG_songs_45_60', 'FLOAT64'),
 ('SUM_secs_45_60', 'FLOAT64'),
 ('AVG_secs_45_60', 'FLOAT64'),
 ('SUM_songs50_45_60', 'FLOAT64'),
 ('AVG_songs50_45_60', 'FLOAT64'),
 ('SUM_logins_45_60', 'FLOAT64'),
 ('AVG_logins_45_60', 'FLOAT64'),
 ('SUM_repeats_45_60', 'FLOAT64'),
 ('AVG_repeats_45_60', 'FLOAT64'),
 ('DIFSUM_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_secs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_logins_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_logins_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_secs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_logins_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_logins_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_secs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_logins_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_logins_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_repeats_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_repeats_30_45_45_60', 'FLOAT64'),
 ('expire_last_login', 'INT64'),
 ('total_cancelations', 'INT64'),
 ('login_after_expire_10', 'INT64'),
 ('login_after_expire_20', 'INT64'),
 ('login_after_expire_30', 'INT64'),
 ('STD_unq_songs_0_15', 'FLOAT64'),
 ('STD_songs_0_15', 'FLOAT64'),
 ('STD_secs_0_15', 'FLOAT64'),
 ('STD_songs50_0_15', 'FLOAT64'),
 ('STD_repeats_0_15', 'FLOAT64'),
 ('STD_unq_songs_15_30', 'FLOAT64'),
 ('STD_songs_15_30', 'FLOAT64'),
 ('STD_secs_15_30', 'FLOAT64'),
 ('STD_songs50_15_30', 'FLOAT64'),
 ('STD_repeats_15_30', 'FLOAT64'),
 ('STD_unq_songs_30_45', 'FLOAT64'),
 ('STD_songs_30_45', 'FLOAT64'),
 ('STD_secs_30_45', 'FLOAT64'),
 ('STD_songs50_30_45', 'FLOAT64'),
 ('STD_repeats_30_45', 'FLOAT64'),
 ('STD_unq_songs_45_60', 'FLOAT64'),
 ('STD_songs_45_60', 'FLOAT64'),
 ('STD_secs_45_60', 'FLOAT64'),
 ('STD_songs50_45_60', 'FLOAT64'),
 ('STD_repeats_45_60', 'FLOAT64'),
 ('DIFSTD_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_repeats_30_45_45_60', 'FLOAT64'),
 ('is_cancel', 'INT64')]

In [None]:
from pyspark.sql.functions import expr

# Correctly Cast DRV_Feb2016
for feature, datatype in column_types_pd:
    if datatype == 'STRING':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS string)"))')
    if datatype == 'DATE':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS timestamp)"))')
    if datatype == 'INT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS integer)"))')
    if datatype == 'FLOAT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS double)"))')

## Model Pre-Processing
https://medium.com/@dhiraj.p.rai/essentials-of-feature-engineering-in-pyspark-part-i-76a57680a85

#### - <font color=blue>Split Feautres by Categorical or Continuous</font> -

In [4]:
# Create list of Categorical feature names
cat_feats = ['is_auto_renew', 'total_spent_zero', 'city_agg', 'payment_method_agg', 'never_active_subscriber']

In [5]:
# Create list of Continuous feature names
cont_feats = [x for x in DRV_Jan2016_1to1.columns if x not in cat_feats]
cont_feats.remove('msno')
cont_feats.remove('is_churn')
cont_feats.remove('membership_expire_date')
cont_feats.remove('registration_init_time')
cont_feats.remove('city')
cont_feats.remove('bd')
cont_feats.remove('payment_method_id')
cont_feats.remove('is_net_paid_amount')
cont_feats.remove('registered_via')

### - <font color=blue>Data Pre-Processing</font> -

#### <font color=purple>*Encode Categorical Variables*</font>

In [6]:
# Create a list of categorical, 'Vector' feature names
cat_feats_vec = ['is_auto_renew_vec', 'total_spent_zero_vec', 'city_agg_vec', 'payment_method_agg_vec', 'never_active_subscriber_vec']

In [7]:
# One Hot Encode

is_auto_renew_encoder = OneHotEncoder(inputCol='is_auto_renew',outputCol='is_auto_renew_vec')
total_spent_zero_encoder = OneHotEncoder(inputCol='total_spent_zero',outputCol='total_spent_zero_vec')
city_agg_encoder = OneHotEncoder(inputCol='city_agg',outputCol='city_agg_vec')
payment_method_agg_encoder = OneHotEncoder(inputCol='payment_method_agg',outputCol='payment_method_agg_vec')
never_active_subscriber_encoder = OneHotEncoder(inputCol='never_active_subscriber',outputCol='never_active_subscriber_vec')
# is_net_paid_amount_encoder = OneHotEncoder(inputCol='is_net_paid_amount',outputCol='is_net_paid_amount_vec')

# registered_via_encoder = OneHotEncoder(inputCol='registered_via',outputCol='registered_via_vec')


#### <font color=purple>*Vector Assembler*</font>

In [8]:
# Create master list of feature names for model
final_features = cont_feats + cat_feats_vec

In [9]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

#### <font color=purple>*Feature Scaling*</font>

In [10]:
# Scale all features into our final output features
scaler = StandardScaler(inputCol='features', 
                        outputCol='features_scaled',
                        withStd=True, withMean=False)

## Model Creation: Pipeline and Tuning

### - <font color=blue>Create Pipeline Object</font> -
https://spark.apache.org/docs/2.4.3/ml-pipeline.html

In [11]:
# Instatiate Model Estimators and Parameters
gbt = GBTClassifier(featuresCol='features_scaled',
                    labelCol='is_churn')

In [12]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

### - <font color=blue>Model Tuning</font> -
https://spark.apache.org/docs/2.4.3/ml-tuning.html

In [13]:
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_churn')

#### <font color=purple>Gradient Boosted Trees Parameter Tuning</font>

In [14]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

## Model Execution and Evaluation

### - <font color=blue>Train Model: All Features, All Splits</font> -

***Gradient Boosted Trees***

In [15]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to1 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 303


In [20]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to1 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 357


In [21]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to1 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 420


In [22]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to1 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 506


In [23]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to1 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 636


In [24]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to1 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 894


In [25]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to1 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 1082


#### <font color=purple>Evaluate Trained Model</font>

***Create Custom Evaluator***

In [26]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluator:

    def __init__(self, resultname, resultdata, model):
        from sklearn.metrics import confusion_matrix
        # Initialize variables
        self.resultPandas = resultdata[['is_churn', 'prediction']].toPandas()
        self.resultdata = resultdata 
        self.resultname = resultname
        self.model = model
        
        self.cm = confusion_matrix(self.resultPandas['is_churn'],self.resultPandas['prediction'])
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fn = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fp = self.cm[1][0]
        
    def evaluate(self):
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                labelCol='is_churn')
        AUC = my_eval.evaluate(self.resultdata)
        
        
        return pd.DataFrame(data=[[self.resultname, AUC, f1, precision, recall, error]], 
                            columns=['resultname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Evaluate Train Model: All Features, All Splits</font> -

***Transform Train Data on Trained Models***

In [27]:
# Dictionary of Models that were created
models_created = {
                  'gbt_model_1to1' : (gbt_model_1to1, DRV_Jan2016_1to1),
                  'gbt_model_3to1' : (gbt_model_3to1, DRV_Jan2016_3to1),
                  'gbt_model_5to1' : (gbt_model_5to1, DRV_Jan2016_5to1),
                  'gbt_model_7to1' : (gbt_model_7to1, DRV_Jan2016_7to1),
                  'gbt_model_9to1' : (gbt_model_9to1, DRV_Jan2016_9to1),
                  'gbt_model_11to1' : (gbt_model_11to1, DRV_Jan2016_11to1),
                  'gbt_model_13to1' : (gbt_model_13to1, DRV_Jan2016_13to1)}

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_resultsall = train_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_13to1
[[664719   8032]
 [  9165   9983]]
 
gbt_model_11to1
[[662295  10456]
 [  8295  10853]]
 
gbt_model_1to1
[[588901  83850]
 [  1946  17202]]
 
gbt_model_5to1
[[648674  24077]
 [  5662  13486]]
 
gbt_model_9to1
[[659427  13324]
 [  7553  11595]]
 
gbt_model_3to1
[[634113  38638]
 [  4040  15108]]
 
gbt_model_7to1
[[655981  16770]
 [  6932  12216]]
 


In [None]:
# Evaluate Training Model
train_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1,0.886867,0.932097,0.875363,0.996706,0.124001
0,gbt_model_3to1,0.86579,0.967439,0.942567,0.993669,0.061682
0,gbt_model_5to1,0.834257,0.977586,0.964211,0.991347,0.042982
0,gbt_model_7to1,0.806525,0.98225,0.975073,0.989543,0.034256
0,gbt_model_9to1,0.792871,0.984412,0.980195,0.988676,0.030173
0,gbt_model_11to1,0.775627,0.986037,0.984458,0.98763,0.027101
0,gbt_model_13to1,0.75471,0.987225,0.988061,0.9864,0.024855


### - <font color=blue>Evaluate Validation Model: All Features, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_resultsall = validation_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_13to1
[[520605   7599]
 [ 11662   8261]]
 
gbt_model_11to1
[[518491   9713]
 [ 10807   9116]]
 
gbt_model_1to1
[[451574  76630]
 [  2645  17278]]
 
gbt_model_5to1
[[507052  21152]
 [  7455  12468]]
 
gbt_model_9to1
[[516148  12056]
 [  9905  10018]]
 
gbt_model_3to1
[[492518  35686]
 [  5431  14492]]
 
gbt_model_7to1
[[512813  15391]
 [  9089  10834]]
 


In [None]:
# Evaluate Validation Model
validation_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_1to1,0.861081,0.919302,0.854923,0.994177,0.144629
0,gbt_model_3to1,0.82992,0.959926,0.932439,0.989093,0.075014
0,gbt_model_5to1,0.792882,0.97256,0.959955,0.98551,0.05219
0,gbt_model_7to1,0.757328,0.976683,0.970862,0.982585,0.044661
0,gbt_model_9to1,0.740006,0.979164,0.977175,0.981171,0.040066
0,gbt_model_11to1,0.719586,0.980591,0.981611,0.979582,0.037437
0,gbt_model_13to1,0.70013,0.981832,0.985614,0.97809,0.03514


We now currently have ~230 Features with the inclusion of our *Bi-Weekly Activity Block* and *Comparison of Bi-Weekly Activity Block Features*. Looking at the results above we can see that we have made some improvements on AUC. We also notice how Recall increased significantly over the Ratio'd subsets. The ratio'd subsets seemed to have helped address the issue of having a high amount of False Positives.

We can see that our models are still overfitting with the higher ratio'd models being the worst. However our Precision scores are better over our higher ratio'd models along with an improvement in model error and a slight decrease in Recall. Each of these models have there pros and cons and we might benefit from some sort of ensemble of these models. Before we do such a thing let's play around a bit with feature selection to see if we improve on the overall generalization between our Train and Validation models.

#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

#### <font color=purple>Model Evaluation: Feature Importance</font>

We will derive average feature importance scores for all features. Then we will produce a 5 Number Summary on these scores and group our features based on their scores against the following thresholds: Mean, 75th Percentile, 50th Percentile, and 25th Percentile.

In [28]:
# Display feature importances
importances = gbt_model_1to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column1 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_3to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column3 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_5to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column5 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_7to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column7 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_9to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column9 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_11to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column11 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = gbt_model_13to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column13 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

In [29]:
feature_imp = pd.merge(column1, column3 , on='Features')
feature_imp = pd.merge(feature_imp, column5, on='Features')
feature_imp = pd.merge(feature_imp, column7, on='Features')
feature_imp = pd.merge(feature_imp, column9, on='Features')
feature_imp = pd.merge(feature_imp, column11, on='Features')
feature_imp = pd.merge(feature_imp, column13, on='Features')

feature_imp['avg'] = feature_imp[list(feature_imp.columns[1:-1])].mean(axis=1)

In [30]:
# Show 5-Number Sumamry
feature_imp['avg'].describe()

count    230.000000
mean       0.004142
std        0.017953
min        0.000000
25%        0.000000
50%        0.000357
75%        0.001211
max        0.167032
Name: avg, dtype: float64

In [34]:
feature_imp[['Features','avg']].head(30)

Unnamed: 0,Features,avg
0,login_after_expire_30,0.130559
1,total_transactions,0.131067
2,is_auto_renew_vec,0.167032
3,login_after_expire_20,0.020181
4,is_cancel,0.039873
5,plan_list_price,0.073427
6,avg_spent_trans,0.035378
7,expire_last_login,0.039178
8,total_spent,0.045327
9,net_paid_amount,0.035028


In [None]:
mean_feats = feature_imp[feature_imp['avg'] > .004158]['Features'].tolist()
quart25_feats = feature_imp[feature_imp['avg'] > .000215]['Features'].tolist()
quart50_feats = feature_imp[feature_imp['avg'] > .000455]['Features'].tolist()
quart75_feats = feature_imp[feature_imp['avg'] > .001476]['Features'].tolist()

In [22]:
mean_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'is_cancel',
 'plan_list_price',
 'avg_spent_trans',
 'membership_length',
 'expire_last_login',
 'total_spent',
 'net_paid_amount',
 'SUM_logins_30_45',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'payment_method_agg_vec',
 'DIFAVG_logins_15_30_30_45',
 'total_logins',
 'SUM_songs50_0_15',
 'payment_plan_days',
 'num_unq_last_60']

In [23]:
quart25_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'is_cancel',
 'plan_list_price',
 'avg_spent_trans',
 'membership_length',
 'expire_last_login',
 'total_spent',
 'net_paid_amount',
 'SUM_logins_30_45',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'payment_method_agg_vec',
 'logins_last_120',
 'SUM_logins_45_60',
 'spent_per_song',
 'DIFAVG_logins_15_30_30_45',
 'songs_last_60',
 'total_logins',
 'SUM_secs_45_60',
 'num_unq_last_7',
 'STD_repeats_0_15',
 'SUM_songs50_0_15',
 'sum_over_50pec',
 'SUM_songs50_30_45',
 'SUM_songs_30_45',
 'over_985perc_last_120',
 'DIFAVG_unq_songs_0_15_15_30',
 'spent_per_secs',
 'over_50perc_last_7',
 'sum_over_75pec',
 'DIFSUM_songs_0_15_15_30',
 'payment_plan_days',
 'DIFSUM_logins_0_15_15_30',
 'spent_per_num_unq',
 'SUM_repeats_45_60',
 'total_secs_last_7',
 'STD_songs50_30_45',
 'STD_songs_45_60',
 'STD_repeats_30_45',
 'DIFSUM_secs_0_15_15_30',
 'total_secs_last_15',
 'DIFSTD_unq_songs_0_15_15_30',
 'spent_per_num_repeats',
 'STD_repeats_15_30',
 'STD_songs_15_30',
 'STD_secs_0_15',
 'num_unq_last_15',
 'SUM_repeats_15_30',
 'SUM_songs50_15_30',
 'DIFSUM_repeats_30_45_45_60',
 'over_75perc_last_120',
 'total_secs_last_120',
 'DIFSTD_repeats_15_30_30_45',
 'sum_num_unq',
 'num_repeat_last_7',
 'num_repeat_last_120',
 'STD_unq_songs_15_30',
 'DIFSUM_songs50_0_15_15_30',
 'STD_songs_0_15',
 'DIFAVG_unq_songs_15_30_30_45',
 'songs_last_15',
 'logins_last_30',
 'STD_secs_15_30',
 'songs_last_7',
 'SUM_unq_songs_15_30',
 'DIFSUM_unq_songs_15_30_30_45',
 'DIFAVG_logins_30_45_45_60',
 'DIFSTD_songs50_15_30_30_45',
 'DIFSUM_unq_songs_0_15_15_30',
 'sum_num_repeat',
 'num_unq_last_60',
 'total_secs',
 'DIFSTD_repeats_0_15_15_30',
 'logins_last_15',
 'DIFAVG_unq_songs_30_45_45_60',
 'DIFSUM_secs_15_30_30_45',
 'SUM_unq_songs_30_45',
 'DIFAVG_logins_0_15_15_30',
 'songs_last_30',
 'STD_unq_songs_0_15',
 'total_songs',
 'DIFSTD_songs50_0_15_15_30',
 'DIFSTD_songs_30_45_45_60',
 'DIFSUM_secs_30_45_45_60',
 'SUM_songs_45_60',
 'STD_unq_songs_45_60',
 'SUM_unq_songs_45_60',
 'SUM_repeats_30_45',
 'DIFSUM_logins_30_45_45_60',
 'DIFAVG_repeats_30_45_45_60',
 'DIFSTD_repeats_30_45_45_60',
 'SUM_songs50_45_60',
 'DIFSUM_repeats_0_15_15_30',
 'DIFSTD_unq_songs_15_30_30_45',
 'DIFAVG_songs_0_15_15_30',
 'sum_over_985pec',
 'DIFSTD_songs_15_30_30_45',
 'STD_repeats_45_60',
 'DIFAVG_songs50_15_30_30_45',
 'DIFSUM_logins_15_30_30_45',
 'DIFAVG_repeats_15_30_30_45',
 'DIFSUM_unq_songs_30_45_45_60',
 'STD_songs50_45_60',
 'SUM_secs_30_45',
 'over_75perc_last_30',
 'over_985perc_last_30',
 'over_75perc_last_7',
 'total_secs_last_60',
 'num_repeat_last_60',
 'over_50perc_last_60',
 'over_75perc_last_60',
 'over_75perc_last_60_AVG',
 'over_985perc_last_60',
 'songs_last_120',
 'num_repeat_last_15',
 'over_985perc_last_15',
 'total_secs_last_30',
 'num_unq_last_30',
 'SUM_songs_15_30',
 'over_985perc_last_7',
 'SUM_logins_15_30',
 'STD_unq_songs_30_45',
 'logins_last_7',
 'STD_songs50_0_15',
 'num_unq_last_120']

In [24]:
quart50_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'is_cancel',
 'plan_list_price',
 'avg_spent_trans',
 'membership_length',
 'expire_last_login',
 'total_spent',
 'net_paid_amount',
 'SUM_logins_30_45',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'payment_method_agg_vec',
 'logins_last_120',
 'SUM_logins_45_60',
 'spent_per_song',
 'DIFAVG_logins_15_30_30_45',
 'songs_last_60',
 'total_logins',
 'SUM_secs_45_60',
 'num_unq_last_7',
 'STD_repeats_0_15',
 'SUM_songs50_0_15',
 'sum_over_50pec',
 'SUM_songs50_30_45',
 'SUM_songs_30_45',
 'over_985perc_last_120',
 'DIFAVG_unq_songs_0_15_15_30',
 'spent_per_secs',
 'over_50perc_last_7',
 'sum_over_75pec',
 'DIFSUM_songs_0_15_15_30',
 'payment_plan_days',
 'DIFSUM_logins_0_15_15_30',
 'spent_per_num_unq',
 'SUM_repeats_45_60',
 'total_secs_last_7',
 'STD_songs50_30_45',
 'STD_repeats_30_45',
 'DIFSUM_secs_0_15_15_30',
 'total_secs_last_15',
 'DIFSTD_unq_songs_0_15_15_30',
 'spent_per_num_repeats',
 'STD_repeats_15_30',
 'STD_songs_15_30',
 'num_unq_last_15',
 'SUM_repeats_15_30',
 'SUM_songs50_15_30',
 'DIFSUM_repeats_30_45_45_60',
 'over_75perc_last_120',
 'total_secs_last_120',
 'sum_num_unq',
 'num_repeat_last_7',
 'num_repeat_last_120',
 'STD_unq_songs_15_30',
 'STD_songs_0_15',
 'songs_last_15',
 'logins_last_30',
 'songs_last_7',
 'DIFSUM_unq_songs_15_30_30_45',
 'DIFAVG_logins_30_45_45_60',
 'DIFSTD_songs50_15_30_30_45',
 'sum_num_repeat',
 'num_unq_last_60',
 'logins_last_15',
 'DIFSUM_secs_15_30_30_45',
 'SUM_unq_songs_30_45',
 'DIFAVG_logins_0_15_15_30',
 'songs_last_30',
 'STD_unq_songs_0_15',
 'total_songs',
 'DIFSUM_secs_30_45_45_60',
 'SUM_songs_45_60',
 'STD_unq_songs_45_60',
 'DIFSUM_logins_30_45_45_60',
 'SUM_songs50_45_60',
 'sum_over_985pec',
 'DIFSUM_logins_15_30_30_45',
 'STD_songs50_45_60',
 'total_secs_last_60',
 'over_50perc_last_60',
 'over_75perc_last_60',
 'over_75perc_last_60_AVG',
 'over_985perc_last_60',
 'songs_last_120',
 'over_985perc_last_15',
 'total_secs_last_30',
 'num_unq_last_30',
 'over_985perc_last_7',
 'logins_last_7',
 'STD_songs50_0_15',
 'num_unq_last_120']



In [25]:
quart75_feats = ['login_after_expire_30',
 'total_transactions',
 'is_auto_renew_vec',
 'login_after_expire_20',
 'is_cancel',
 'plan_list_price',
 'avg_spent_trans',
 'membership_length',
 'expire_last_login',
 'total_spent',
 'net_paid_amount',
 'SUM_logins_30_45',
 'spent_per_logins',
 'total_spent_zero_vec',
 'logins_last_60',
 'payment_method_agg_vec',
 'logins_last_120',
 'SUM_logins_45_60',
 'DIFAVG_logins_15_30_30_45',
 'songs_last_60',
 'total_logins',
 'num_unq_last_7',
 'SUM_songs50_0_15',
 'SUM_songs50_30_45',
 'over_985perc_last_120',
 'spent_per_secs',
 'over_50perc_last_7',
 'payment_plan_days',
 'DIFSUM_logins_0_15_15_30',
 'DIFSUM_secs_0_15_15_30',
 'num_unq_last_15',
 'total_secs_last_120',
 'sum_num_unq',
 'num_repeat_last_7',
 'logins_last_30',
 'DIFAVG_logins_30_45_45_60',
 'sum_num_repeat',
 'num_unq_last_60',
 'logins_last_15',
 'DIFAVG_logins_0_15_15_30',
 'DIFSUM_logins_15_30_30_45',
 'STD_songs50_45_60',
 'total_secs_last_60',
 'over_50perc_last_60',
 'over_75perc_last_60',
 'over_985perc_last_60',
 'songs_last_120',
 'total_secs_last_30']

--------------

### - <font color=blue>Train Model: Mean Features, All Splits</font> -

In [26]:
# Create master list of feature names for model
final_features = mean_feats

In [27]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [28]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [29]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to1mean = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 148.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to1mean = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 191.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to1mean = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 234.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to1mean = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 275.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to1mean = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 331.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to1mean = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to1mean = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Dictionary of Models that were created
models_createdmean = {
                  'gbt_model_1to1mean' : (gbt_model_1to1mean, DRV_Jan2016_1to1),
                  'gbt_model_3to1mean' : (gbt_model_3to1mean, DRV_Jan2016_3to1),
                  'gbt_model_5to1mean' : (gbt_model_5to1mean, DRV_Jan2016_5to1),
                  'gbt_model_7to1mean' : (gbt_model_7to1mean, DRV_Jan2016_7to1),
                  'gbt_model_9to1mean' : (gbt_model_9to1mean, DRV_Jan2016_9to1),
                  'gbt_model_11to1mean' : (gbt_model_11to1mean, DRV_Jan2016_11to1),
                  'gbt_model_13to1mean' : (gbt_model_13to1mean, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: Mean Features, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_resultsmean = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_createdmean.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_resultsmean = train_resultsmean.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_11to1mean
[[662201  10550]
 [  8337  10811]]
 
gbt_model_7to1mean
[[655106  17645]
 [  6870  12278]]
 
gbt_model_13to1mean
[[664998   7753]
 [  9255   9893]]
 
gbt_model_9to1mean
[[659104  13647]
 [  7541  11607]]
 
gbt_model_1to1mean
[[589475  83276]
 [  1970  17178]]
 
gbt_model_5to1mean
[[648101  24650]
 [  5480  13668]]
 
gbt_model_3to1mean
[[635987  36764]
 [  4121  15027]]
 


In [None]:
# Evaluate Training Model
train_resultsmean

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_11to1mean,0.77446,0.985935,0.984318,0.987567,0.027297
0,gbt_model_7to1mean,0.807494,0.981628,0.973772,0.989622,0.035431
0,gbt_model_13to1mean,0.752568,0.987368,0.988476,0.986274,0.024582
0,gbt_model_9to1mean,0.792944,0.984176,0.979715,0.988688,0.030623
0,gbt_model_1to1mean,0.886666,0.932564,0.876216,0.996669,0.123206
0,gbt_model_5to1mean,0.838584,0.977278,0.963359,0.991615,0.043547
0,gbt_model_3to1mean,0.865067,0.968853,0.945353,0.993562,0.059091


### - <font color=blue>Evaluate Validation Model: Mean Features, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_resultsmean = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_createdmean.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_resultsmean = validation_resultsmean.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_11to1mean
[[518298   9906]
 [ 10624   9299]]
 
gbt_model_7to1mean
[[512417  15787]
 [  9118  10805]]
 
gbt_model_13to1mean
[[520898   7306]
 [ 11670   8253]]
 
gbt_model_9to1mean
[[515886  12318]
 [  9875  10048]]
 
gbt_model_1to1mean
[[453390  74814]
 [  2671  17252]]
 
gbt_model_5to1mean
[[505300  22904]
 [  7417  12506]]
 
gbt_model_3to1mean
[[492577  35627]
 [  5401  14522]]
 


In [None]:
# Evaluate Validation Model
validation_resultsmean

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_11to1mean,0.723996,0.980574,0.981246,0.979914,0.037455
0,gbt_model_7to1mean,0.756225,0.97627,0.970112,0.982517,0.045437
0,gbt_model_13to1mean,0.700207,0.982106,0.986168,0.978087,0.03462
0,gbt_model_9to1mean,0.740511,0.978938,0.976679,0.981218,0.040489
0,gbt_model_1to1mean,0.862148,0.921271,0.858362,0.994143,0.141363
0,gbt_model_5to1mean,0.792177,0.970866,0.956638,0.985534,0.055317
0,gbt_model_3to1mean,0.830728,0.960014,0.932551,0.989154,0.074851


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_createdmean.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 75th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart75_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [None]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to175 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to175 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 205.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to175 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to175 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 299.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to175 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to175 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 395.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to175 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 470.0


In [None]:
# Dictionary of Models that were created
models_created75 = {
                  'gbt_model_1to175' : (gbt_model_1to175, DRV_Jan2016_1to1),
                  'gbt_model_3to175' : (gbt_model_3to175, DRV_Jan2016_3to1),
                  'gbt_model_5to175' : (gbt_model_5to175, DRV_Jan2016_5to1),
                  'gbt_model_7to175' : (gbt_model_7to175, DRV_Jan2016_7to1),
                  'gbt_model_9to175' : (gbt_model_9to175, DRV_Jan2016_9to1),
                  'gbt_model_11to175' : (gbt_model_11to175, DRV_Jan2016_11to1),
                  'gbt_model_13to175' : (gbt_model_13to175, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 75th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created75.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results75 = train_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_5to175
[[648695  24056]
 [  5683  13465]]
 
gbt_model_7to175
[[655732  17019]
 [  6945  12203]]
 
gbt_model_9to175
[[659269  13482]
 [  7549  11599]]
 
gbt_model_3to175
[[634692  38059]
 [  4058  15090]]
 
gbt_model_11to175
[[662256  10495]
 [  8315  10833]]
 
gbt_model_1to175
[[588327  84424]
 [  1939  17209]]
 
gbt_model_13to175
[[664766   7985]
 [  9132  10016]]
 


In [None]:
# Evaluate Training Model
train_results75

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_5to175,0.833724,0.977586,0.964242,0.991315,0.042982
0,gbt_model_7to175,0.806001,0.98205,0.974702,0.98952,0.034635
0,gbt_model_9to175,0.792858,0.984295,0.97996,0.988679,0.030396
0,gbt_model_3to175,0.86575,0.967881,0.943428,0.993647,0.060872
0,gbt_model_11to175,0.775075,0.985992,0.9844,0.9876,0.027186
0,gbt_model_1to175,0.886623,0.931617,0.874509,0.996715,0.12482
0,gbt_model_13to175,0.755607,0.987284,0.988131,0.986449,0.024739


### - <font color=blue>Evaluate Validation Model: 75th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created75.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results75 = validation_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_5to175
[[507259  20945]
 [  7433  12490]]
 
gbt_model_7to175
[[512699  15505]
 [  8898  11025]]
 
gbt_model_9to175
[[516019  12185]
 [  9771  10152]]
 
gbt_model_3to175
[[493293  34911]
 [  5351  14572]]
 
gbt_model_11to175
[[518486   9718]
 [ 10804   9119]]
 
gbt_model_1to175
[[452142  76062]
 [  2618  17305]]
 
gbt_model_13to175
[[520545   7659]
 [ 11558   8365]]
 


In [None]:
# Evaluate Validation Model
validation_results75

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_5to175,0.79363,0.972784,0.960347,0.985558,0.051773
0,gbt_model_7to175,0.762013,0.97675,0.970646,0.982941,0.044521
0,gbt_model_9to175,0.743247,0.979164,0.976931,0.981417,0.040056
0,gbt_model_3to175,0.832661,0.960786,0.933906,0.989269,0.073454
0,gbt_model_11to175,0.719657,0.980589,0.981602,0.979588,0.03744
0,gbt_model_1to175,0.862296,0.919951,0.855999,0.994243,0.143543
0,gbt_model_13to175,0.702683,0.981871,0.9855,0.978279,0.035059


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created75.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 50th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart50_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# # Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [None]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to150 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 210.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to150 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 270.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to150 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 326.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to150 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to150 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 443.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to150 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 502.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to150 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 590.0


In [None]:
# Dictionary of Models that were created
models_created50 = {
                  'gbt_model_1to150' : (gbt_model_1to150, DRV_Jan2016_1to1),
                  'gbt_model_3to150' : (gbt_model_3to150, DRV_Jan2016_3to1),
                  'gbt_model_5to150' : (gbt_model_5to150, DRV_Jan2016_5to1),
                  'gbt_model_7to150' : (gbt_model_7to150, DRV_Jan2016_7to1),
                  'gbt_model_9to150' : (gbt_model_9to150, DRV_Jan2016_9to1),
                  'gbt_model_11to150' : (gbt_model_11to150, DRV_Jan2016_11to1),
                  'gbt_model_13to150' : (gbt_model_13to150, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 50th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created50.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results50 = train_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_13to150
[[664759   7992]
 [  9131  10017]]
 
gbt_model_1to150
[[588108  84643]
 [  1952  17196]]
 
gbt_model_7to150
[[655927  16824]
 [  6976  12172]]
 
gbt_model_5to150
[[648417  24334]
 [  5592  13556]]
 
gbt_model_3to150
[[633643  39108]
 [  4031  15117]]
 
gbt_model_11to150
[[662568  10183]
 [  8403  10745]]
 
gbt_model_9to150
[[659596  13155]
 [  7632  11516]]
 


In [None]:
# Evaluate Training Model
train_results50

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_13to150,0.755628,0.98728,0.98812,0.98645,0.024748
0,gbt_model_1to150,0.886121,0.931422,0.874184,0.996692,0.125156
0,gbt_model_7to150,0.805336,0.982176,0.974992,0.989477,0.034398
0,gbt_model_5to150,0.835894,0.977439,0.963829,0.99145,0.043252
0,gbt_model_3to150,0.865675,0.967075,0.941869,0.993679,0.062349
0,gbt_model_11to150,0.773009,0.986163,0.984864,0.987476,0.026862
0,gbt_model_9to150,0.790933,0.984482,0.980446,0.988562,0.030043


### - <font color=blue>Evaluate Validation Model: 50th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created50.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results50 = validation_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_13to150
[[520637   7567]
 [ 11635   8288]]
 
gbt_model_1to150
[[452660  75544]
 [  2656  17267]]
 
gbt_model_7to150
[[512751  15453]
 [  8835  11088]]
 
gbt_model_5to150
[[506968  21236]
 [  7392  12531]]
 
gbt_model_3to150
[[492961  35243]
 [  5186  14737]]
 
gbt_model_11to150
[[518728   9476]
 [ 10834   9089]]
 
gbt_model_9to150
[[516233  11971]
 [  9945   9978]]
 


In [None]:
# Evaluate Validation Model
validation_results50

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_13to150,0.700838,0.981888,0.985674,0.978141,0.035032
0,gbt_model_1to150,0.861833,0.920485,0.85698,0.994167,0.142668
0,gbt_model_7to150,0.763643,0.976859,0.970744,0.983061,0.044311
0,gbt_model_5to150,0.794384,0.972536,0.959796,0.985629,0.052229
0,gbt_model_3to150,0.836488,0.960604,0.933278,0.989589,0.073758
0,gbt_model_11to150,0.719133,0.980794,0.98206,0.979542,0.037053
0,gbt_model_9to150,0.739082,0.979209,0.977336,0.9811,0.039983


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created50.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

### - <font color=blue>Train Model: 25th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart25_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# # Create pipeline objects
gbt_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,gbt])

In [None]:
# Parameter Grid for Gradient Boosted Trees Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20]) \
    .addGrid(gbt.maxDepth, [5]) \
    .addGrid(gbt.minInstancesPerNode, [20]) \
    .build()

# Instantiate Cross Validation block
gbt_cv = CrossValidator(estimator=gbt_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=5)

***Gradient Boosted Trees***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
gbt_model_1to125 = gbt_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 251.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
gbt_model_3to125 = gbt_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
gbt_model_5to125 = gbt_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 381.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
gbt_model_7to125 = gbt_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 448.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
gbt_model_9to125 = gbt_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 514.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
gbt_model_11to125 = gbt_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 577.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
gbt_model_13to125 = gbt_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 682.0


In [None]:
# Dictionary of Models that were created
models_created25 = {
                  'gbt_model_1to125' : (gbt_model_1to125, DRV_Jan2016_1to1),
                  'gbt_model_3to125' : (gbt_model_3to125, DRV_Jan2016_3to1),
                  'gbt_model_5to125' : (gbt_model_5to125, DRV_Jan2016_5to1),
                  'gbt_model_7to125' : (gbt_model_7to125, DRV_Jan2016_7to1),
                  'gbt_model_9to125' : (gbt_model_9to125, DRV_Jan2016_9to1),
                  'gbt_model_11to125' : (gbt_model_11to125, DRV_Jan2016_11to1),
                  'gbt_model_13to125' : (gbt_model_13to125, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 25th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created25.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results25 = train_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_3to125
[[633621  39130]
 [  4067  15081]]
 
gbt_model_13to125
[[664409   8342]
 [  8972  10176]]
 
gbt_model_7to125
[[655931  16820]
 [  6952  12196]]
 
gbt_model_5to125
[[648495  24256]
 [  5592  13556]]
 
gbt_model_9to125
[[659540  13211]
 [  7607  11541]]
 
gbt_model_1to125
[[588772  83979]
 [  1936  17212]]
 
gbt_model_11to125
[[662347  10404]
 [  8352  10796]]
 


In [None]:
# Evaluate Training Model
train_results25

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_3to125,0.864719,0.967031,0.941836,0.993622,0.062433
0,gbt_model_13to125,0.75952,0.987133,0.9876,0.986676,0.025024
0,gbt_model_7to125,0.805966,0.982197,0.974998,0.989512,0.034358
0,gbt_model_5to125,0.835952,0.977499,0.963945,0.991451,0.043139
0,gbt_model_9to125,0.791544,0.984458,0.980363,0.988598,0.030088
0,gbt_model_1to125,0.887032,0.931995,0.875171,0.996723,0.124173
0,gbt_model_11to125,0.774177,0.986034,0.984535,0.987547,0.027108


### - <font color=blue>Evaluate Validation Model: 25th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created25.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results25 = validation_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

gbt_model_3to125
[[494805  33399]
 [  5286  14637]]
 
gbt_model_13to125
[[520345   7859]
 [ 11482   8441]]
 
gbt_model_7to125
[[512863  15341]
 [  9144  10779]]
 
gbt_model_5to125
[[506987  21217]
 [  7434  12489]]
 
gbt_model_9to125
[[516235  11969]
 [ 10015   9908]]
 
gbt_model_1to125
[[450871  77333]
 [  2593  17330]]
 
gbt_model_11to125
[[518574   9630]
 [ 10774   9149]]
 


In [None]:
validation_results25

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,gbt_model_3to125,0.835724,0.962374,0.936769,0.98943,0.070577
0,gbt_model_13to125,0.704401,0.981749,0.985121,0.97841,0.035286
0,gbt_model_7to125,0.755995,0.976681,0.970956,0.982483,0.04467
0,gbt_model_5to125,0.793348,0.972515,0.959832,0.985549,0.052271
0,gbt_model_9to125,0.737327,0.979146,0.97734,0.980969,0.040107
0,gbt_model_1to125,0.861721,0.918576,0.853593,0.994282,0.145817
0,gbt_model_11to125,0.720493,0.980701,0.981768,0.979647,0.037225


#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created50.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

------

## <font color=black>Evaluation of Generalization over All Models</font>

In [None]:
results_all = train_resultsall[train_resultsall.columns[1:]] - validation_resultsall[validation_resultsall.columns[1:]]
results_all['resultname'] = train_resultsall['resultname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.025785,0.012795,0.020439,0.00253,-0.020628,gbt_model_1to1
0,0.03587,0.007513,0.010128,0.004576,-0.013331,gbt_model_3to1
0,0.041375,0.005026,0.004256,0.005837,-0.009209,gbt_model_5to1
0,0.049198,0.005566,0.004211,0.006958,-0.010405,gbt_model_7to1
0,0.052865,0.005248,0.003019,0.007505,-0.009892,gbt_model_9to1
0,0.05458,0.005392,0.002447,0.00831,-0.010285,gbt_model_13to1
0,0.05604,0.005446,0.002847,0.008048,-0.010336,gbt_model_11to1


In [None]:
results_mean = train_resultsmean[train_resultsmean.columns[1:]] - validation_resultsmean[validation_resultsmean.columns[1:]]
results_mean['resultname'] = train_resultsmean['resultname']
results_mean.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.024519,0.011293,0.017854,0.002526,-0.018157,gbt_model_1to1mean
0,0.034339,0.008839,0.012802,0.004408,-0.01576,gbt_model_3to1mean
0,0.046406,0.006412,0.006721,0.006081,-0.011771,gbt_model_5to1mean
0,0.050464,0.00536,0.003072,0.007653,-0.010157,gbt_model_11to1mean
0,0.051269,0.005358,0.00366,0.007105,-0.010005,gbt_model_7to1mean
0,0.052361,0.005262,0.002307,0.008186,-0.010038,gbt_model_13to1mean
0,0.052433,0.005238,0.003035,0.00747,-0.009866,gbt_model_9to1mean


In [None]:
results_75 = train_results75[validation_results25.columns[1:]] - validation_results75[validation_results25.columns[1:]]
results_75['resultname'] = train_results75['resultname']
results_75.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.024326,0.011665,0.01851,0.002472,-0.018723,gbt_model_1to175
0,0.033089,0.007096,0.009522,0.004378,-0.012582,gbt_model_3to175
0,0.040094,0.004802,0.003896,0.005757,-0.008791,gbt_model_5to175
0,0.043987,0.005301,0.004057,0.006579,-0.009886,gbt_model_7to175
0,0.049611,0.005131,0.003029,0.007263,-0.00966,gbt_model_9to175
0,0.052924,0.005413,0.002631,0.00817,-0.01032,gbt_model_13to175
0,0.055418,0.005404,0.002798,0.008012,-0.010254,gbt_model_11to175


In [None]:
results_50 = train_results50[train_results50.columns[1:]] - validation_results50[validation_results50.columns[1:]]
results_50['resultname'] = train_results50['resultname']
results_50.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.024287,0.010937,0.017204,0.002525,-0.017512,gbt_model_1to150
0,0.029187,0.006471,0.008591,0.004089,-0.01141,gbt_model_3to150
0,0.04151,0.004904,0.004033,0.005821,-0.008977,gbt_model_5to150
0,0.041693,0.005317,0.004248,0.006415,-0.009913,gbt_model_7to150
0,0.051851,0.005273,0.00311,0.007462,-0.00994,gbt_model_9to150
0,0.053876,0.005369,0.002804,0.007935,-0.010191,gbt_model_11to150
0,0.05479,0.005392,0.002446,0.008309,-0.010284,gbt_model_13to150


In [None]:
results_25 = train_results25[validation_results25.columns[1:]] - validation_results25[validation_results25.columns[1:]]
results_25['resultname'] = train_results25['resultname']
results_25.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.025311,0.013419,0.021578,0.002441,-0.021644,gbt_model_1to125
0,0.028995,0.004657,0.005067,0.004192,-0.008144,gbt_model_3to125
0,0.042604,0.004984,0.004113,0.005902,-0.009131,gbt_model_5to125
0,0.049971,0.005516,0.004042,0.007029,-0.010313,gbt_model_7to125
0,0.053684,0.005333,0.002767,0.007901,-0.010117,gbt_model_11to125
0,0.054217,0.005312,0.003023,0.007629,-0.010019,gbt_model_9to125
0,0.055119,0.005384,0.002479,0.008266,-0.010262,gbt_model_13to125


Based on generalization, the best in class features were as follows.
- **gbt_model_1to175** - 0.024326
- **gbt_model_3to125** - 0.028995
- **gbt_model_5to175** - 0.040094
- **gbt_model_7to150** - 0.041693
- **gbt_model_9to175** - 0.049611
- **gbt_model_11to1mean** - 0.050464
- **gbt_model_13to1mean** - 0.052361

-------------

-------------

## Recursive Feature Elimination - Ensemble of Models

In [49]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

***Create Custom Classifier***

In [50]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluatorPandas:

    def __init__(self, modelname, model, y_pred, y_true):
        
        # Initialize variables
        self.modelname = modelname
        self.y_pred = y_pred 
        self.y_true = y_true
        self.model = model
        
        # Calculate confusion matrix
        from sklearn.metrics import confusion_matrix
        self.cm = confusion_matrix(y_true,y_pred)
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fp = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fn = self.cm[1][0]
        
    def evaluate(self):
        
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        from sklearn.metrics import confusion_matrix, roc_curve, auc
        false_positive_rate, true_positive_rate, thresholds = roc_curve(self.y_true, self.y_pred)
        AUC = round(auc(false_positive_rate, true_positive_rate), ndigits=5)
        
        return pd.DataFrame(data=[[self.modelname, AUC, f1, precision, recall, error]], 
                            columns=['modelname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Import Data</font> -

#### <font color=purple>Build Ensemble - Train Set</font>

In [None]:
# Return Probability Values as Spark DF
gbt_model_1to1_train = gbt_model_1to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_3to1_train = gbt_model_3to125.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_5to1_train = gbt_model_5to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_7to1_train = gbt_model_7to150.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_9to1_train = gbt_model_9to175.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_11to1_train = gbt_model_11to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_13to1_train = gbt_model_13to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()

In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Jan2016_predictsgbt = pd.merge(gbt_model_1to1_train[['msno','prediction']], gbt_model_3to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_5to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_7to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_9to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_11to1_train[['msno','prediction']], on='msno')
Jan2016_predictsgbt = pd.merge(Jan2016_predictsgbt, gbt_model_13to1_train[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Jan2016_predictsgbt.columns = ['msno', 'gbt_model_1to175', 'gbt_model_3to150', 'gbt_model_5to175', 'gbt_model_7to150', 'gbt_model_9to175', 'gbt_model_11to1mean', 'gbt_model_13to175', 'is_churn']

In [None]:
Jan2016_predictsgbt.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Jan2016_predictsgbt)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Jan2016_predictsgbt')

#### <font color=purple>Build Ensemble - Validation Set</font>

In [None]:
# Return Prediction Values as Spark DF
gbt_model_1to1_valid = gbt_model_1to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_3to1_valid = gbt_model_3to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_5to1_valid = gbt_model_5to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_7to1_valid = gbt_model_7to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_9to1_valid = gbt_model_9to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_11to1_valid = gbt_model_11to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
gbt_model_13to1_valid = gbt_model_13to175.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()


In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Feb2016_predictsgbt = pd.merge(gbt_model_1to1_valid[['msno','prediction']], gbt_model_3to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_5to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_7to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_9to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_11to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsgbt = pd.merge(Feb2016_predictsgbt, gbt_model_13to1_valid[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Feb2016_predictsgbt.columns = ['msno', 'gbt_model_1to175', 'gbt_model_3to150', 'gbt_model_5to175', 'gbt_model_7to150', 'gbt_model_9to175', 'gbt_model_11to1mean', 'gbt_model_13to175', 'is_churn']

In [None]:
Feb2016_predictsgbt.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Feb2016_predictsgbt)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Feb2016_predictsgbt')

#### <font color=purple>Import Ensemble Sets (if already built)</font>

In [34]:
Jan2016_predictsgbt = pd.read_csv('D:\J-5 Local\Jan2016_predictsgbt.csv')
Feb2016_predictsgbt = pd.read_csv('D:\J-5 Local\Feb2016_predictsgbt.csv')

In [35]:
# Instantiate Train x and y
train_x = Jan2016_predictsgbt[Jan2016_predictsgbt.columns[1:-1]]
train_y = Jan2016_predictsgbt['is_churn']

In [36]:
# Instantiate Validation x and y
valid_x = Feb2016_predictsgbt[Feb2016_predictsgbt.columns[1:-1]]
valid_y = Feb2016_predictsgbt['is_churn']

### <font color=blue>Train Model: All Splits, All Splits, XGB + RFECV</font>

In [38]:
# Instantiate Estimators
rfc = RandomForestClassifier()
gbm = GradientBoostingClassifier()
xgb = XGBClassifier()

In [11]:
start = time.time()
gbtrfc1 = RFECV(rfc, min_features_to_select=1, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 78


In [12]:
start = time.time()
gbtrfc2 = RFECV(rfc, min_features_to_select=2, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 70


In [13]:
start = time.time()
gbtrfc3 = RFECV(rfc, min_features_to_select=3, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 60


In [14]:
start = time.time()
gbtrfc4 = RFECV(rfc, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 50


In [15]:
start = time.time()
gbtrfc5 = RFECV(rfc, min_features_to_select=5, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 39


In [18]:
start = time.time()
gbtgmb4 = RFECV(gbm, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 1043


In [19]:
start = time.time()
gbtxgb4 = RFECV(xgb, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 718


In [20]:
# Dictionary of Models that were createdgg
ensembles_created = {
                  'GBT_RFC1' : gbtrfc1,
                  'GBT_RFC2' : gbtrfc2,
                  'GBT_RFC3' : gbtrfc3,
                  'GBT_RFC4' : gbtrfc4,
                  'GBT_RFC5' : gbtrfc5,
                  'GBT_GBM4' : gbtgmb4,
                  'GBT_XGB4' : gbtxgb4}

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + RFECV </font> -

In [22]:
# Train Model Results
train_ensemble_results = pd.DataFrame()

for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_ensemble_results = train_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

GBT_RFC1
[[665630   7121]
 [  9385   9763]]
Time spent for training: 736

GBT_RFC2
[[665625   7126]
 [  9382   9766]]
Time spent for training: 737

GBT_RFC3
[[665613   7138]
 [  9371   9777]]
Time spent for training: 738

GBT_RFC4
[[665632   7119]
 [  9391   9757]]
Time spent for training: 738

GBT_RFC5
[[665630   7121]
 [  9385   9763]]
Time spent for training: 739

GBT_GBM4
[[665629   7122]
 [  9381   9767]]
Time spent for training: 741

GBT_XGB4
[[665632   7119]
 [  9391   9757]]
Time spent for training: 742



In [23]:
train_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBT_RFC1,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBT_RFC2,0.74972,0.987747,0.989408,0.986101,0.023859
0,GBT_RFC3,0.75,0.987746,0.98939,0.986117,0.02386
0,GBT_RFC4,0.74949,0.987745,0.989418,0.986088,0.023862
0,GBT_RFC5,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBT_GBM4,0.74975,0.98775,0.989414,0.986102,0.023852
0,GBT_XGB4,0.74949,0.987745,0.989418,0.986088,0.023862


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + RFECV </font> -

In [24]:
# Validation Model Results
valid_ensemble_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    valid_ensemble_results = valid_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

GBT_RFC1
[[521510   6694]
 [ 11828   8095]]
Time spent for training: 743

GBT_RFC2
[[521507   6697]
 [ 11823   8100]]
Time spent for training: 743

GBT_RFC3
[[521480   6724]
 [ 11821   8102]]
Time spent for training: 744

GBT_RFC4
[[521515   6689]
 [ 11840   8083]]
Time spent for training: 745

GBT_RFC5
[[521510   6694]
 [ 11828   8095]]
Time spent for training: 745

GBT_GBM4
[[521506   6698]
 [ 11828   8095]]
Time spent for training: 747

GBT_XGB4
[[521515   6689]
 [ 11840   8083]]
Time spent for training: 747



In [25]:
valid_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBT_RFC1,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBT_RFC2,0.69694,0.982549,0.987321,0.977832,0.033788
0,GBT_RFC3,0.69697,0.982525,0.98727,0.977834,0.033833
0,GBT_RFC4,0.69652,0.98254,0.987336,0.977801,0.033804
0,GBT_RFC5,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBT_GBM4,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBT_XGB4,0.69652,0.98254,0.987336,0.977801,0.033804


#### <font color=purple>Generalization Between Train and Validation</font>

In [39]:
results_all = train_ensemble_results[train_ensemble_results.columns[1:]] - valid_ensemble_results[valid_ensemble_results.columns[1:]]
results_all['modelname'] = train_ensemble_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05278,0.005198,0.002086,0.008269,-0.009929,GBT_RFC2
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBT_RFC1
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBT_RFC5
0,0.05293,0.005207,0.002094,0.00828,-0.009947,GBT_GBM4
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBT_RFC4
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBT_XGB4
0,0.05303,0.005221,0.00212,0.008282,-0.009973,GBT_RFC3


### <font color=blue>Train Model: All Splits, XGB + GridCV </font>

#### <font color=purple>XGBOOST Parameter Tuning</font>

In [67]:
# Create Param Grid

param_rfc = {
         'bootstrap': [True, False],
         'max_depth': [3, 5, 7],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [100, 500, 1000]
        }

param_gbm = {
        'learning_rate': [.1, .5, .01],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

param_xgb = {
        'learning_rate': [.1, .5, .01],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

# Instatiate Esitmator Object
rfc = RandomForestClassifier()
gbm = GradientBoostingClassifier()
xgb = XGBClassifier()

# # Instatiate StratKFold Object
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=5, shuffle = True)

# Instatiate Random Search CV Object
rscv_rfc = RandomizedSearchCV(rfc, param_distributions=param_rfc, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

rscv_gbm = RandomizedSearchCV(gbm, param_distributions=param_gbm, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

rscv_xgb = RandomizedSearchCV(xgb, param_distributions=param_xgb, n_iter=5, scoring='roc_auc', 
                                   n_jobs=4, cv=5, verbose=3)

In [68]:
start = time.time()
GBMrfc = rscv_rfc.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 13.3min finished


Time spent for training: 913


In [69]:
start = time.time()
GBMgmb = rscv_gbm.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 38.5min finished


Time spent for training: 2584


In [70]:
start = time.time()
GBMxgb = rscv_xgb.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 16.4min finished


Time spent for training: 1312


In [71]:
# Dictionary of Models that were created
ensembles_created1 = {
                  'GBM_RFC_rscv' : GBMrfc,
                  'GBM_GBM_rscv' : GBMgmb,
                  'GBM_XGB_rscv' : GBMxgb}

## Optimized for AUC

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [72]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665629   7122]
 [  9382   9766]]

GBM_GBM_rscv
[[665630   7121]
 [  9385   9763]]

GBM_XGB_rscv
[[665630   7121]
 [  9385   9763]]



In [73]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74972,0.98775,0.989414,0.986101,0.023853
0,GBM_GBM_rscv,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBM_XGB_rscv,0.74964,0.987748,0.989415,0.986097,0.023856


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [74]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521506   6698]
 [ 11828   8095]]

GBM_GBM_rscv
[[521510   6694]
 [ 11828   8095]]

GBM_XGB_rscv
[[521510   6694]
 [ 11828   8095]]



In [75]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBM_GBM_rscv,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBM_XGB_rscv,0.69682,0.982547,0.987327,0.977823,0.033791


#### <font color=purple>Generalization Between Train and Validation</font>

In [76]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_GBM_rscv
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_XGB_rscv
0,0.0529,0.005207,0.002094,0.008278,-0.009946,GBM_RFC_rscv


## Optimized for Precision

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [56]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665630   7121]
 [  9385   9763]]

GBM_GBM_rscv
[[665615   7136]
 [  9377   9771]]

GBM_XGB_rscv
[[665632   7119]
 [  9391   9757]]



In [57]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74964,0.987748,0.989415,0.986097,0.023856
0,GBM_GBM_rscv,0.74984,0.987743,0.989393,0.986108,0.023866
0,GBM_XGB_rscv,0.74949,0.987745,0.989418,0.986088,0.023862


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [58]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521510   6694]
 [ 11828   8095]]

GBM_GBM_rscv
[[521485   6719]
 [ 11833   8090]]

GBM_XGB_rscv
[[521515   6689]
 [ 11840   8083]]



In [59]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982547,0.987327,0.977823,0.033791
0,GBM_GBM_rscv,0.69667,0.982518,0.98728,0.977812,0.033846
0,GBM_XGB_rscv,0.69652,0.98254,0.987336,0.977801,0.033804


#### <font color=purple>Generalization Between Train and Validation</font>

In [60]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_RFC_rscv
0,0.05297,0.005205,0.002082,0.008287,-0.009942,GBM_XGB_rscv
0,0.05317,0.005224,0.002113,0.008295,-0.00998,GBM_GBM_rscv


## Optimized for Recall

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [56]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[665629   7122]
 [  9382   9766]]

GBM_GBM_rscv
[[665526   7225]
 [  9342   9806]]

GBM_XGB_rscv
[[665630   7121]
 [  9385   9763]]



In [57]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.74972,0.98775,0.989414,0.986101,0.023853
0,GBM_GBM_rscv,0.75069,0.987701,0.989261,0.986157,0.023944
0,GBM_XGB_rscv,0.74964,0.987748,0.989415,0.986097,0.023856


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [58]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

GBM_RFC_rscv
[[521506   6698]
 [ 11828   8095]]

GBM_GBM_rscv
[[521372   6832]
 [ 11765   8158]]

GBM_XGB_rscv
[[521510   6694]
 [ 11828   8095]]



In [59]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,GBM_RFC_rscv,0.69682,0.982543,0.987319,0.977823,0.033799
0,GBM_GBM_rscv,0.69827,0.982473,0.987066,0.977933,0.033928
0,GBM_XGB_rscv,0.69682,0.982547,0.987327,0.977823,0.033791


#### <font color=purple>Generalization Between Train and Validation</font>

In [60]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05242,0.005229,0.002195,0.008225,-0.009984,GBM_GBM_rscv
0,0.05282,0.005201,0.002088,0.008274,-0.009935,GBM_XGB_rscv
0,0.0529,0.005207,0.002094,0.008278,-0.009946,GBM_RFC_rscv
