# KKBox Customer Churn Prediction
### w/ BigQuery and Apache Spark

---

# Part III: <font color=green>*RFC - Model Creation and Evaluation*</font>
Please refer to the following article for a comprehensive review of the project:

https://medium.com/@dangoml/1-churn-happiness-customer-centric-approach-to-retention-fd1d85464e45

---

In [3]:
# General Imports
from __future__ import absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import time 

# Imports for PySpark
import findspark
findspark.init()
# import pyspark
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext

# # Imports for BigQuery connection
# import json
# import pprint
# import subprocess

# # Imports for GCP
# from google.cloud import bigquery
# import gcsfs
import gc


# Imports for Spark ML
from pyspark.ml.feature import (VectorAssembler,StandardScaler, OneHotEncoderEstimator, OneHotEncoder)
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, Evaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
## Dataproc Specs

# Jupyter Initialization: gs://srcd-dataproc/jupyter.sh 
# Components Installed: Anaconda and Jupyter
# Master Node:   x1 - 4 vCPU w/ 15 GB RAM each
# Workers Nodes: x5 - 4 vCPU w/ 15 GB RAM each
# Disk: 100GB

## Create Spark Session and Import Data

In [3]:
# Specify Google Credentials
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='Google Credentials.json'

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('KKBox-Churn1').getOrCreate()

# Instantiate BigQuery magic
# %load_ext google.cloud.bigquery

In [None]:
# # If Working Locally on Computer, Importing Data Locally#

# # Import DRV_Jan2016 (Train Set) 
# DRV_Jan2016_1to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_1to1',inferSchema=True,header=True)
# DRV_Jan2016_3to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_3to1',inferSchema=True,header=True)
# DRV_Jan2016_5to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_5to1',inferSchema=True,header=True)
# DRV_Jan2016_7to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_7to1',inferSchema=True,header=True)
# DRV_Jan2016_9to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_9to1',inferSchema=True,header=True)
# DRV_Jan2016_11to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_11to1',inferSchema=True,header=True)
# DRV_Jan2016_13to1 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016_13to1',inferSchema=True,header=True)

# DRV_Jan20160 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000000',inferSchema=True,header=True)
# DRV_Jan20161 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000001',inferSchema=True,header=True)
# DRV_Jan20162 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Jan2016000000000002',inferSchema=True,header=True)

# DRV_Jan2016 = DRV_Jan20160.union(DRV_Jan20161)
# DRV_Jan2016 = DRV_Jan2016.union(DRV_Jan20162)

# DRV_Jan20160.unpersist()
# DRV_Jan20161.unpersist()
# DRV_Jan20162.unpersist()

# # Import DRV_Feb2016 (Validation Set) 
# DRV_Feb20160 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000000',inferSchema=True,header=True)
# DRV_Feb20161 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000001',inferSchema=True,header=True)
# DRV_Feb20162 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Feb2016000000000002',inferSchema=True,header=True)

# DRV_Feb2016 = DRV_Feb20160.union(DRV_Feb20161)
# DRV_Feb2016 = DRV_Feb2016.union(DRV_Feb20162)

# DRV_Feb20160.unpersist()
# DRV_Feb20161.unpersist()
# DRV_Feb20162.unpersist()

# # Import DRV_Feb2016 (Validation Set) 
# DRV_Mar20160 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Mar2016000000000000',inferSchema=True,header=True)
# DRV_Mar20161 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Mar2016000000000001',inferSchema=True,header=True)
# DRV_Mar20162 = spark.read.csv('D:\J-5 Local\Datasets_KKBox User Data_Monthly Datasets_DRV_Mar2016000000000002',inferSchema=True,header=True)

# DRV_Mar2016 = DRV_Mar20160.union(DRV_Mar20161)
# DRV_Mar2016 = DRV_Mar2016.union(DRV_Mar20162)

In [4]:
# If Working on Dataproc Cloud ##

# Import DRV_Jan2016 (Train Set) 
DRV_Jan2016_1to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_1to1',inferSchema=True,header=True)
DRV_Jan2016_3to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_3to1',inferSchema=True,header=True)
DRV_Jan2016_5to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_5to1',inferSchema=True,header=True)
DRV_Jan2016_7to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_7to1',inferSchema=True,header=True)
DRV_Jan2016_9to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_9to1',inferSchema=True,header=True)
DRV_Jan2016_11to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_11to1',inferSchema=True,header=True)
DRV_Jan2016_13to1 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016_13to1',inferSchema=True,header=True)

DRV_Jan20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000000',inferSchema=True,header=True)
DRV_Jan20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000001',inferSchema=True,header=True)
DRV_Jan20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Jan2016000000000002',inferSchema=True,header=True)

DRV_Jan2016 = DRV_Jan20160.union(DRV_Jan20161)
DRV_Jan2016 = DRV_Jan2016.union(DRV_Jan20162)

DRV_Jan20160 = None
DRV_Jan20161 = None
DRV_Jan20162 = None

# Import DRV_Feb2016 (Validation Set) 
DRV_Feb20160 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000000',inferSchema=True,header=True)
DRV_Feb20161 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000001',inferSchema=True,header=True)
DRV_Feb20162 = spark.read.csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Monthly Datasets/DRV_Feb2016000000000002',inferSchema=True,header=True)

DRV_Feb2016 = DRV_Feb20160.union(DRV_Feb20161)
DRV_Feb2016 = DRV_Feb2016.union(DRV_Feb20162)

DRV_Feb20160 = None
DRV_Feb20161 = None
DRV_Feb20162 = None

***Cast Correct Column Types on All Sets***

In [None]:
column_types_pd = [('msno', 'STRING'),
 ('membership_expire_date', 'DATE'),
 ('payment_method_id', 'INT64'),
 ('payment_plan_days', 'INT64'),
 ('plan_list_price', 'INT64'),
 ('net_paid_amount', 'INT64'),
 ('is_net_paid_amount', 'STRING'),
 ('is_auto_renew', 'INT64'),
 ('city', 'INT64'),
 ('bd', 'INT64'),
 ('registered_via', 'INT64'),
 ('registration_init_time', 'DATE'),
 ('membership_length', 'INT64'),
 ('is_churn', 'FLOAT64'),
 ('total_songs', 'INT64'),
 ('total_logins', 'INT64'),
 ('total_secs', 'FLOAT64'),
 ('sum_num_unq', 'INT64'),
 ('sum_num_repeat', 'INT64'),
 ('sum_over_50pec', 'INT64'),
 ('sum_over_75pec', 'INT64'),
 ('sum_over_985pec', 'INT64'),
 ('total_transactions', 'INT64'),
 ('total_spent', 'FLOAT64'),
 ('avg_spent_trans', 'FLOAT64'),
 ('spent_per_logins', 'FLOAT64'),
 ('spent_per_secs', 'FLOAT64'),
 ('spent_per_song', 'FLOAT64'),
 ('spent_per_num_unq', 'FLOAT64'),
 ('spent_per_num_repeats', 'FLOAT64'),
 ('never_active_subscriber', 'FLOAT64'),
 ('total_spent_zero', 'FLOAT64'),
 ('city_agg', 'INT64'),
 ('payment_method_agg', 'INT64'),
 ('songs_last_7', 'FLOAT64'),
 ('songs_last_7_AVG', 'FLOAT64'),
 ('logins_last_7', 'FLOAT64'),
 ('logins_last_7_AVG', 'FLOAT64'),
 ('total_secs_last_7', 'FLOAT64'),
 ('total_secs_last_7_AVG', 'FLOAT64'),
 ('num_unq_last_7', 'FLOAT64'),
 ('num_unq_last_7_AVG', 'FLOAT64'),
 ('num_repeat_last_7', 'FLOAT64'),
 ('num_repeat_last_7_AVG', 'FLOAT64'),
 ('over_50perc_last_7', 'FLOAT64'),
 ('over_50perc_last_7_AVG', 'FLOAT64'),
 ('over_75perc_last_7', 'FLOAT64'),
 ('over_75perc_last_7_AVG', 'FLOAT64'),
 ('over_985perc_last_7', 'FLOAT64'),
 ('over_985perc_last_7_AVG', 'FLOAT64'),
 ('songs_last_15', 'FLOAT64'),
 ('songs_last_15_AVG', 'FLOAT64'),
 ('logins_last_15', 'FLOAT64'),
 ('logins_last_15_AVG', 'FLOAT64'),
 ('total_secs_last_15', 'FLOAT64'),
 ('total_secs_last_15_AVG', 'FLOAT64'),
 ('num_unq_last_15', 'FLOAT64'),
 ('num_unq_last_15_AVG', 'FLOAT64'),
 ('num_repeat_last_15', 'FLOAT64'),
 ('num_repeat_last_15_AVG', 'FLOAT64'),
 ('over_50perc_last_15', 'FLOAT64'),
 ('over_50perc_last_15_AVG', 'FLOAT64'),
 ('over_75perc_last_15', 'FLOAT64'),
 ('over_75perc_last_15_AVG', 'FLOAT64'),
 ('over_985perc_last_15', 'FLOAT64'),
 ('over_985perc_last_15_AVG', 'FLOAT64'),
 ('songs_last_30', 'FLOAT64'),
 ('songs_last_30_AVG', 'FLOAT64'),
 ('logins_last_30', 'FLOAT64'),
 ('logins_last_30_AVG', 'FLOAT64'),
 ('total_secs_last_30', 'FLOAT64'),
 ('total_secs_last_30_AVG', 'FLOAT64'),
 ('num_unq_last_30', 'FLOAT64'),
 ('num_unq_last_30_AVG', 'FLOAT64'),
 ('num_repeat_last_30', 'FLOAT64'),
 ('num_repeat_last_30_AVG', 'FLOAT64'),
 ('over_50perc_last_30', 'FLOAT64'),
 ('over_50perc_last_30_AVG', 'FLOAT64'),
 ('over_75perc_last_30', 'FLOAT64'),
 ('over_75perc_last_30_AVG', 'FLOAT64'),
 ('over_985perc_last_30', 'FLOAT64'),
 ('over_985perc_last_30_AVG', 'FLOAT64'),
 ('songs_last_60', 'FLOAT64'),
 ('songs_last_60_AVG', 'FLOAT64'),
 ('logins_last_60', 'FLOAT64'),
 ('logins_last_60_AVG', 'FLOAT64'),
 ('total_secs_last_60', 'FLOAT64'),
 ('total_secs_last_60_AVG', 'FLOAT64'),
 ('num_unq_last_60', 'FLOAT64'),
 ('num_unq_last_60_AVG', 'FLOAT64'),
 ('num_repeat_last_60', 'FLOAT64'),
 ('num_repeat_last_60_AVG', 'FLOAT64'),
 ('over_50perc_last_60', 'FLOAT64'),
 ('over_50perc_last_60_AVG', 'FLOAT64'),
 ('over_75perc_last_60', 'FLOAT64'),
 ('over_75perc_last_60_AVG', 'FLOAT64'),
 ('over_985perc_last_60', 'FLOAT64'),
 ('over_985perc_last_60_AVG', 'FLOAT64'),
 ('songs_last_120', 'FLOAT64'),
 ('songs_last_120_AVG', 'FLOAT64'),
 ('logins_last_120', 'FLOAT64'),
 ('logins_last_120_AVG', 'FLOAT64'),
 ('total_secs_last_120', 'FLOAT64'),
 ('total_secs_last_120_AVG', 'FLOAT64'),
 ('num_unq_last_120', 'FLOAT64'),
 ('num_unq_last_120_AVG', 'FLOAT64'),
 ('num_repeat_last_120', 'FLOAT64'),
 ('num_repeat_last_120_AVG', 'FLOAT64'),
 ('over_50perc_last_120', 'FLOAT64'),
 ('over_50perc_last_120_AVG', 'FLOAT64'),
 ('over_75perc_last_120', 'FLOAT64'),
 ('over_75perc_last_120_AVG', 'FLOAT64'),
 ('over_985perc_last_120', 'FLOAT64'),
 ('over_985perc_last_120_AVG', 'FLOAT64'),
 ('SUM_unq_songs_0_15', 'FLOAT64'),
 ('AVG_unq_songs_0_15', 'FLOAT64'),
 ('SUM_songs_0_15', 'FLOAT64'),
 ('AVG_songs_0_15', 'FLOAT64'),
 ('SUM_secs_0_15', 'FLOAT64'),
 ('AVG_secs_0_15', 'FLOAT64'),
 ('SUM_songs50_0_15', 'FLOAT64'),
 ('AVG_songs50_0_15', 'FLOAT64'),
 ('SUM_logins_0_15', 'FLOAT64'),
 ('AVG_logins_0_15', 'FLOAT64'),
 ('SUM_repeats_0_15', 'FLOAT64'),
 ('AVG_repeats_0_15', 'FLOAT64'),
 ('SUM_unq_songs_15_30', 'FLOAT64'),
 ('AVG_unq_songs_15_30', 'FLOAT64'),
 ('SUM_songs_15_30', 'FLOAT64'),
 ('AVG_songs_15_30', 'FLOAT64'),
 ('SUM_secs_15_30', 'FLOAT64'),
 ('AVG_secs_15_30', 'FLOAT64'),
 ('SUM_songs50_15_30', 'FLOAT64'),
 ('AVG_songs50_15_30', 'FLOAT64'),
 ('SUM_logins_15_30', 'FLOAT64'),
 ('AVG_logins_15_30', 'FLOAT64'),
 ('SUM_repeats_15_30', 'FLOAT64'),
 ('AVG_repeats_15_30', 'FLOAT64'),
 ('SUM_unq_songs_30_45', 'FLOAT64'),
 ('AVG_unq_songs_30_45', 'FLOAT64'),
 ('SUM_songs_30_45', 'FLOAT64'),
 ('AVG_songs_30_45', 'FLOAT64'),
 ('SUM_secs_30_45', 'FLOAT64'),
 ('AVG_secs_30_45', 'FLOAT64'),
 ('SUM_songs50_30_45', 'FLOAT64'),
 ('AVG_songs50_30_45', 'FLOAT64'),
 ('SUM_logins_30_45', 'FLOAT64'),
 ('AVG_logins_30_45', 'FLOAT64'),
 ('SUM_repeats_30_45', 'FLOAT64'),
 ('AVG_repeats_30_45', 'FLOAT64'),
 ('SUM_unq_songs_45_60', 'FLOAT64'),
 ('AVG_unq_songs_45_60', 'FLOAT64'),
 ('SUM_songs_45_60', 'FLOAT64'),
 ('AVG_songs_45_60', 'FLOAT64'),
 ('SUM_secs_45_60', 'FLOAT64'),
 ('AVG_secs_45_60', 'FLOAT64'),
 ('SUM_songs50_45_60', 'FLOAT64'),
 ('AVG_songs50_45_60', 'FLOAT64'),
 ('SUM_logins_45_60', 'FLOAT64'),
 ('AVG_logins_45_60', 'FLOAT64'),
 ('SUM_repeats_45_60', 'FLOAT64'),
 ('AVG_repeats_45_60', 'FLOAT64'),
 ('DIFSUM_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_secs_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_logins_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_logins_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFAVG_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSUM_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_secs_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_logins_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_logins_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFAVG_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSUM_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_secs_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_logins_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_logins_30_45_45_60', 'FLOAT64'),
 ('DIFSUM_repeats_30_45_45_60', 'FLOAT64'),
 ('DIFAVG_repeats_30_45_45_60', 'FLOAT64'),
 ('expire_last_login', 'INT64'),
 ('total_cancelations', 'INT64'),
 ('login_after_expire_10', 'INT64'),
 ('login_after_expire_20', 'INT64'),
 ('login_after_expire_30', 'INT64'),
 ('STD_unq_songs_0_15', 'FLOAT64'),
 ('STD_songs_0_15', 'FLOAT64'),
 ('STD_secs_0_15', 'FLOAT64'),
 ('STD_songs50_0_15', 'FLOAT64'),
 ('STD_repeats_0_15', 'FLOAT64'),
 ('STD_unq_songs_15_30', 'FLOAT64'),
 ('STD_songs_15_30', 'FLOAT64'),
 ('STD_secs_15_30', 'FLOAT64'),
 ('STD_songs50_15_30', 'FLOAT64'),
 ('STD_repeats_15_30', 'FLOAT64'),
 ('STD_unq_songs_30_45', 'FLOAT64'),
 ('STD_songs_30_45', 'FLOAT64'),
 ('STD_secs_30_45', 'FLOAT64'),
 ('STD_songs50_30_45', 'FLOAT64'),
 ('STD_repeats_30_45', 'FLOAT64'),
 ('STD_unq_songs_45_60', 'FLOAT64'),
 ('STD_songs_45_60', 'FLOAT64'),
 ('STD_secs_45_60', 'FLOAT64'),
 ('STD_songs50_45_60', 'FLOAT64'),
 ('STD_repeats_45_60', 'FLOAT64'),
 ('DIFSTD_unq_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_secs_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_songs50_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_repeats_0_15_15_30', 'FLOAT64'),
 ('DIFSTD_unq_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_secs_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_songs50_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_repeats_15_30_30_45', 'FLOAT64'),
 ('DIFSTD_unq_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_secs_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_songs50_30_45_45_60', 'FLOAT64'),
 ('DIFSTD_repeats_30_45_45_60', 'FLOAT64'),
 ('is_cancel', 'INT64')]

In [None]:
from pyspark.sql.functions import expr

# Correctly Cast DRV_Feb2016
for feature, datatype in column_types_pd:
    if datatype == 'STRING':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS string)"))')
    if datatype == 'DATE':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS timestamp)"))')
    if datatype == 'INT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS integer)"))')
    if datatype == 'FLOAT64':
        exec(f'DRV_Jan2016_1to1 = DRV_Feb2016.withColumn("{feature}", expr("CAST({feature} AS double)"))')

## Model Pre-Processing
https://medium.com/@dhiraj.p.rai/essentials-of-feature-engineering-in-pyspark-part-i-76a57680a85

#### - <font color=blue>Split Feautres by Categorical or Continuous</font> -

In [5]:
# Create list of Categorical feature names
cat_feats = ['is_auto_renew', 'total_spent_zero', 'city_agg', 'payment_method_agg', 'never_active_subscriber','is_net_paid_amount']

In [6]:
# Create list of Continuous feature names
cont_feats = [x for x in DRV_Jan2016_1to1.columns if x not in cat_feats]
cont_feats.remove('msno')
cont_feats.remove('is_churn')
cont_feats.remove('membership_expire_date')
cont_feats.remove('registration_init_time')
cont_feats.remove('city')
cont_feats.remove('bd')
cont_feats.remove('payment_method_id')

cont_feats.remove('registered_via')

### - <font color=blue>Data Pre-Processing</font> -

#### <font color=purple>*Encode Categorical Variables*</font>

In [7]:
# Create a list of categorical, 'Vector' feature names
cat_feats_vec = ['is_auto_renew_vec', 'total_spent_zero_vec', 'city_agg_vec', 'payment_method_agg_vec', 'never_active_subscriber_vec']

In [8]:
# One Hot Encode
is_auto_renew_encoder = OneHotEncoder(inputCol='is_auto_renew',outputCol='is_auto_renew_vec')
total_spent_zero_encoder = OneHotEncoder(inputCol='total_spent_zero',outputCol='total_spent_zero_vec')
city_agg_encoder = OneHotEncoder(inputCol='city_agg',outputCol='city_agg_vec')
payment_method_agg_encoder = OneHotEncoder(inputCol='payment_method_agg',outputCol='payment_method_agg_vec')
never_active_subscriber_encoder = OneHotEncoder(inputCol='never_active_subscriber',outputCol='never_active_subscriber_vec')
# is_net_paid_amount_encoder = OneHotEncoder(inputCol='is_net_paid_amount',outputCol='is_net_paid_amount_vec')

# registered_via_encoder = OneHotEncoder(inputCol='registered_via',outputCol='registered_via_vec')


#### <font color=purple>*Vector Assembler*</font>

In [9]:
# Create master list of feature names for model
final_features = cont_feats + cat_feats_vec

In [10]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

#### <font color=purple>*Feature Scaling*</font>

In [11]:
# Scale all features into our final output features
scaler = StandardScaler(inputCol='features', 
                        outputCol='features_scaled',
                        withStd=True, withMean=False)

## Model Creation: Pipeline and Tuning

### - <font color=blue>Create Pipeline Object</font> -
https://spark.apache.org/docs/2.4.3/ml-pipeline.html

In [12]:
# Instatiate Model Estimators and Parameters
rfc = RandomForestClassifier(featuresCol='features_scaled', 
                             labelCol='is_churn')

In [13]:
# Create pipeline objects
rfc_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,rfc])

### - <font color=blue>Model Tuning</font> -
https://spark.apache.org/docs/2.4.3/ml-tuning.html

In [14]:
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_churn')

#### <font color=purple>Random Forest Classifier Parameter Tuning</font>

In [15]:
# Parameter Grid for Random Forest Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.impurity,['entropy']) \
    .addGrid(rfc.maxDepth,[7]) \
    .build()

# Instantiate Cross Validation block
rfc_cv = CrossValidator(estimator=rfc_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=7)

After running various models we were able to determine the optimal parameters for both Gradient Boosted Trees and Random Forest.

## Model Execution and Evaluation

### - <font color=blue>Train Model: All Features, All Splits</font> -

***Random Forest Classifiers***

In [16]:
# GBT Training on Trainset 1 to 1
start = time.time()
rfc_model_1to1 = rfc_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 111.0


In [17]:
# GBT Training on Trainset 3 to 1
start = time.time()
rfc_model_3to1 = rfc_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 136.0


In [18]:
# GBT Training on Trainset 5 to 1
start = time.time()
rfc_model_5to1 = rfc_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 156.0


In [19]:
# GBT Training on Trainset 7 to 1
start = time.time()
rfc_model_7to1 = rfc_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 186.0


In [20]:
# GBT Training on Trainset 9 to 1
start = time.time()
rfc_model_9to1 = rfc_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 216.0


In [21]:
# GBT Training on Trainset 11 to 1
start = time.time()
rfc_model_11to1 = rfc_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 249.0


In [22]:
# GBT Training on Trainset 13 to 1
start = time.time()
rfc_model_13to1 = rfc_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 281.0


#### <font color=purple>Evaluate Trained Model</font>

***Create Custom Evaluator***

In [23]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluator:

    def __init__(self, resultname, resultdata, model):
        from sklearn.metrics import confusion_matrix
        # Initialize variables
        self.resultPandas = resultdata[['is_churn', 'prediction']].toPandas()
        self.resultdata = resultdata 
        self.resultname = resultname
        self.model = model
        
        self.cm = confusion_matrix(self.resultPandas['is_churn'],self.resultPandas['prediction'])
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fn = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fp = self.cm[1][0]
        
    def evaluate(self):
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                labelCol='is_churn')
        AUC = my_eval.evaluate(self.resultdata)
        
        
        return pd.DataFrame(data=[[self.resultname, AUC, f1, precision, recall, error]], 
                            columns=['resultname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Evaluate Train Model: All Features, All Splits</font> -

***Transform Train Data on Trained Models***

In [None]:
# Dictionary of Models that were created
models_created = {
                  'rfc_model_1to1' : (rfc_model_1to1, DRV_Jan2016_1to1),
                  'rfc_model_3to1' : (rfc_model_3to1, DRV_Jan2016_3to1),
                  'rfc_model_5to1' : (rfc_model_5to1, DRV_Jan2016_5to1),
                  'rfc_model_7to1' : (rfc_model_7to1, DRV_Jan2016_7to1),
                  'rfc_model_9to1' : (rfc_model_9to1, DRV_Jan2016_9to1),
                  'rfc_model_11to1' : (rfc_model_11to1, DRV_Jan2016_11to1),
                  'rfc_model_13to1' : (rfc_model_13to1, DRV_Jan2016_13to1)}

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_resultsall = train_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_13to1
[[667641   5110]
 [ 12024   7124]]
 
rfc_model_3to1
[[641550  31201]
 [  5890  13258]]
 
rfc_model_9to1
[[664604   8147]
 [ 10498   8650]]
 
rfc_model_7to1
[[662211  10540]
 [  9618   9530]]
 
rfc_model_1to1
[[589598  83153]
 [  3059  16089]]
 
rfc_model_11to1
[[666593   6158]
 [ 11306   7842]]
 
rfc_model_5to1
[[658749  14002]
 [  8791  10357]]
 


In [None]:
# Evaluate Training Model
train_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to1,0.858321,0.931865,0.876399,0.994838,0.124602
0,rfc_model_3to1,0.823009,0.9719,0.953622,0.990903,0.053608
0,rfc_model_5to1,0.760039,0.982989,0.979187,0.986831,0.032943
0,rfc_model_7to1,0.741018,0.985003,0.984333,0.985684,0.029134
0,rfc_model_9to1,0.719817,0.986162,0.98789,0.98445,0.026948
0,rfc_model_11to1,0.700197,0.987065,0.990847,0.983322,0.025241
0,rfc_model_13to1,0.682227,0.987326,0.992404,0.982309,0.024764


### - <font color=blue>Evaluate Validation Model: All Features, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_resultsall = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_resultsall = validation_resultsall.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_13to1
[[524375   3829]
 [ 14932   4991]]
 
rfc_model_3to1
[[503902  24302]
 [  8237  11686]]
 
rfc_model_9to1
[[522183   6021]
 [ 13387   6536]]
 
rfc_model_7to1
[[520180   8024]
 [ 12367   7556]]
 
rfc_model_1to1
[[460430  67774]
 [  3805  16118]]
 
rfc_model_11to1
[[523542   4662]
 [ 14245   5678]]
 
rfc_model_5to1
[[517343  10861]
 [ 11473   8450]]
 


In [None]:
# Evaluate Validation Model
validation_resultsall.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to1,0.840352,0.927871,0.87169,0.991804,0.130588
0,rfc_model_3to1,0.770275,0.968718,0.953991,0.983916,0.059364
0,rfc_model_5to1,0.701785,0.978866,0.979438,0.978304,0.040746
0,rfc_model_7to1,0.682035,0.980772,0.984809,0.976778,0.037201
0,rfc_model_9to1,0.658332,0.981751,0.988601,0.975004,0.035408
0,rfc_model_11to1,0.638086,0.982258,0.991174,0.973512,0.034494
0,rfc_model_13to1,0.621633,0.98242,0.992751,0.972313,0.034227


We now currently have ~230 Features with the inclusion of our *Bi-Weekly Activity Block* and *Comparison of Bi-Weekly Activity Block Features*. Looking at the results above we can see that we have made some improvements on AUC. We also notice how Recall increased significantly over the Ratio'd subsets. The ratio'd subsets seemed to have helped address the issue of having a high amount of False Positives.

We can see that our models are still overfitting with the higher ratio'd models being the worst. However our Precision scores are better over our higher ratio'd models along with an improvement in model error and a slight decrease in Recall. Each of these models have there pros and cons and we might benefit from some sort of ensemble of these models. Before we do such a thing let's play around a bit with feature selection to see if we improve on the overall generalization between our Train and Validation models.

#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

Unnamed: 0,impurity,maxDepth,AUC,Model
0,entropy,7,0.938831,rfc_model_13to1
0,entropy,7,0.938921,rfc_model_3to1
0,entropy,7,0.938895,rfc_model_9to1
0,entropy,7,0.93883,rfc_model_7to1
0,entropy,7,0.937017,rfc_model_1to1
0,entropy,7,0.938055,rfc_model_11to1
0,entropy,7,0.937443,rfc_model_5to1


#### <font color=purple>Model Evaluation: Feature Importance</font>

We will derive average feature importance scores for all features. Then we will produce a 5 Number Summary on these scores and group our features based on their scores against the following thresholds: Mean, 75th Percentile, 50th Percentile, and 25th Percentile.

In [None]:
# Display feature importances
importances = rfc_model_1to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column1 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_3to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column3 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_5to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column5 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_7to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column7 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_9to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column9 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_11to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column11 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

importances = rfc_model_13to1.bestModel.stages[-1].featureImportances
importances_list = [importances[i] for i in range(len(importances))]
names = final_features
column13 = pd.DataFrame(data=list(zip(names, importances_list)), columns=['Features', 'Importance Score']).sort_values(by='Importance Score', ascending=False)

In [None]:
feature_imp = pd.merge(column1, column3 , on='Features')
feature_imp = pd.merge(feature_imp, column5, on='Features')
feature_imp = pd.merge(feature_imp, column7, on='Features')
feature_imp = pd.merge(feature_imp, column9, on='Features')
feature_imp = pd.merge(feature_imp, column11, on='Features')
feature_imp = pd.merge(feature_imp, column13, on='Features')

feature_imp['avg'] = feature_imp[list(feature_imp.columns[1:-1])].mean(axis=1)

In [None]:
feature_imp['avg'].describe()

count    230.000000
mean       0.004152
std        0.015061
min        0.000013
25%        0.000186
50%        0.000482
75%        0.001621
max        0.134015
Name: avg, dtype: float64

In [None]:
mean_feats = feature_imp[feature_imp['avg'] > feature_imp['avg'].describe()['mean']]['Features'].tolist()
quart25_feats = feature_imp[feature_imp['avg'] > feature_imp['avg'].describe()['25%']]['Features'].tolist()
quart50_feats = feature_imp[feature_imp['avg'] > feature_imp['avg'].describe()['50%']]['Features'].tolist()
quart75_feats = feature_imp[feature_imp['avg'] > feature_imp['avg'].describe()['75%']]['Features'].tolist()

In [110]:
mean_feats = ['total_transactions',
 'login_after_expire_20',
 'is_auto_renew_vec',
 'total_spent',
 'login_after_expire_30',
 'total_cancelations',
 'avg_spent_trans',
 'login_after_expire_10',
 'total_spent_zero_vec',
 'plan_list_price',
 'total_logins',
 'net_paid_amount',
 'is_cancel',
 'AVG_secs_45_60',
 'STD_songs50_45_60',
 'SUM_logins_45_60',
 'STD_secs_45_60',
 'over_75perc_last_7_AVG',
 'payment_plan_days',
 'total_secs_last_7_AVG',
 'DIFSUM_logins_15_30_30_45',
 'spent_per_song',
 'expire_last_login',
 'num_unq_last_60',
 'over_985perc_last_7_AVG',
 'spent_per_num_unq',
 'spent_per_num_repeats',
 'membership_length',
 'DIFAVG_logins_15_30_30_45',
 'spent_per_logins',
 'AVG_unq_songs_0_15',
 'spent_per_secs',
 'num_unq_last_30',
 'STD_songs50_15_30']



In [109]:
quart25_feats = ['total_transactions',
 'login_after_expire_20',
 'is_auto_renew_vec',
 'total_spent',
 'login_after_expire_30',
 'total_cancelations',
 'avg_spent_trans',
 'login_after_expire_10',
 'total_spent_zero_vec',
 'plan_list_price',
 'total_logins',
 'net_paid_amount',
 'is_cancel',
 'AVG_secs_45_60',
 'STD_songs50_45_60',
 'SUM_logins_45_60',
 'STD_secs_45_60',
 'over_75perc_last_7_AVG',
 'payment_plan_days',
 'total_secs_last_7_AVG',
 'DIFSUM_logins_15_30_30_45',
 'spent_per_song',
 'expire_last_login',
 'num_unq_last_60',
 'over_985perc_last_7_AVG',
 'spent_per_num_unq',
 'songs_last_7',
 'spent_per_num_repeats',
 'membership_length',
 'logins_last_7_AVG',
 'AVG_songs_45_60',
 'songs_last_15',
 'DIFAVG_logins_15_30_30_45',
 'spent_per_logins',
 'num_repeat_last_7',
 'over_75perc_last_15_AVG',
 'AVG_unq_songs_0_15',
 'num_unq_last_7_AVG',
 'DIFAVG_secs_15_30_30_45',
 'AVG_logins_45_60',
 'spent_per_secs',
 'total_secs_last_30_AVG',
 'AVG_logins_30_45',
 'songs_last_60_AVG',
 'num_unq_last_120',
 'payment_method_agg_vec',
 'over_985perc_last_15_AVG',
 'num_unq_last_30',
 'AVG_logins_15_30',
 'SUM_logins_30_45',
 'SUM_secs_45_60',
 'logins_last_120_AVG',
 'logins_last_60_AVG',
 'total_songs',
 'SUM_unq_songs_45_60',
 'logins_last_60',
 'over_50perc_last_7_AVG',
 'SUM_songs_45_60',
 'AVG_unq_songs_30_45',
 'AVG_songs50_45_60',
 'over_50perc_last_60_AVG',
 'over_50perc_last_7',
 'DIFSUM_unq_songs_0_15_15_30',
 'num_unq_last_60_AVG',
 'AVG_unq_songs_45_60',
 'DIFSUM_songs50_15_30_30_45',
 'songs_last_60',
 'DIFSUM_logins_30_45_45_60',
 'total_secs_last_60_AVG',
 'sum_over_75pec',
 'STD_songs_30_45',
 'SUM_repeats_30_45',
 'sum_over_50pec',
 'sum_over_985pec',
 'SUM_songs_30_45',
 'songs_last_30',
 'sum_num_unq',
 'num_unq_last_15_AVG',
 'sum_num_repeat',
 'DIFAVG_secs_30_45_45_60',
 'SUM_unq_songs_15_30',
 'SUM_unq_songs_0_15',
 'total_secs',
 'over_985perc_last_120',
 'SUM_unq_songs_30_45',
 'logins_last_30',
 'over_985perc_last_60',
 'songs_last_120_AVG',
 'over_75perc_last_30_AVG',
 'STD_unq_songs_30_45',
 'over_50perc_last_30_AVG',
 'DIFAVG_logins_0_15_15_30',
 'total_secs_last_15_AVG',
 'logins_last_120',
 'SUM_logins_0_15',
 'num_unq_last_120_AVG',
 'DIFAVG_logins_30_45_45_60',
 'SUM_songs50_45_60',
 'AVG_songs_30_45',
 'SUM_songs50_30_45',
 'over_50perc_last_120',
 'num_repeat_last_60',
 'over_50perc_last_15_AVG',
 'total_secs_last_30',
 'num_unq_last_7',
 'logins_last_30_AVG',
 'SUM_secs_30_45',
 'DIFSUM_logins_0_15_15_30',
 'songs_last_120',
 'AVG_secs_0_15',
 'songs_last_30_AVG',
 'STD_songs50_15_30',
 'over_75perc_last_120',
 'over_985perc_last_7',
 'over_75perc_last_60_AVG']

In [108]:
quart50_feats = ['total_transactions',
 'login_after_expire_20',
 'is_auto_renew_vec',
 'total_spent',
 'login_after_expire_30',
 'total_cancelations',
 'avg_spent_trans',
 'login_after_expire_10',
 'total_spent_zero_vec',
 'plan_list_price',
 'total_logins',
 'net_paid_amount',
 'is_cancel',
 'AVG_secs_45_60',
 'STD_songs50_45_60',
 'SUM_logins_45_60',
 'STD_secs_45_60',
 'over_75perc_last_7_AVG',
 'payment_plan_days',
 'total_secs_last_7_AVG',
 'DIFSUM_logins_15_30_30_45',
 'spent_per_song',
 'expire_last_login',
 'num_unq_last_60',
 'over_985perc_last_7_AVG',
 'spent_per_num_unq',
 'songs_last_7',
 'spent_per_num_repeats',
 'membership_length',
 'logins_last_7_AVG',
 'AVG_songs_45_60',
 'songs_last_15',
 'DIFAVG_logins_15_30_30_45',
 'spent_per_logins',
 'num_repeat_last_7',
 'over_75perc_last_15_AVG',
 'AVG_unq_songs_0_15',
 'num_unq_last_7_AVG',
 'DIFAVG_secs_15_30_30_45',
 'AVG_logins_45_60',
 'spent_per_secs',
 'total_secs_last_30_AVG',
 'AVG_logins_30_45',
 'songs_last_60_AVG',
 'num_unq_last_120',
 'payment_method_agg_vec',
 'over_985perc_last_15_AVG',
 'num_unq_last_30',
 'AVG_logins_15_30',
 'SUM_logins_30_45',
 'SUM_secs_45_60',
 'logins_last_120_AVG',
 'logins_last_60_AVG',
 'total_songs',
 'SUM_unq_songs_45_60',
 'logins_last_60',
 'over_50perc_last_7_AVG',
 'SUM_songs_45_60',
 'AVG_unq_songs_30_45',
 'AVG_songs50_45_60',
 'over_50perc_last_60_AVG',
 'over_50perc_last_7',
 'DIFSUM_unq_songs_0_15_15_30',
 'num_unq_last_60_AVG',
 'AVG_unq_songs_45_60',
 'DIFSUM_songs50_15_30_30_45',
 'songs_last_60',
 'DIFSUM_logins_30_45_45_60',
 'total_secs_last_60_AVG',
 'sum_over_75pec',
 'STD_songs_30_45',
 'SUM_repeats_30_45',
 'sum_over_50pec',
 'sum_over_985pec',
 'SUM_songs_30_45',
 'songs_last_30',
 'sum_num_unq',
 'num_unq_last_15_AVG',
 'sum_num_repeat',
 'DIFAVG_secs_30_45_45_60',
 'SUM_unq_songs_15_30',
 'SUM_unq_songs_0_15',
 'total_secs',
 'over_985perc_last_120',
 'SUM_unq_songs_30_45',
 'logins_last_30',
 'over_985perc_last_60',
 'songs_last_120_AVG',
 'over_75perc_last_30_AVG',
 'STD_unq_songs_30_45',
 'over_50perc_last_30_AVG',
 'DIFAVG_logins_0_15_15_30',
 'total_secs_last_15_AVG',
 'logins_last_120',
 'SUM_logins_0_15',
 'num_unq_last_120_AVG',
 'DIFAVG_logins_30_45_45_60',
 'SUM_songs50_45_60',
 'AVG_songs_30_45',
 'SUM_songs50_30_45',
 'over_50perc_last_120',
 'num_repeat_last_60',
 'over_50perc_last_15_AVG',
 'total_secs_last_30',
 'num_unq_last_7',
 'logins_last_30_AVG',
 'SUM_secs_30_45',
 'DIFSUM_logins_0_15_15_30',
 'songs_last_120',
 'AVG_secs_0_15',
 'songs_last_30_AVG',
 'STD_songs50_15_30',
 'over_75perc_last_120',
 'over_985perc_last_7',
 'over_75perc_last_60_AVG']

In [107]:
quart75_feats = ['total_transactions',
 'login_after_expire_20',
 'is_auto_renew_vec',
 'total_spent',
 'login_after_expire_30',
 'total_cancelations',
 'avg_spent_trans',
 'login_after_expire_10',
 'total_spent_zero_vec',
 'plan_list_price',
 'total_logins',
 'net_paid_amount',
 'is_cancel',
 'AVG_secs_45_60',
 'STD_songs50_45_60',
 'SUM_logins_45_60',
 'STD_secs_45_60',
 'over_75perc_last_7_AVG',
 'payment_plan_days',
 'total_secs_last_7_AVG',
 'DIFSUM_logins_15_30_30_45',
 'spent_per_song',
 'expire_last_login',
 'num_unq_last_60',
 'over_985perc_last_7_AVG',
 'spent_per_num_unq',
 'songs_last_7',
 'spent_per_num_repeats',
 'membership_length',
 'logins_last_7_AVG',
 'AVG_songs_45_60',
 'songs_last_15',
 'DIFAVG_logins_15_30_30_45',
 'spent_per_logins',
 'num_repeat_last_7',
 'AVG_unq_songs_0_15',
 'num_unq_last_7_AVG',
 'DIFAVG_secs_15_30_30_45',
 'AVG_logins_45_60',
 'spent_per_secs',
 'total_secs_last_30_AVG',
 'AVG_logins_30_45',
 'songs_last_60_AVG',
 'payment_method_agg_vec',
 'over_985perc_last_15_AVG',
 'num_unq_last_30',
 'SUM_logins_30_45',
 'logins_last_60_AVG',
 'SUM_unq_songs_45_60',
 'logins_last_60',
 'over_50perc_last_7_AVG',
 'SUM_songs_45_60',
 'AVG_unq_songs_45_60',
 'songs_last_60',
 'sum_over_985pec',
 'songs_last_30',
 'songs_last_120_AVG',
 'STD_songs50_15_30']

--------------

### - <font color=blue>Train Model: Mean Features, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = mean_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# Create pipeline objects
rfc_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,rfc])

In [None]:
# Parameter Grid for Random Forest Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.impurity,['entropy']) \
    .addGrid(rfc.maxDepth,[7]) \
    .build()

# Instantiate Cross Validation block
rfc_cv = CrossValidator(estimator=rfc_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=7)

***Random Forest Classifiers***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
rfc_model_1to1mean = rfc_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 74.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
rfc_model_3to1mean = rfc_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 88.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
rfc_model_5to1mean = rfc_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 105.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
rfc_model_7to1mean = rfc_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 121.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
rfc_model_9to1mean = rfc_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 137.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
rfc_model_11to1mean = rfc_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 152.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
rfc_model_13to1mean = rfc_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 162.0


In [None]:
# Dictionary of Models that were created
models_created = {
                  'rfc_model_1to1mean' : (rfc_model_1to1mean, DRV_Jan2016_1to1),
                  'rfc_model_3to1mean' : (rfc_model_3to1mean, DRV_Jan2016_3to1),
                  'rfc_model_5to1mean' : (rfc_model_5to1mean, DRV_Jan2016_5to1),
                  'rfc_model_7to1mean' : (rfc_model_7to1mean, DRV_Jan2016_7to1),
                  'rfc_model_9to1mean' : (rfc_model_9to1mean, DRV_Jan2016_9to1),
                  'rfc_model_11to1mean' : (rfc_model_11to1mean, DRV_Jan2016_11to1),
                  'rfc_model_13to1mean' : (rfc_model_13to1mean, DRV_Jan2016_13to1)}

#### <font color=purple>Model Evaluation: Optimal Parameters</font>

In [None]:
opti_params = pd.DataFrame()

for model_name, (model1, train_set) in models_created.items():
    scores = model1.avgMetrics
    params = [{p.name: v for p, v in m.items()} for m in model1.getEstimatorParamMaps()]
    params_pd = pd.DataFrame(params)
    params_pd['AUC'] = scores
    params_pd['Model'] = model_name
    best = params_pd.sort_values('AUC', ascending=False).head(1)
    opti_params = opti_params.append(best)

opti_params

### - <font color=blue>Evaluate Train Model: Mean Features, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results = train_results.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_3to1mean
[[638171  34580]
 [  5154  13994]]
 
rfc_model_7to1mean
[[659273  13478]
 [  8488  10660]]
 
rfc_model_1to1mean
[[584466  88285]
 [  2025  17123]]
 
rfc_model_5to1mean
[[651458  21293]
 [  6956  12192]]
 
rfc_model_9to1mean
[[663602   9149]
 [  9691   9457]]
 
rfc_model_13to1mean
[[667287   5464]
 [ 10982   8166]]
 
rfc_model_11to1mean
[[665443   7308]
 [ 10395   8753]]
 


In [None]:
# Evaluate Training Model
train_results.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to1mean,0.881507,0.928277,0.86877,0.996547,0.130525
0,rfc_model_3to1mean,0.839716,0.969804,0.948599,0.991988,0.057427
0,rfc_model_5to1mean,0.802537,0.978774,0.968349,0.989435,0.040828
0,rfc_model_7to1mean,0.768341,0.983609,0.979966,0.987289,0.031747
0,rfc_model_9to1mean,0.740145,0.985998,0.986401,0.985607,0.027229
0,rfc_model_11to1mean,0.72313,0.986868,0.989137,0.984619,0.025586
0,rfc_model_13to1mean,0.709173,0.987822,0.991878,0.983809,0.023769


### - <font color=blue>Evaluate Validation Model: Mean Features, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results = validation_results.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_3to1mean
[[499640  28564]
 [  6426  13497]]
 
rfc_model_7to1mean
[[517401  10803]
 [ 11056   8867]]
 
rfc_model_1to1mean
[[450209  77995]
 [  2457  17466]]
 
rfc_model_5to1mean
[[511113  17091]
 [  9010  10913]]
 
rfc_model_9to1mean
[[520717   7487]
 [ 12260   7663]]
 
rfc_model_13to1mean
[[523298   4906]
 [ 13601   6322]]
 
rfc_model_11to1mean
[[522165   6039]
 [ 13099   6824]]
 


In [None]:
# Evaluate Validation Model
validation_results.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to1mean,0.864507,0.917974,0.852339,0.994572,0.146776
0,rfc_model_3to1mean,0.81169,0.966164,0.945922,0.987302,0.063836
0,rfc_model_5to1mean,0.757701,0.975097,0.967643,0.982677,0.047619
0,rfc_model_7to1mean,0.712306,0.979308,0.979548,0.979079,0.039879
0,rfc_model_9to1mean,0.685228,0.981386,0.985826,0.976997,0.036026
0,rfc_model_11to1mean,0.665543,0.981999,0.988567,0.975528,0.034915
0,rfc_model_13to1mean,0.654017,0.982619,0.990712,0.974667,0.033764


### - <font color=blue>Train Model: 75th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart75_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# Create pipeline objects
rfc_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,rfc])

In [None]:
# Parameter Grid for Random Forest Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.impurity,['entropy']) \
    .addGrid(rfc.maxDepth,[7]) \
    .build()

# Instantiate Cross Validation block
rfc_cv = CrossValidator(estimator=rfc_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=7)

***Random Forest Classifiers***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
rfc_model_1to175 = rfc_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 68.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
rfc_model_3to175 = rfc_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 91.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
rfc_model_5to175 = rfc_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 102.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
rfc_model_7to175 = rfc_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 121.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
rfc_model_9to175 = rfc_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 140.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
rfc_model_11to175 = rfc_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 156.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
rfc_model_13to175 = rfc_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 166.0


In [None]:
# Dictionary of Models that were created
models_created75 = {
                  'rfc_model_1to175' : (rfc_model_1to175, DRV_Jan2016_1to1),
                  'rfc_model_3to175' : (rfc_model_3to175, DRV_Jan2016_3to1),
                  'rfc_model_5to175' : (rfc_model_5to175, DRV_Jan2016_5to1),
                  'rfc_model_7to175' : (rfc_model_7to175, DRV_Jan2016_7to1),
                  'rfc_model_9to175' : (rfc_model_9to175, DRV_Jan2016_9to1),
                  'rfc_model_11to175' : (rfc_model_11to175, DRV_Jan2016_11to1),
                  'rfc_model_13to175' : (rfc_model_13to175, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 75th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created75.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results75 = train_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_3to175
[[638770  33981]
 [  5298  13850]]
 
rfc_model_9to175
[[663520   9231]
 [  9735   9413]]
 
rfc_model_13to175
[[667783   4968]
 [ 11176   7972]]
 
rfc_model_7to175
[[660958  11793]
 [  8900  10248]]
 
rfc_model_11to175
[[665763   6988]
 [ 10548   8600]]
 
rfc_model_5to175
[[653701  19050]
 [  7173  11975]]
 
rfc_model_1to175
[[591102  81649]
 [  2594  16554]]
 


In [None]:
# Evaluate Training Model
train_results75.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to175,0.871582,0.933476,0.878634,0.995631,0.121756
0,rfc_model_3to175,0.836401,0.970166,0.949489,0.991774,0.05677
0,rfc_model_5to175,0.798538,0.980332,0.971683,0.989146,0.0379
0,rfc_model_7to175,0.758835,0.984582,0.98247,0.986714,0.029908
0,rfc_model_9to175,0.738935,0.985904,0.986279,0.98554,0.027412
0,rfc_model_11to175,0.719373,0.986996,0.989613,0.984404,0.025345
0,rfc_model_13to175,0.704476,0.988052,0.992615,0.98354,0.023333


### - <font color=blue>Evaluate Validation Model: 75th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results75 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created75.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results75 = validation_results75.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_3to175
[[501233  26971]
 [  6592  13331]]
 
rfc_model_9to175
[[521062   7142]
 [ 12416   7507]]
 
rfc_model_13to175
[[523839   4365]
 [ 13852   6071]]
 
rfc_model_7to175
[[519051   9153]
 [ 11482   8441]]
 
rfc_model_11to175
[[522725   5479]
 [ 13322   6601]]
 
rfc_model_5to175
[[513130  15074]
 [  9475  10448]]
 
rfc_model_1to175
[[456949  71255]
 [  3277  16646]]
 


In [None]:
# Evaluate Validation Model
validation_results75.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to175,0.850308,0.924591,0.865099,0.99288,0.135976
0,rfc_model_3to175,0.809032,0.967599,0.948938,0.987019,0.061232
0,rfc_model_5to175,0.74794,0.976633,0.971462,0.98187,0.044787
0,rfc_model_7to175,0.703176,0.980505,0.982671,0.978358,0.037646
0,rfc_model_9to175,0.68164,0.981573,0.986479,0.976726,0.035682
0,rfc_model_11to175,0.660476,0.982329,0.989627,0.975148,0.0343
0,rfc_model_13to175,0.64823,0.982904,0.991736,0.974238,0.033235


### - <font color=blue>Train Model: 50th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart50_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# # Create pipeline objects
rfc_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,rfc])

In [None]:
# Parameter Grid for Random Forest Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.impurity,['entropy']) \
    .addGrid(rfc.maxDepth,[7]) \
    .build()

# Instantiate Cross Validation block
rfc_cv = CrossValidator(estimator=rfc_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=7)

***Random Forest Classifiers***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
rfc_model_1to150 = rfc_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 85.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
rfc_model_3to150 = rfc_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 102.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
rfc_model_5to150 = rfc_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 120.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
rfc_model_7to150 = rfc_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 138.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
rfc_model_9to150 = rfc_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 156.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
rfc_model_11to150 = rfc_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 177.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
rfc_model_13to150 = rfc_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 186.0


In [None]:
# Dictionary of Models that were created
models_created50 = {
                  'rfc_model_1to150' : (rfc_model_1to150, DRV_Jan2016_1to1),
                  'rfc_model_3to150' : (rfc_model_3to150, DRV_Jan2016_3to1),
                  'rfc_model_5to150' : (rfc_model_5to150, DRV_Jan2016_5to1),
                  'rfc_model_7to150' : (rfc_model_7to150, DRV_Jan2016_7to1),
                  'rfc_model_9to150' : (rfc_model_9to150, DRV_Jan2016_9to1),
                  'rfc_model_11to150' : (rfc_model_11to150, DRV_Jan2016_11to1),
                  'rfc_model_13to150' : (rfc_model_13to150, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 50th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created50.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results50 = train_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_9to150
[[663874   8877]
 [ 10048   9100]]
 
rfc_model_13to150
[[667296   5455]
 [ 11619   7529]]
 
rfc_model_3to150
[[638826  33925]
 [  5351  13797]]
 
rfc_model_5to150
[[656084  16667]
 [  7976  11172]]
 
rfc_model_7to150
[[662386  10365]
 [  9606   9542]]
 
rfc_model_11to150
[[665845   6906]
 [ 10835   8313]]
 
rfc_model_1to150
[[594630  78121]
 [  3037  16111]]
 


In [None]:
# Evaluate Training Model
train_results50.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to150,0.862636,0.936112,0.883878,0.994919,0.117297
0,rfc_model_3to150,0.835059,0.970171,0.949573,0.991693,0.056766
0,rfc_model_5to150,0.77934,0.981561,0.975226,0.987989,0.035616
0,rfc_model_7to150,0.741461,0.985144,0.984593,0.985705,0.028864
0,rfc_model_9to150,0.731025,0.985942,0.986805,0.98509,0.027352
0,rfc_model_11to150,0.71194,0.986848,0.989735,0.983988,0.025641
0,rfc_model_13to150,0.692546,0.987363,0.991892,0.982886,0.024677


### - <font color=blue>Evaluate Validation Model: 50th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results50 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created50.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results50 = validation_results50.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_9to150
[[521314   6890]
 [ 12749   7174]]
 
rfc_model_13to150
[[524013   4191]
 [ 14431   5492]]
 
rfc_model_3to150
[[501016  27188]
 [  6444  13479]]
 
rfc_model_5to150
[[515107  13097]
 [ 10369   9554]]
 
rfc_model_7to150
[[520397   7807]
 [ 12314   7609]]
 
rfc_model_11to150
[[522948   5256]
 [ 13620   6303]]
 
rfc_model_1to150
[[465413  62791]
 [  3784  16139]]
 


In [None]:
# Evaluate Validation Model
validation_results50.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to150,0.845596,0.933247,0.881124,0.991935,0.121459
0,rfc_model_3to150,0.812541,0.967521,0.948527,0.987301,0.061358
0,rfc_model_5to150,0.727375,0.977724,0.975205,0.980267,0.042811
0,rfc_model_7to150,0.68357,0.981029,0.98522,0.976884,0.036709
0,rfc_model_9to150,0.673521,0.981507,0.986956,0.976128,0.035829
0,rfc_model_11to150,0.653209,0.982267,0.990049,0.974616,0.034437
0,rfc_model_13to150,0.633863,0.982537,0.992066,0.973199,0.033974


### - <font color=blue>Train Model: 25th Percentile, All Splits</font> -

In [None]:
# Create master list of feature names for model
final_features = quart25_feats

In [None]:
# Specify the numeric features we will be transforming, and the name of the resulting output feature
assembler = VectorAssembler(
    inputCols= final_features,
    outputCol='features')

In [None]:
# # Create pipeline objects
rfc_pipe = Pipeline(stages=[is_auto_renew_encoder,never_active_subscriber_encoder,
                            total_spent_zero_encoder,city_agg_encoder,payment_method_agg_encoder,
                            assembler,scaler,rfc])

In [None]:
# Parameter Grid for Random Forest Hyperparameterization
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.impurity,['entropy']) \
    .addGrid(rfc.maxDepth,[7]) \
    .build()

# Instantiate Cross Validation block
rfc_cv = CrossValidator(estimator=rfc_pipe,
                          estimatorParamMaps=paramGrid,
                          evaluator=binary_evaluator,
                          numFolds=7)

***Random Forest Classifiers***

In [None]:
# GBT Training on Trainset 1 to 1
start = time.time()
rfc_model_1to125 = rfc_cv.fit(DRV_Jan2016_1to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 78.0


In [None]:
# GBT Training on Trainset 3 to 1
start = time.time()
rfc_model_3to125 = rfc_cv.fit(DRV_Jan2016_3to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 100.0


In [None]:
# GBT Training on Trainset 5 to 1
start = time.time()
rfc_model_5to125 = rfc_cv.fit(DRV_Jan2016_5to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 117.0


In [None]:
# GBT Training on Trainset 7 to 1
start = time.time()
rfc_model_7to125 = rfc_cv.fit(DRV_Jan2016_7to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 144.0


In [None]:
# GBT Training on Trainset 9 to 1
start = time.time()
rfc_model_9to125 = rfc_cv.fit(DRV_Jan2016_9to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 160.0


In [None]:
# GBT Training on Trainset 11 to 1
start = time.time()
rfc_model_11to125 = rfc_cv.fit(DRV_Jan2016_11to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 187.0


In [None]:
# GBT Training on Trainset 13 to 1
start = time.time()
rfc_model_13to125 = rfc_cv.fit(DRV_Jan2016_13to1)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 190.0


In [None]:
# Dictionary of Models that were created
models_created25 = {
                  'rfc_model_1to125' : (rfc_model_1to125, DRV_Jan2016_1to1),
                  'rfc_model_3to125' : (rfc_model_3to125, DRV_Jan2016_3to1),
                  'rfc_model_5to125' : (rfc_model_5to125, DRV_Jan2016_5to1),
                  'rfc_model_7to125' : (rfc_model_7to125, DRV_Jan2016_7to1),
                  'rfc_model_9to125' : (rfc_model_9to125, DRV_Jan2016_9to1),
                  'rfc_model_11to125' : (rfc_model_11to125, DRV_Jan2016_11to1),
                  'rfc_model_13to125' : (rfc_model_13to125, DRV_Jan2016_13to1)}

### - <font color=blue>Evaluate Train Model: 25th Percentile, All Splits</font> -

In [None]:
## Create empty dataframe and populate with Train Set transformation results.
train_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set_test) in models_created25.items():
    temp = model1.transform(DRV_Jan2016)
    
    # Create a Dataframe of Train Results
    train_results25 = train_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_11to125
[[666116   6635]
 [ 11128   8020]]
 
rfc_model_1to125
[[589672  83079]
 [  2930  16218]]
 
rfc_model_5to125
[[658129  14622]
 [  8551  10597]]
 
rfc_model_9to125
[[665391   7360]
 [ 10702   8446]]
 
rfc_model_7to125
[[662669  10082]
 [  9767   9381]]
 
rfc_model_13to125
[[668023   4728]
 [ 12182   6966]]
 
rfc_model_3to125
[[641624  31127]
 [  5841  13307]]
 


In [None]:
# Evaluate Training Model
train_results25.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to125,0.861745,0.932023,0.876509,0.995056,0.124309
0,rfc_model_3to125,0.824343,0.971994,0.953732,0.990979,0.05343
0,rfc_model_5to125,0.765846,0.982694,0.978265,0.987174,0.033492
0,rfc_model_7to125,0.737467,0.985239,0.985014,0.985475,0.028688
0,rfc_model_9to125,0.715075,0.986604,0.98906,0.984171,0.026105
0,rfc_model_11to125,0.70449,0.986837,0.990138,0.983569,0.025673
0,rfc_model_13to125,0.678385,0.987496,0.992972,0.982091,0.02444


### - <font color=blue>Evaluate Validation Model: 25th Percentile, All Splits</font> -

In [None]:
# Create empty dataframe and populate with Train Set transformation results.
validation_results25 = pd.DataFrame()

# Transform Train Sets
for model_name, (model1, train_set) in models_created25.items():
    temp = model1.transform(DRV_Feb2016)    
    
    # Create a Dataframe of Train Results
    validation_results25 = validation_results25.append(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).evaluate())
    print('{}'.format(model_name))
    print(ClassEvaluator(resultname=model_name, resultdata=temp, model=model1).confusionmatrix())
    print(' ')

rfc_model_11to125
[[523263   4941]
 [ 13930   5993]]
 
rfc_model_1to125
[[460768  67436]
 [  3702  16221]]
 
rfc_model_5to125
[[517324  10880]
 [ 10920   9003]]
 
rfc_model_9to125
[[522636   5568]
 [ 13412   6511]]
 
rfc_model_7to125
[[520712   7492]
 [ 12477   7446]]
 
rfc_model_13to125
[[524635   3569]
 [ 14938   4985]]
 
rfc_model_3to125
[[504084  24120]
 [  7714  12209]]
 


In [None]:
# Evaluate Validation Model
validation_results25.sort_values('AUC', ascending=False)

Unnamed: 0,resultname,AUC,f1,precision,recall,error
0,rfc_model_1to125,0.843257,0.928332,0.87233,0.99203,0.129784
0,rfc_model_3to125,0.783573,0.969385,0.954336,0.984928,0.058078
0,rfc_model_5to125,0.715646,0.97936,0.979402,0.979328,0.039772
0,rfc_model_7to125,0.679777,0.981181,0.985816,0.976599,0.036431
0,rfc_model_9to125,0.658133,0.982161,0.989459,0.97498,0.034627
0,rfc_model_11to125,0.645727,0.982282,0.990646,0.974069,0.034428
0,rfc_model_13to125,0.621728,0.982663,0.993243,0.972315,0.033764


## <font color=black>Evaluation of Generalization over All Models</font>

In [None]:
results_all = train_resultsall[train_resultsall.columns[1:]] - validation_resultsall[validation_resultsall.columns[1:]]
results_all['resultname'] = train_resultsall['resultname']
results_all.sort_values('resultname', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.062111,0.004806,-0.000327,0.00981,-0.009253,rfc_model_11to1
0,0.060594,0.004905,-0.000347,0.009996,-0.009464,rfc_model_13to1
0,0.017969,0.003995,0.004709,0.003035,-0.005986,rfc_model_1to1
0,0.052734,0.003182,-0.000369,0.006986,-0.005756,rfc_model_3to1
0,0.058254,0.004123,-0.000251,0.008526,-0.007803,rfc_model_5to1
0,0.058983,0.004231,-0.000476,0.008906,-0.008067,rfc_model_7to1
0,0.061485,0.004411,-0.000711,0.009446,-0.00846,rfc_model_9to1


In [None]:
results_mean = train_results[train_results.columns[1:]] - validation_results[validation_results.columns[1:]]
results_mean['resultname'] = train_results['resultname']
results_mean.sort_values('resultname', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.057587,0.004869,0.00057,0.009091,-0.009329,rfc_model_11to1mean
0,0.055156,0.005203,0.001166,0.009141,-0.009995,rfc_model_13to1mean
0,0.017,0.010303,0.016431,0.001975,-0.016251,rfc_model_1to1mean
0,0.028026,0.003639,0.002677,0.004686,-0.006408,rfc_model_3to1mean
0,0.044836,0.003677,0.000706,0.006758,-0.00679,rfc_model_5to1mean
0,0.056035,0.004301,0.000418,0.00821,-0.008132,rfc_model_7to1mean
0,0.054917,0.004612,0.000575,0.008609,-0.008797,rfc_model_9to1mean


In [None]:
results_75 = train_results75[validation_results25.columns[1:]] - validation_results75[validation_results25.columns[1:]]
results_75['resultname'] = train_results75['resultname']
results_75.sort_values('resultname', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.058897,0.004667,-1.4e-05,0.009256,-0.008956,rfc_model_11to175
0,0.056246,0.005147,0.000879,0.009302,-0.009902,rfc_model_13to175
0,0.021273,0.008885,0.013535,0.002751,-0.01422,rfc_model_1to175
0,0.027369,0.002567,0.000551,0.004755,-0.004462,rfc_model_3to175
0,0.050597,0.003699,0.000222,0.007277,-0.006887,rfc_model_5to175
0,0.055659,0.004078,-0.000201,0.008356,-0.007739,rfc_model_7to175
0,0.057296,0.004331,-0.0002,0.008814,-0.00827,rfc_model_9to175


In [None]:
results_50 = train_results50[validation_results25.columns[1:]] - validation_results50[validation_results25.columns[1:]]
results_50['resultname'] = train_results50['resultname']
results_50.sort_values('resultname', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.058731,0.004581,-0.000315,0.009372,-0.008796,rfc_model_11to150
0,0.058682,0.004827,-0.000174,0.009687,-0.009297,rfc_model_13to150
0,0.01704,0.002866,0.002755,0.002983,-0.004162,rfc_model_1to150
0,0.022518,0.00265,0.001045,0.004392,-0.004593,rfc_model_3to150
0,0.051965,0.003836,2.1e-05,0.007722,-0.007195,rfc_model_5to150
0,0.057891,0.004115,-0.000627,0.008821,-0.007845,rfc_model_7to150
0,0.057504,0.004435,-0.000151,0.008962,-0.008477,rfc_model_9to150


In [None]:
results_25 = train_results25[validation_results25.columns[1:]] - validation_results25[validation_results25.columns[1:]]
results_25['resultname'] = train_results25['resultname']
results_25.sort_values('resultname', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,resultname
0,0.058763,0.004555,-0.000508,0.0095,-0.008755,rfc_model_11to125
0,0.056657,0.004834,-0.000271,0.009776,-0.009324,rfc_model_13to125
0,0.018488,0.003691,0.004179,0.003026,-0.005475,rfc_model_1to125
0,0.040771,0.002608,-0.000604,0.006051,-0.004648,rfc_model_3to125
0,0.0502,0.003335,-0.001137,0.007846,-0.00628,rfc_model_5to125
0,0.05769,0.004058,-0.000802,0.008876,-0.007744,rfc_model_7to125
0,0.056942,0.004443,-0.000399,0.009191,-0.008522,rfc_model_9to125


Based on generalization, the best in class features were as follows.
- **rfc_model_1to1mean** - 0.017000
- **rfc_model_3to150** - 0.022518
- **rfc_model_5to1mean** - 0.044836
- **rfc_model_7to175** - 0.055659
- **rfc_model_9to1mean** - 0.054917
- **rfc_model_11to1mean** - 0.057587
- **rfc_model_13to1mean** - 0.055156

-------------

## Recursive Feature Elimination - Ensemble of Models

In [56]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [5]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluatorPandas:

    def __init__(self, modelname, model, y_pred, y_true):
        
        # Initialize variables
        self.modelname = modelname
        self.y_pred = y_pred 
        self.y_true = y_true
        self.model = model
        
        # Calculate confusion matrix
        from sklearn.metrics import confusion_matrix
        self.cm = confusion_matrix(y_true,y_pred)
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fp = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fn = self.cm[1][0]
        
    def evaluate(self):
        
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        from sklearn.metrics import confusion_matrix, roc_curve, auc
        false_positive_rate, true_positive_rate, thresholds = roc_curve(self.y_true, self.y_pred)
        AUC = round(auc(false_positive_rate, true_positive_rate), ndigits=5)
        
        return pd.DataFrame(data=[[self.modelname, AUC, f1, precision, recall, error]], 
                            columns=['modelname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

### - <font color=blue>Import Data</font> -

#### <font color=purple>Build Ensemble - Train Set</font>

In [None]:
# Return Probability Values as Pandas DF
rfc_model_1to1_train = rfc_model_1to125.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_3to1_train = rfc_model_3to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_5to1_train = rfc_model_5to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_7to1_train = rfc_model_7to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_9to1_train = rfc_model_9to150.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_11to1_train = rfc_model_11to1mean.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_13to1_train = rfc_model_13to150.transform(DRV_Jan2016).select(['msno','prediction','is_churn']).toPandas()

In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Jan2016_predictsrfc = pd.merge(rfc_model_1to1_train[['msno','prediction']], rfc_model_3to1_train[['msno','prediction']], on='msno')
Jan2016_predictsrfc = pd.merge(Jan2016_predictsrfc, rfc_model_5to1_train[['msno','prediction']], on='msno')
Jan2016_predictsrfc = pd.merge(Jan2016_predictsrfc, rfc_model_7to1_train[['msno','prediction']], on='msno')
Jan2016_predictsrfc = pd.merge(Jan2016_predictsrfc, rfc_model_9to1_train[['msno','prediction']], on='msno')
Jan2016_predictsrfc = pd.merge(Jan2016_predictsrfc, rfc_model_11to1_train[['msno','prediction']], on='msno')
Jan2016_predictsrfc = pd.merge(Jan2016_predictsrfc, rfc_model_13to1_train[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Jan2016_predictsrfc.columns = ['msno', 'rfc_model_1to125', 'rfc_model_3to1mean', 'rfc_model_5to1mean', 'rfc_model_7to1mean', 'rfc_model_9to150', 'rfc_model_11to1mean', 'rfc_model_13to150', 'is_churn']

In [None]:
Jan2016_predictsrfc.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Jan2016_predictsrfc)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Jan2016_predictsrfc')

#### <font color=purple>Build Ensemble - Validation Set</font>

In [None]:
# Return Probability Values as Pandas DF
rfc_model_1to1_valid = rfc_model_1to125.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_3to1_valid = rfc_model_3to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_5to1_valid = rfc_model_5to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_7to1_valid = rfc_model_7to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_9to1_valid = rfc_model_9to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_11to1_valid = rfc_model_11to1mean.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()
rfc_model_13to1_valid = rfc_model_13to150.transform(DRV_Feb2016).select(['msno','prediction','is_churn']).toPandas()

In [None]:
# Create a single DF with all Predictions and Convert back to Spark DF
Feb2016_predictsrfc = pd.merge(rfc_model_1to1_valid[['msno','prediction']], rfc_model_3to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsrfc = pd.merge(Feb2016_predictsrfc, rfc_model_5to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsrfc = pd.merge(Feb2016_predictsrfc, rfc_model_7to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsrfc = pd.merge(Feb2016_predictsrfc, rfc_model_9to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsrfc = pd.merge(Feb2016_predictsrfc, rfc_model_11to1_valid[['msno','prediction']], on='msno')
Feb2016_predictsrfc = pd.merge(Feb2016_predictsrfc, rfc_model_13to1_valid[['msno','prediction', 'is_churn']], on='msno')

# Rename Columns
Feb2016_predictsrfc.columns = ['msno', 'rfc_model_1to125', 'rfc_model_3to1mean', 'rfc_model_5to1mean', 'rfc_model_7to1mean', 'rfc_model_9to150', 'rfc_model_11to1mean', 'rfc_model_13to150', 'is_churn']

In [None]:
Feb2016_predictsrfc.shape

In [None]:
# Export to GCS
sparkDf = spark.createDataFrame(Feb2016_predictsrfc)    
sparkDf.coalesce(1).write.option("header","true").csv('gs://dataproc-fb3fa26d-011a-4757-afb9-5efdd6e75d60-us-east1/Datasets/KKBox User Data/Feb2016_predictsrfc')

#### <font color=purple>Import Ensemble Sets (if already built)</font>

In [8]:
Jan2016_predictsrfc = pd.read_csv('D:\J-5 Local\Jan2016_predictsrfc.csv')
Feb2016_predictsrfc = pd.read_csv('D:\J-5 Local\Feb2016_predictsrfc.csv')

In [9]:
# Instantiate Train x and y
train_x = Jan2016_predictsrfc[Jan2016_predictsrfc.columns[1:-1]]
train_y = Jan2016_predictsrfc['is_churn']

In [10]:
# Instantiate Validation x and y
valid_x = Feb2016_predictsrfc[Feb2016_predictsrfc.columns[1:-1]]
valid_y = Feb2016_predictsrfc['is_churn']

### <font color=blue>Train Model: All Splits, All Splits, XGB + RFECV</font>

#### <font color=purple>XGBOOST - Recursive Feature Selection</font>

In [18]:
# Instantiate Estimators
rfc = RandomForestClassifier(max_depth=5)
gbm = GradientBoostingClassifier(max_depth=5)
xgb = XGBClassifier()

In [19]:
start = time.time()
rfcrfc1 = RFECV(rfc, min_features_to_select=1, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 70


In [20]:
start = time.time()
rfcrfc2 = RFECV(rfc, min_features_to_select=2, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 62


In [21]:
start = time.time()
rfcrfc3 = RFECV(rfc, min_features_to_select=3, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 54


In [46]:
start = time.time()
rfcrfc4 = RFECV(rfc, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 51


In [47]:
start = time.time()
rfcrfc5 = RFECV(rfc, min_features_to_select=5, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 39


In [24]:
start = time.time()
rfcgmb4 = RFECV(gbm, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 914


In [25]:
start = time.time()
rfcxgb4 = RFECV(xgb, min_features_to_select=4, cv=10, scoring='roc_auc').fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Time spent for training: 692


In [48]:
# Dictionary of Models that were createdgg
ensembles_created = {
                  'RFC_RFC1' : rfcrfc1,
                  'RFC_RFC2' : rfcrfc2,
                  'RFC_RFC3' : rfcrfc3,
                  'RFC_RFC4' : rfcrfc4,
                  'RFC_RFC5' : rfcrfc5,
                  'RFC_GBM4' : rfcgmb4,
                  'RFC_XGB4' : rfcxgb4}

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + RFECV </font> -

In [49]:
# Train Model Results
train_ensemble_results = pd.DataFrame()

for model_name, model1 in ensembles_created.items():

    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_ensemble_results = train_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

RFC_RFC1
[[668410   4341]
 [ 12026   7122]]
Time spent for training: 40

RFC_RFC2
[[668410   4341]
 [ 12025   7123]]
Time spent for training: 41

RFC_RFC3
[[668410   4341]
 [ 12025   7123]]
Time spent for training: 42

RFC_RFC4
[[668410   4341]
 [ 12026   7122]]
Time spent for training: 43

RFC_RFC5
[[668410   4341]
 [ 12026   7122]]
Time spent for training: 44

RFC_GBM4
[[668385   4366]
 [ 12010   7138]]
Time spent for training: 45

RFC_XGB4
[[668410   4341]
 [ 12026   7122]]
Time spent for training: 46



In [50]:
train_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,RFC_RFC1,0.68275,0.9879,0.993547,0.982326,0.023655
0,RFC_RFC2,0.68277,0.987901,0.993547,0.982327,0.023654
0,RFC_RFC3,0.68277,0.987901,0.993547,0.982327,0.023654
0,RFC_RFC4,0.68275,0.9879,0.993547,0.982326,0.023655
0,RFC_RFC5,0.68275,0.9879,0.993547,0.982326,0.023655
0,RFC_GBM4,0.68315,0.987893,0.99351,0.982348,0.023668
0,RFC_XGB4,0.68275,0.9879,0.993547,0.982326,0.023655


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + RFECV </font> -

In [51]:
# Validation Model Results
valid_ensemble_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    valid_ensemble_results = valid_ensemble_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    end = time.time()
    print('Time spent for training: {}'.format(round(end-start)))
    print('')

RFC_RFC1
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 47

RFC_RFC2
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 48

RFC_RFC3
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 49

RFC_RFC4
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 49

RFC_RFC5
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 50

RFC_GBM4
[[524700   3504]
 [ 14778   5145]]
Time spent for training: 51

RFC_XGB4
[[524727   3477]
 [ 14806   5117]]
Time spent for training: 52



In [52]:
valid_ensemble_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,RFC_RFC1,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_RFC2,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_RFC3,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_RFC4,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_RFC5,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_GBM4,0.62581,0.982872,0.993366,0.972607,0.033354
0,RFC_XGB4,0.62513,0.982872,0.993417,0.972558,0.033355


#### <font color=purple>Generalization Between Train and Validation</font>

In [54]:
train_valid_results = train_ensemble_results[train_ensemble_results.columns[1:]] - valid_ensemble_results[valid_ensemble_results.columns[1:]]
train_valid_results['modelname'] = train_ensemble_results['modelname']
train_valid_results.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05734,0.005021,0.000144,0.009742,-0.009685,RFC_GBM4
0,0.05762,0.005028,0.00013,0.009768,-0.0097,RFC_RFC1
0,0.05762,0.005028,0.00013,0.009768,-0.0097,RFC_RFC4
0,0.05762,0.005028,0.00013,0.009768,-0.0097,RFC_RFC5
0,0.05762,0.005028,0.00013,0.009768,-0.0097,RFC_XGB4
0,0.05764,0.005029,0.00013,0.00977,-0.009702,RFC_RFC2
0,0.05764,0.005029,0.00013,0.00977,-0.009702,RFC_RFC3


### <font color=blue>Train Model: All Splits, XGB + GridCV </font>

#### <font color=purple>XGBOOST Parameter Tuning</font>

In [78]:
# Create Param Grid

param_rfc = {
         'bootstrap': [True, False],
         'max_depth': [3, 5, 7],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [100, 500, 1000]
        }

param_gbm = {
        'learning_rate': [.1, .5, .01],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

param_xgb = {
        'learning_rate': [.1, .5, .01],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 500, 1000],
        }

# Instatiate Esitmator Object
rfc = RandomForestClassifier()
gbm = GradientBoostingClassifier()
xgb = XGBClassifier()

# Instatiate StratKFold Object
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle = True)

# Instatiate Random Search CV Object
rscv_rfc = RandomizedSearchCV(rfc, param_distributions=param_rfc, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)

rscv_gbm = RandomizedSearchCV(gbm, param_distributions=param_gbm, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)

rscv_xgb = RandomizedSearchCV(xgb, param_distributions=param_xgb, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)

In [73]:
start = time.time()
rfcrfc = rscv_rfc.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 13.2min finished


Time spent for training: 888


In [79]:
start = time.time()
rfcgmb = rscv_gbm.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 29.2min finished


Time spent for training: 1810


In [80]:
start = time.time()
rfcxgb = rscv_xgb.fit(train_x, train_y)
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  8.9min finished


Time spent for training: 564


In [81]:
# Dictionary of Models that were created
ensembles_created1 = {
                  'RFC_RFC_rscv' : rfcrfc,
                  'RFC_GBM_rscv' : rfcgmb,
                  'RFC_XGB_rscv' : rfcxgb}

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [82]:
# Train Model Results
train_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(train_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=train_y)
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_rscv_results = train_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

RFC_RFC_rscv
[[668410   4341]
 [ 12026   7122]]

RFC_GBM_rscv
[[666912   5839]
 [ 11001   8147]]

RFC_XGB_rscv
[[668384   4367]
 [ 12010   7138]]



In [83]:
train_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,RFC_RFC_rscv,0.68275,0.9879,0.993547,0.982326,0.023655
0,RFC_GBM_rscv,0.7084,0.987527,0.991321,0.983772,0.024339
0,RFC_XGB_rscv,0.68314,0.987892,0.993509,0.982348,0.02367


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [84]:
# Validation Model Results
validation_rscv_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created1.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(valid_x)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=valid_y)

    # Validation Results and Print Confusion Matrixes
    validation_rscv_results = validation_rscv_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

RFC_RFC_rscv
[[524727   3477]
 [ 14806   5117]]

RFC_GBM_rscv
[[523347   4857]
 [ 13743   6180]]

RFC_XGB_rscv
[[524700   3504]
 [ 14778   5145]]



In [85]:
validation_rscv_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,RFC_RFC_rscv,0.62513,0.982872,0.993417,0.972558,0.033355
0,RFC_GBM_rscv,0.6505,0.982535,0.990805,0.974412,0.033934
0,RFC_XGB_rscv,0.62581,0.982872,0.993366,0.972607,0.033354


#### <font color=purple>Generalization Between Train and Validation</font>

In [86]:
results_all = train_rscv_results[train_rscv_results.columns[1:]] - validation_rscv_results[validation_rscv_results.columns[1:]]
results_all['modelname'] = train_rscv_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.05733,0.00502,0.000143,0.009742,-0.009684,RFC_XGB_rscv
0,0.05762,0.005028,0.00013,0.009768,-0.0097,RFC_RFC_rscv
0,0.0579,0.004992,0.000516,0.00936,-0.009595,RFC_GBM_rscv
