# Churn H2O v10 upsampled RF grid search

### Start cluster and import aux functions

In [1]:
import h2o
import aux_functions_v2 as af
import numpy as np
import pandas as pd
from pandasql import sqldf, load_meat, load_births
# Helper that runs a SQL string through pandasql, handing it this notebook's
# global namespace so that table names in the query can refer to DataFrames
# defined at cell level.
pysqldf = lambda q: sqldf(q, globals())
# pandasql project page: https://github.com/yhat/pandasql
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Turn off H2O progress bars, then connect to (or start) a local H2O cluster,
# requesting a minimum of 15 GB of memory.
h2o.no_progress()
h2o.init(min_mem_size_GB=15)
# Directory for saved H2O models.
# NOTE(review): absolute, user-specific path; within this notebook it is only
# referenced by a commented-out h2o.save_model call. Consider making it
# configurable/relative.
path = '/Users/donny.ho/AnacondaProjects/churn_model_poc/saved_h20_models'


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
  Starting server from /Users/donny.ho/anaconda3/envs/churn_model_27/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/jh/c7r9rq7571jgzybkpcdg7vcr0000gp/T/tmpWqT3wg
  JVM stdout: /var/folders/jh/c7r9rq7571jgzybkpcdg7vcr0000gp/T/tmpWqT3wg/h2o_donny_ho_started_from_python.out
  JVM stderr: /var/folders/jh/c7r9rq7571jgzybkpcdg7vcr0000gp/T/tmpWqT3wg/h2o_donny_ho_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.16.0.2
H2O cluster version age:,1 month and 21 days
H2O cluster name:,H2O_from_python_donny_ho_8i7uwb
H2O cluster total nodes:,1
H2O cluster free memory:,14.38 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


### Clean data

In [2]:
%load_ext sql
%sql postgresql://@localhost/postgres

f = open("churn_query.sql")
sql_statement=f.read()
data = %sql $sql_statement
data = data.DataFrame()
data = af.upsample(data_frame=h2o.H2OFrame(data), minority_to_majority_ratio=1).as_data_frame()
data['rid'] = data.index
data['fold']  = np.random.randint(1, 11, data.shape[0])

#columns_to_convert=['rf_predict','gbm_predict','glm_predict','kmeans_predict','hascrcard','isactivemember','geography','gender','exited']
columns_to_convert=['rf_predict','gbm_predict','glm_predict','kmeans_predict','hascrcard','isactivemember','geography','gender','exited']
training_columns = ['age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']
response_column = 'exited'
data = h2o.H2OFrame(data)

train, test, valid = data.split_frame([0.70,0.15], seed=1234)
train       = train.as_data_frame()
test        = test.as_data_frame()
train_meta  = train.copy(deep=True)
test_meta   = test.copy(deep=True)

train      = af.column_to_factors(train,columns_to_convert)
test       = af.column_to_factors(test,columns_to_convert)
train_meta = af.column_to_factors(train_meta,columns_to_convert)
test_meta  = af.column_to_factors(test_meta,columns_to_convert)
print('Done')


u'Connected: postgres@postgres'

10000 rows affected.
Done


# Random Forest

In [3]:
from h2o.estimators import H2ORandomForestEstimator

model_id="rf"

# Settings of the random-forest base learner: 10-fold internal CV with kept CV
# predictions, and early stopping on AUC (3 rounds, tolerance 0.005).
rf_settings = dict(
    model_id                          = model_id,
    nfolds                            = 10,
    ntrees                            = 700,
    max_depth                         = 15,
    stopping_rounds                   = 3,
    stopping_tolerance                = 0.005,
    score_each_iteration              = True,
    fold_assignment                   = "Modulo",
    keep_cross_validation_predictions = True,
    stopping_metric                   = 'AUC',
    seed                              = 3000000,
    col_sample_rate_change_per_level  = 0.85
)
rf = H2ORandomForestEstimator(**rf_settings)

# Merge query, identical for every pass: keep every row of `table_to_insert`
# and overwrite the rf_* columns only for rows that were scored in this pass
# (matched on fold + rid). pandasql resolves both table names through
# globals(), so `table_to_insert` and `intermediate_frame` must stay
# module-level names.
sql_query = '''
    SELECT
        A.fold,
        A.rid,
        COALESCE(B.predict, A.rf_predict) as rf_predict,
        COALESCE(B.p1, A.rf_predict_prob) as rf_predict_prob,
        A.gbm_predict,
        A.gbm_predict_prob,
        A.glm_predict,
        A.glm_predict_prob,
        A.xgboost_predict,
        A.xgboost_predict_prob,
        A.kmeans_predict,
        A.age,
        A.tenure,
        A.balance,
        A.numofproducts,
        A.estimatedsalary,
        A.ratio,
        A.hascrcard,
        A.isactivemember,
        A.geography,
        A.gender,
        A.exited
    FROM table_to_insert AS A 
    LEFT JOIN intermediate_frame AS B ON A.fold = B.fold AND A.rid = B.rid 
    '''

# Passes 1-10 hold out one fold of `train` each; pass 11 fits on all of
# `train` and scores `test`.
for k in range(1,12):
    print('Training model: ',k)
    scoring_cv_fold = k<11
    if scoring_cv_fold:
        test_fold  = train[train['fold'] == k]
        train_fold = train[train['fold'] != k]
        table_to_insert = train_meta.as_data_frame()
    else:
        test_fold  = test
        train_fold = train
        table_to_insert = test_meta.as_data_frame()

    rf.train(x=training_columns,
             y=response_column,
             training_frame=train_fold,
             validation_frame=valid)

    # Predictions for the held-out rows, next to their join keys.
    fold_scores = rf.predict(test_fold)[['p1','predict']]
    intermediate_frame = test_fold[['fold','rid','exited']].cbind(fold_scores).as_data_frame()

    refreshed_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)
    if scoring_cv_fold:
        train_meta = refreshed_meta
    else:
        test_meta = refreshed_meta

('Training model: ', 1)
('Training model: ', 2)
('Training model: ', 3)
('Training model: ', 4)
('Training model: ', 5)
('Training model: ', 6)
('Training model: ', 7)
('Training model: ', 8)
('Training model: ', 9)
('Training model: ', 10)
('Training model: ', 11)


In [4]:
#train_meta
#test_fold[['fold','rid','exited']].cbind(glm.predict(test_fold)[['p1','predict']]).as_data_frame()
#rf.predict(test_fold)
#test_fold[['fold','rid','exited']].cbind(rf.predict(test_fold)['p1']).as_data_frame()

# GBM

In [5]:
from h2o.estimators import H2OGradientBoostingEstimator

model_id="gbm"

# Settings of the GBM base learner: 10-fold internal CV with kept CV
# predictions, and early stopping on AUC (3 rounds, tolerance 0.005).
gbm_settings = dict(
    model_id                          = model_id,
    nfolds                            = 10,
    ntrees                            = 700,
    max_depth                         = 15,
    stopping_rounds                   = 3,
    stopping_tolerance                = 0.005,
    score_each_iteration              = True,
    fold_assignment                   = "Modulo",
    keep_cross_validation_predictions = True,
    distribution                      ='AUTO',
    stopping_metric                   = 'AUC',
    seed                              = 3000000,
    col_sample_rate_change_per_level  = 0.85
)
gbm = H2OGradientBoostingEstimator(**gbm_settings)

# Merge query, identical for every pass: keep every row of `table_to_insert`
# and overwrite the gbm_* columns only for rows scored in this pass (matched on
# fold + rid). pandasql resolves both table names through globals(), so
# `table_to_insert` and `intermediate_frame` must stay module-level names.
sql_query = '''
    --SELECT COUNT(*) FROM intermediate_frame
    SELECT
        A.fold,
        A.rid,
        A.rf_predict,
        A.rf_predict_prob,
        COALESCE(B.predict, A.gbm_predict) as gbm_predict,
        COALESCE(B.p1, A.gbm_predict_prob) as gbm_predict_prob,
        A.glm_predict,
        A.glm_predict_prob,
        A.xgboost_predict,
        A.xgboost_predict_prob,
        A.kmeans_predict,
        A.age,
        A.tenure,
        A.balance,
        A.numofproducts,
        A.estimatedsalary,
        A.ratio,
        A.hascrcard,
        A.isactivemember,
        A.geography,
        A.gender,
        A.exited
    FROM table_to_insert AS A 
    LEFT JOIN intermediate_frame AS B ON A.fold = B.fold AND A.rid = B.rid 
    '''

# Passes 1-10 hold out one fold of `train` each; pass 11 fits on all of
# `train` and scores `test`.
for k in range(1,12):
    print('Training model: ',k)
    scoring_cv_fold = k<11
    if scoring_cv_fold:
        test_fold  = train[train['fold'] == k]
        train_fold = train[train['fold'] != k]
        table_to_insert = train_meta.as_data_frame()
    else:
        test_fold  = test
        train_fold = train
        table_to_insert = test_meta.as_data_frame()

    gbm.train(x=training_columns,
              y=response_column,
              training_frame=train_fold,
              validation_frame=valid)

    # Predictions for the held-out rows, next to their join keys.
    fold_scores = gbm.predict(test_fold)[['p1','predict']]
    intermediate_frame = test_fold[['fold','rid','exited']].cbind(fold_scores).as_data_frame()

    refreshed_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)
    if scoring_cv_fold:
        train_meta = refreshed_meta
    else:
        test_meta = refreshed_meta

('Training model: ', 1)
('Training model: ', 2)
('Training model: ', 3)
('Training model: ', 4)
('Training model: ', 5)
('Training model: ', 6)
('Training model: ', 7)
('Training model: ', 8)
('Training model: ', 9)
('Training model: ', 10)
('Training model: ', 11)


# GLM

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

model_id="glm"

# Settings of the binomial GLM base learner: lambda search enabled, 10-fold
# internal CV with kept CV predictions.
glm_settings = dict(
    model_id                          = model_id,
    family                            ='binomial',
    lambda_search                     = True,
    score_each_iteration              = True,
    nfolds                            = 10,
    fold_assignment                   = "Modulo",
    keep_cross_validation_predictions = True,
    seed                              = 3000000
)
glm = H2OGeneralizedLinearEstimator(**glm_settings)

# Merge query, identical for every pass: keep every row of `table_to_insert`
# and overwrite the glm_* columns only for rows scored in this pass (matched on
# fold + rid). pandasql resolves both table names through globals(), so
# `table_to_insert` and `intermediate_frame` must stay module-level names.
sql_query = '''
    --SELECT COUNT(*) FROM intermediate_frame
    SELECT
        A.fold,
        A.rid,
        A.rf_predict,
        A.rf_predict_prob,
        A.gbm_predict,
        A.gbm_predict_prob,
        COALESCE(B.predict, A.glm_predict) as glm_predict,
        COALESCE(B.p1, A.glm_predict_prob) as glm_predict_prob,
        A.xgboost_predict,
        A.xgboost_predict_prob,
        A.kmeans_predict,
        A.age,
        A.tenure,
        A.balance,
        A.numofproducts,
        A.estimatedsalary,
        A.ratio,
        A.hascrcard,
        A.isactivemember,
        A.geography,
        A.gender,
        A.exited
    FROM table_to_insert AS A 
    LEFT JOIN intermediate_frame AS B ON A.fold = B.fold AND A.rid = B.rid 
    '''

# Passes 1-10 hold out one fold of `train` each; pass 11 fits on all of
# `train` and scores `test`.
for k in range(1,12):
    print('Training model: ',k)
    scoring_cv_fold = k<11
    if scoring_cv_fold:
        test_fold  = train[train['fold'] == k]
        train_fold = train[train['fold'] != k]
        table_to_insert = train_meta.as_data_frame()
    else:
        test_fold  = test
        train_fold = train
        table_to_insert = test_meta.as_data_frame()

    glm.train(x=training_columns,
              y=response_column,
              training_frame=train_fold,
              validation_frame=valid)

    # Predictions for the held-out rows, next to their join keys.
    fold_scores = glm.predict(test_fold)[['p1','predict']]
    intermediate_frame = test_fold[['fold','rid','exited']].cbind(fold_scores).as_data_frame()

    refreshed_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)
    if scoring_cv_fold:
        train_meta = refreshed_meta
    else:
        test_meta = refreshed_meta

('Training model: ', 1)
('Training model: ', 2)
('Training model: ', 3)
('Training model: ', 4)
('Training model: ', 5)
('Training model: ', 6)
('Training model: ', 7)
('Training model: ', 8)
('Training model: ', 9)
('Training model: ', 10)
('Training model: ', 11)


# XGBOOST

In [None]:
from h2o.estimators.xgboost import H2OXGBoostEstimator

model_id="xgboost"

# Settings of the XGBoost base learner. Each key appears exactly once: the
# previous literal listed "fold_assignment" twice, which Python accepts
# silently (the last value wins) but which hides later edits to the first copy.
param = {
    "model_id"                          : model_id,
    "ntrees"                            : 1000,
    "max_depth"                         : 14,
    "learn_rate"                        : 0.02,
    "sample_rate"                       : 0.85,
    #"col_sample_rate_per_tree"          : 0.9,
    #"min_rows"                          : 5,
    "nfolds"                            : 10,
    "fold_assignment"                   : "Modulo",
    "seed"                              : 3000000,
    "score_tree_interval"               : 100,
    "score_each_iteration"              : True,
    "keep_cross_validation_predictions" : True
}

xgboost = H2OXGBoostEstimator(**param)

# Passes 1-10 hold out one fold of `train` each; pass 11 fits on all of
# `train` and scores `test`.
for k in range(1,12):
    print('Training model: ',k)
    if k<11:
        test_fold  = train[train['fold'] == k]
        train_fold = train[train['fold'] != k]
        table_to_insert = train_meta.as_data_frame()
    if k==11:
        test_fold  = test
        train_fold = train
        table_to_insert = test_meta.as_data_frame()

    # Unlike the RF/GBM/GLM cells, no validation frame is passed here.
    xgboost.train(x=training_columns, 
                  y=response_column,
                  training_frame=train_fold
                  #validation_frame=valid
                     )
    # Predictions for the held-out rows, next to their join keys.
    intermediate_frame = test_fold[['fold','rid','exited']].cbind(xgboost.predict(test_fold)[['p1','predict']]).as_data_frame()

    # Keep every row of `table_to_insert`; overwrite the xgboost_* columns only
    # for rows scored in this pass. pandasql resolves both table names through
    # globals(), so they must stay module-level names.
    sql_query = '''
    SELECT
        A.fold,
        A.rid,
        A.rf_predict,
        A.rf_predict_prob,
        A.gbm_predict,
        A.gbm_predict_prob,
        A.glm_predict,
        A.glm_predict_prob,
        COALESCE(B.predict, A.xgboost_predict) AS xgboost_predict,
        COALESCE(B.p1, A.xgboost_predict_prob) AS xgboost_predict_prob,
        A.kmeans_predict,
        A.age,
        A.tenure,
        A.balance,
        A.numofproducts,
        A.estimatedsalary,
        A.ratio,
        A.hascrcard,
        A.isactivemember,
        A.geography,
        A.gender,
        A.exited
    FROM table_to_insert AS A 
    LEFT JOIN intermediate_frame AS B ON A.fold = B.fold AND A.rid = B.rid 
    '''
    if k<11:
        train_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)

    if k==11:
        test_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)

('Training model: ', 1)
('Training model: ', 2)
('Training model: ', 3)


# K-MEANS

In [None]:
from h2o.estimators import H2OKMeansEstimator

model_id="kmeans"
training_columns = ['age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']
response_column = 'exited'

# K-means with 10 clusters; the cluster label becomes the `kmeans_predict`
# meta feature. No response column is involved in training.
kmeans_settings = dict(
    model_id = model_id,
    k        = 10,
    seed     = 3000000
)
kmeans = H2OKMeansEstimator(**kmeans_settings)

# Merge query, identical for every pass: keep every row of `table_to_insert`
# and overwrite `kmeans_predict` only for rows scored in this pass (matched on
# fold + rid). pandasql resolves both table names through globals(), so
# `table_to_insert` and `intermediate_frame` must stay module-level names.
sql_query = '''
    SELECT
        A.fold,
        A.rid,
        A.rf_predict,
        A.rf_predict_prob,
        A.gbm_predict,
        A.gbm_predict_prob,
        A.glm_predict,
        A.glm_predict_prob,
        A.xgboost_predict,
        A.xgboost_predict_prob,
        COALESCE(B.predict, A.kmeans_predict) AS kmeans_predict,
        A.age,
        A.tenure,
        A.balance,
        A.numofproducts,
        A.estimatedsalary,
        A.ratio,
        A.hascrcard,
        A.isactivemember,
        A.geography,
        A.gender,
        A.exited
    FROM table_to_insert AS A 
    LEFT JOIN intermediate_frame AS B ON A.fold = B.fold AND A.rid = B.rid 
    '''

# Passes 1-10 hold out one fold of `train` each; pass 11 fits on all of
# `train` and scores `test`.
for k in range(1,12):
    print('Training model: ',k)
    scoring_cv_fold = k<11
    if scoring_cv_fold:
        test_fold  = train[train['fold'] == k]
        train_fold = train[train['fold'] != k]
        table_to_insert = train_meta.as_data_frame()
    else:
        test_fold  = test
        train_fold = train
        table_to_insert = test_meta.as_data_frame()

    kmeans.train(x=training_columns, training_frame=train_fold)

    # Cluster assignments for the held-out rows, next to their join keys.
    fold_clusters = kmeans.predict(test_fold)['predict']
    intermediate_frame = test_fold[['fold','rid','exited']].cbind(fold_clusters).as_data_frame()

    refreshed_meta = af.column_to_factors(pysqldf(sql_query),columns_to_convert)
    if scoring_cv_fold:
        train_meta = refreshed_meta
    else:
        test_meta = refreshed_meta

In [None]:
from h2o.estimators.xgboost import H2OXGBoostEstimator

# Second-level XGBoost trained on the meta table (raw features plus selected
# base-learner predictions).
# NOTE(review): this cell rebinds `param`, `xgboost` and `training_columns`,
# replacing the objects created by the base-learner cells; every later cell
# that reads those names sees the values assigned here.
param = {
    "ntrees": 1000,
    "max_depth": 14,
    "learn_rate": 0.02,
    "sample_rate": 0.85,
    # "col_sample_rate_per_tree": 0.9,
    # "min_rows": 5,
    "seed": 3000000,
    "score_tree_interval": 100
}
print('Start')

# Feature sets tried so far; the trailing number is the value the author noted
# for that run.
#training_columns =  ['rf_predict','gbm_predict','glm_predict','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']
#training_columns = ['rf_predict','kmeans_predict','age','tenure','balance','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']

#training_columns = ['age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.982254
#training_columns = ['rf_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.81296
#training_columns = ['rf_predict','age','tenure','balance','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.7859
#training_columns = ['kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.977489
#training_columns = ['glm_predict','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.962398
training_columns = ['glm_predict','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.

response_column  = 'exited'
xgboost = H2OXGBoostEstimator(**param)
# The meta table is passed through column_to_factors once more before training.
meta_training_frame = af.column_to_factors(train_meta,columns_to_convert)
xgboost.train(x=training_columns,
              y=response_column,
              training_frame=meta_training_frame)

print('Done')

# H2O STACKING

In [None]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Stack the four supervised learners by model id.
# NOTE(review): `xgboost` and `training_columns` are whatever the most recently
# executed cell bound them to -- confirm they still refer to the cross-validated
# base learner and the base feature list when this cell runs.
base_learners = (rf, gbm, glm, xgboost)
all_model_ids = [learner.model_id for learner in base_learners]

ensemble = H2OStackedEnsembleEstimator(base_models=all_model_ids)

ensemble.train(
    x=training_columns,
    y=response_column,
    training_frame=train,
    validation_frame=valid
)

#h2o.save_model(model=ensemble, path=path, force=True)


In [None]:
import metric_validation as mv
#mv.metrics(ensemble,test,'p1')
# ROC curve of the current `xgboost` model on the hold-out meta table, using
# the 'p1' prediction column. Call it through `mv`, the alias imported in this
# cell; the previous `mv1` alias is only bound by a later cell, so a fresh
# top-to-bottom run raised NameError here.
mv.roc_plot(xgboost,test_meta,'p1')

# GBM STACKED

In [None]:
from h2o.estimators import H2OGradientBoostingEstimator

model_id="gbm_stacked"

# Feature sets tried for the stacked GBM; the number under each candidate is
# the value the author recorded for it.
# NOTE(review): every candidate below is commented out, so `training_columns`
# keeps whatever value an earlier cell assigned.
#training_columns = ['rf_predict','rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict','glm_predict_prob','xgboost_predict','xgboost_predict_prob','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']
# 0.98097
#training_columns = ['rf_predict','rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict_prob','xgboost_predict','xgboost_predict_prob','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.
# 0.97908
#training_columns = ['rf_predict','rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict_prob','xgboost_predict_prob','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.
# 0.981793
#training_columns = ['rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict_prob','xgboost_predict_prob','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.
# 0.976479
#training_columns = ['rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict_prob','xgboost_predict_prob','kmeans_predict','age','tenure','balance','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.
# 0.983668
#training_columns = ['rf_predict_prob','gbm_predict_prob','glm_predict_prob','xgboost_predict_prob','kmeans_predict','age','tenure','balance','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender'] # 0.
# 0.975641
#training_columns = ['rf_predict_prob','gbm_predict_prob','glm_predict_prob','xgboost_predict_prob','kmeans_predict','age','tenure','balance','estimatedsalary','ratio','hascrcard','geography','gender'] # 0.
# 0.979778

# Same hyper-parameters as the GBM base learner, under its own model id.
gbm_stacked_settings = dict(
    model_id                          = model_id,
    nfolds                            = 10,
    ntrees                            = 700,
    max_depth                         = 15,
    stopping_rounds                   = 3,
    stopping_tolerance                = 0.005,
    score_each_iteration              = True,
    fold_assignment                   = "Modulo",
    keep_cross_validation_predictions = True,
    distribution                      ='AUTO',
    stopping_metric                   = 'AUC',
    seed                              = 3000000,
    col_sample_rate_change_per_level  = 0.85
)
gbm_stacked = H2OGradientBoostingEstimator(**gbm_stacked_settings)

# Fit on the meta table; no validation frame is supplied.
gbm_stacked.train(
    x=training_columns,
    y=response_column,
    training_frame=train_meta
)

In [None]:
# Import the project's metric helpers under the alias `mv1` (other cells import
# the same module as `mv`).
import metric_validation as mv1
# ROC curve of the stacked GBM on the hold-out meta table, using the 'p1'
# prediction column.
mv1.roc_plot(gbm_stacked,test_meta,'p1')

In [None]:
# Hold-out metrics on test_meta for the stacked GBM and for each learner.
# Kept as separate top-level expressions on purpose: the setup cell sets
# ast_node_interactivity = "all", so every result is echoed in the output.
# NOTE(review): relies on `mv` having been imported by another cell
# (`import metric_validation as mv`).
mv.metrics(gbm_stacked,test_meta) 
mv.metrics(rf,test_meta) 
mv.metrics(gbm,test_meta) 
mv.metrics(glm,test_meta) 
mv.metrics(xgboost,test_meta) 

In [None]:
def random_training_columns(n_sets=9, min_columns=8):
    """Draw random feature subsets to try for the stacked model.

    Each draw is uniform over all subsets of the candidate columns that hold
    at least ``min_columns`` columns. The previous implementation built that
    size filter but then drew from the unfiltered power set, so empty or tiny
    subsets could be returned. It also materialised all 2**19 subsets and
    depended on ``powerset``/``random`` being defined by later cells.

    Parameters
    ----------
    n_sets : int, optional
        Number of subsets to draw. Defaults to 9, the count produced before.
    min_columns : int, optional
        Smallest admissible subset size. Defaults to 8, the threshold of the
        original (unused) filter.

    Returns
    -------
    list of dict
        One dict per draw with the keys ``'training_columns'`` (tuple of
        column names, in candidate-list order) and ``'index'`` (0-based
        position of the draw in the returned list).

    Raises
    ------
    ValueError
        If ``min_columns`` exceeds the number of candidate columns.
    """
    import random  # function-scope import: no earlier cell imports it

    train_col = ['rf_predict','rf_predict_prob','gbm_predict','gbm_predict_prob','glm_predict','glm_predict_prob','xgboost_predict','xgboost_predict_prob','kmeans_predict','age','tenure','balance','numofproducts','estimatedsalary','ratio','hascrcard','isactivemember','geography','gender']
    n_cols = len(train_col)

    sizes = list(range(max(min_columns, 0), n_cols + 1))
    if not sizes:
        raise ValueError('min_columns=%r exceeds the %d candidate columns' % (min_columns, n_cols))

    # Number of distinct subsets of each admissible size, C(n_cols, size),
    # built with exact integer arithmetic.
    subset_counts = []
    for size in sizes:
        count = 1
        for j in range(1, size + 1):
            count = count * (n_cols - size + j) // j
        subset_counts.append(count)
    total = sum(subset_counts)

    traning_col_list = []
    for index in range(n_sets):
        # Pick a size with probability proportional to the number of subsets
        # of that size, then a uniform subset of that size: together this is a
        # uniform draw over every admissible subset.
        ticket = random.randrange(total)
        for size, count in zip(sizes, subset_counts):
            if ticket < count:
                break
            ticket -= count
        chosen = set(random.sample(train_col, size))
        traning_col_list.append({
            'training_columns': tuple(col for col in train_col if col in chosen),
            'index': index,
        })
    return traning_col_list


In [None]:
# Train the stacked GBM once per random feature subset and record the result
# on the subset's own dict.
r = random_training_columns()

for col in r:
    feature_names = list(col['training_columns'])
    gbm_stacked.train(
        x=feature_names,
        y=response_column,
        training_frame=train_meta
    )

    # NOTE(review): every entry receives a reference to the SAME estimator
    # object, so after the loop all 'model' values point at the last fit.
    col['model'] = gbm_stacked
    # NOTE(review): metrics are computed on the training table (train_meta),
    # not on the hold-out table -- confirm this is intended.
    col['metrics'] = mv.metrics(gbm_stacked,train_meta)


In [None]:
# Inspect the first random feature subset: its recorded metrics, then the
# column names it used. Both values are echoed because the setup cell sets
# ast_node_interactivity = "all".
r[0]['metrics']
r[0]['training_columns']

In [None]:
import metric_validation as mv
# ROC curve of the current `xgboost` model on the hold-out meta table.
mv.roc_plot(xgboost,test_meta,'p1')
# Show the randomly drawn column sets. `traning_col_list` only exists as a
# local inside random_training_columns(), so referencing it here raised
# NameError; the list that function returned is bound to `r`.
r

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random
# Model and frame to evaluate; change these two names to plot another model.
classifier = xgboost
test_frame = test_meta

# Prediction column holding the positive-class score.
column='p1'

actual = test_frame.as_data_frame()
# Score the frame once and reuse the result; the previous version called
# predict() twice on the same frame (once for the column names, once for the
# values).
scored = classifier.predict(test_frame)
print(scored.columns)
predictions = scored.as_data_frame()[column].tolist()

false_positive_rate, true_positive_rate, thresholds = roc_curve(actual.exited.astype('float'), predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',
label='AUC = %0.6f'% roc_auc)
plt.legend(loc='lower right')
# Reference lines: the diagonal (chance level) and the TPR = 1 ceiling.
plt.plot([0,1],[0,1],'r--')
plt.plot([0,1],[1,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:
# metrics() takes the model AND the frame to score; the one-argument call
# raised TypeError. Evaluate on the hold-out meta table like the other
# metrics() calls in this notebook.
# NOTE(review): metrics() is defined in a cell further down, which must be run
# before this one.
metrics(xgboost, test_frame=test_meta)

# ROC curve

In [None]:
import metric_validation as mv
# AUC values the author recorded for earlier XGBoost runs:
# with original training column because the one I had in XGBOOST was accidentally wrong auc = 0.982254
# without pre_trained_models  auc = 0.970926
# without hascard auc = 0.965060
# with kmeans auc = 0.966456
# with kmeans but without hascard auc = 0.965060

# One ROC plot per model on the hold-out meta table, using the 'p1' prediction
# column. Kept as separate top-level statements; the setup cell sets
# ast_node_interactivity = "all", so any returned value is echoed per call.
mv.roc_plot(xgboost,test_frame=test_meta,column='p1')
mv.roc_plot(rf,test_frame=test_meta,column='p1')
mv.roc_plot(gbm,test_frame=test_meta,column='p1')
mv.roc_plot(glm,test_frame=test_meta,column='p1')

In [None]:
def metrics(predictor,test_frame):
    """Score ``predictor`` on ``test_frame`` and summarise its label accuracy.

    Parameters
    ----------
    predictor : object
        Model exposing ``predict(frame)``; the result must provide a
        ``'predict'`` column with the predicted label.
    test_frame : frame
        Frame with an ``'exited'`` column holding the true label (1 = positive
        class, 0 = negative class). It must support the frame operations used
        below (column selection, ``cbind``, ``asnumeric``, element-wise
        ``==``/``!=``/``&``, boolean row filtering and ``len``).

    Returns
    -------
    list of float
        ``[accuracy, true_positive_rate, true_negative_rate]``. A rate whose
        denominator is zero (empty frame, or a class that does not occur in
        ``test_frame``) is returned as ``float('nan')`` instead of raising
        ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Undefined rates are reported as NaN rather than crashing the cell.
        return float(numerator) / denominator if denominator else float('nan')

    consolidate_frame = test_frame['exited'].cbind(predictor.predict(test_frame)['predict'])
    actual = consolidate_frame['exited']
    predicted = consolidate_frame['predict']

    # Row masks reused by every count below.
    correct = actual == predicted
    wrong = actual != predicted
    is_positive = actual.asnumeric() == 1
    is_negative = actual.asnumeric() == 0

    accuracy = _ratio(len(consolidate_frame[correct]), len(consolidate_frame))

    # Actual positives: predicted correctly (TP) or missed (FN). The previous
    # code stored the missed positives under the name `false_positives` and the
    # wrongly flagged negatives under `false_negatives`; the arithmetic was
    # right, the names were swapped.
    true_positives = len(consolidate_frame[correct & is_positive])
    false_negatives = len(consolidate_frame[wrong & is_positive])
    true_positive_rate = _ratio(true_positives, true_positives + false_negatives)

    # Actual negatives: predicted correctly (TN) or flagged by mistake (FP).
    true_negatives = len(consolidate_frame[correct & is_negative])
    false_positives = len(consolidate_frame[wrong & is_negative])
    true_negative_rate = _ratio(true_negatives, true_negatives + false_positives)

    return [accuracy, true_positive_rate, true_negative_rate]

In [None]:
# Accuracy / true-positive rate / true-negative rate of each model on the
# hold-out meta table, using the notebook-local metrics() helper. Kept as
# separate top-level expressions: the setup cell sets
# ast_node_interactivity = "all", so each returned list is echoed.
metrics(xgboost, test_frame = test_meta)
metrics(rf, test_frame = test_meta)
metrics(gbm, test_frame = test_meta)
metrics(glm, test_frame = test_meta)

In [None]:
%load_ext autoreload
%autoreload mv
%autoreload metric_validation

In [None]:
import itertools

def powerset(L):
    """Return the power set of ``L`` as a set of tuples.

    Every subset is represented by a tuple whose elements keep the order they
    have in ``L``; the empty subset is the empty tuple. ``L`` must be a sized
    iterable of hashable elements. The result has ``2 ** len(L)`` members when
    the elements of ``L`` are distinct.

    Uses ``range`` instead of the Python-2-only ``xrange`` so the function runs
    unchanged on Python 2 and Python 3.
    """
    return {
        subset
        for size in range(len(L) + 1)
        for subset in itertools.combinations(L, size)
    }