Background: (Jira: DA-422) 

Retrived person features per user device from Quest Mobile (3rd party persona service). 

To determine whether the QM features improve classification score on top of LLS common persona features.

## Classifier Score

In [1]:
from helper_sqlbuffet import *
# from bokeh.io import output_notebook, show
# from bokeh.plotting import figure
# from bokeh.models import ColumnDataSource
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from bayes_opt import BayesianOptimization

In [2]:
# read raw
# some source tables were imported from offline data
df_raw = query2df('./sql_training_data_with_lls_qm_persona_features.sql')

In [3]:
# set label and feature columns
_label = 'ordered_cc'

_lls_features = ['lls_profession', 'gender', 'is_child', 'city_level', 'province', 'platform', 'channel']
_qm_features = ['age', 'marriage', 'children', 'qm_profession', 'online_purchasing_power']

_lls_cate_features = ['lls_profession', 'gender', 'province', 'platform', 'channel']
_qm_cate_features = ['age', 'marriage', 'children', 'qm_profession']

# convert object to string columns
for col in _lls_cate_features + _qm_cate_features:
    df_raw[col] = df_raw[col].astype('category')

In [4]:
# Bayesian Opt
# ref: https://gist.github.com/mohit-sinha/be3f2999eb21d1992d03b7590fe2d88b
def lgb_eval(num_leaves,
             max_depth,
             min_child_weight,
             subsample,
             colsample_bytree):
    
    clf = lgb.LGBMClassifier(
        objective = 'binary',
        metric= 'auc',
        eval_metric= 'auc',
        n_estimators= 2000,
        # early_stopping_rounds = 50,
        num_leaves= int(num_leaves),
        max_depth= int(max_depth),
        min_child_weight= min_child_weight,
        subsample= subsample,
        colsample_bytreeple_bytree= colsample_bytree,
        verbose =-1
    )
    
    scores = cross_val_score(clf, train_x, train_y, cv=4, scoring='roc_auc')
    return np.mean(scores)

params = {
          'num_leaves': (3, 20),
          'max_depth': (2, 10),
          'min_child_weight': (0.01, 70),
          'subsample': (0.4, 1),                                                
          'colsample_bytree': (0.4, 1)
          }

def bo_opt(eval_model, params):
    optimizer = BayesianOptimization(eval_model, params)
    optimizer.maximize(init_points=5, n_iter=10)
    print(optimizer.max['params'])
    print(optimizer.max['target'])

In [5]:
# use lls + qm features
train_x = df_raw[_lls_features + _qm_features]
train_y = df_raw[_label]

bo_opt(lgb_eval, params)

|   iter    |  target   | colsam... | max_depth | min_ch... | num_le... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6667  [0m | [0m 0.6567  [0m | [0m 7.872   [0m | [0m 7.412   [0m | [0m 12.73   [0m | [0m 0.9997  [0m |
| [95m 2       [0m | [95m 0.7132  [0m | [95m 0.4164  [0m | [95m 5.698   [0m | [95m 67.84   [0m | [95m 6.234   [0m | [95m 0.5365  [0m |
| [0m 3       [0m | [0m 0.6705  [0m | [0m 0.7529  [0m | [0m 4.068   [0m | [0m 16.84   [0m | [0m 19.38   [0m | [0m 0.4073  [0m |
| [0m 4       [0m | [0m 0.6756  [0m | [0m 0.5727  [0m | [0m 6.514   [0m | [0m 44.16   [0m | [0m 16.7    [0m | [0m 0.695   [0m |
| [0m 5       [0m | [0m 0.701   [0m | [0m 0.9604  [0m | [0m 3.334   [0m | [0m 28.67   [0m | [0m 9.405   [0m | [0m 0.5422  [0m |
| [95m 6       [0m | [95m 0.7535  [0m | [95m 0.8148  [0m | [95m 2.0     [0m | [95m 70.0    [0m | [95

In [6]:
# use lls features only
train_x = df_raw[_lls_features]
train_y = df_raw[_label]

bo_opt(lgb_eval, params)

|   iter    |  target   | colsam... | max_depth | min_ch... | num_le... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7142  [0m | [0m 0.6403  [0m | [0m 6.02    [0m | [0m 40.13   [0m | [0m 18.6    [0m | [0m 0.7314  [0m |
| [95m 2       [0m | [95m 0.7171  [0m | [95m 0.6098  [0m | [95m 7.847   [0m | [95m 44.58   [0m | [95m 17.44   [0m | [95m 0.9163  [0m |
| [95m 3       [0m | [95m 0.7342  [0m | [95m 0.971   [0m | [95m 5.26    [0m | [95m 63.94   [0m | [95m 18.15   [0m | [95m 0.5056  [0m |
| [0m 4       [0m | [0m 0.7113  [0m | [0m 0.7882  [0m | [0m 6.009   [0m | [0m 33.32   [0m | [0m 12.03   [0m | [0m 0.8579  [0m |
| [95m 5       [0m | [95m 0.7452  [0m | [95m 0.8589  [0m | [95m 3.731   [0m | [95m 62.35   [0m | [95m 10.9    [0m | [95m 0.9453  [0m |
| [95m 6       [0m | [95m 0.764   [0m | [95m 0.6681  [0m | [95m 2.374   [0m | [95m 69.79

So Quest Mobile features doesn't help improving AUC.

## Effect Size

In [7]:
import scipy.stats as ss

def cramers_stat(crosstab):
    chi2 = ss.chi2_contingency(crosstab)[0]
    n = crosstab.values.sum()
    return np.sqrt(chi2 / (n*(min(crosstab.shape)-1)))

In [8]:
print("--- LLS features")
for col in _lls_cate_features:
    cm = pd.crosstab(df_raw[_label].astype(str),df_raw[col])
    print("Cramer's V for {0}: {1}".format(col, round(cramers_stat(cm),4)))

print("--- QM features")
for col in _qm_cate_features:
    cm = pd.crosstab(df_raw[_label].astype(str),df_raw[col])
    print("Cramer's V for {0}: {1}".format(col, round(cramers_stat(cm),4)))

--- LLS features
Cramer's V for lls_profession: 0.1358
Cramer's V for gender: 0.0339
Cramer's V for province: 0.1122
Cramer's V for platform: 0.0552
Cramer's V for channel: 0.0774
--- QM features
Cramer's V for age: 0.0312
Cramer's V for marriage: 0.0194
Cramer's V for children: 0.0202
Cramer's V for qm_profession: 0.033


In [9]:
def eta_squared(col, val):
    mean_global = val.mean()
    ss_total = np.sum(np.square(val - mean_global))
    grouped = pd.DataFrame({'y': col, 'x': val}).groupby('y').agg({np.mean, len}).reset_index()
    ss_treat = np.sum(np.square(grouped['x']['mean'] - mean_global)*grouped['x']['len'])
    return ss_treat/ss_total

In [11]:
print("--- LLS features")
for col in _lls_features:
    if col not in _lls_cate_features:
        eta_2 = eta_squared(df_raw[_label].astype(str),df_raw[col])
        print("Eta-Squared for {0}: {1}".format(col, round(eta_2,4)))

print("--- QM features")
for col in _qm_features:
    if col not in _qm_cate_features:
        eta_2 = eta_squared(df_raw[_label].astype(str),df_raw[col])
        print("Eta-Squared for {0}: {1}".format(col, round(eta_2,4)))

--- LLS features
Eta-Squared for is_child: 0.0022
Eta-Squared for city_level: 0.0006
--- QM features
Eta-Squared for online_purchasing_power: 0.0013
