In [3]:
import xgboost as xgb

In [4]:
from scipy.stats import uniform
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
plt.rcParams['figure.figsize'] = [15, 5]

In [5]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


In [6]:
data = pd.read_csv('results.csv')

In [7]:
data['outcome'] = data['test_out'].apply(str) + data['control_out'].apply(str)

In [8]:
data.head()

Unnamed: 0,control_ix,test_ix,control_out,test_out,distance,f0,f1,f2,f3,f4,...,f3c,f4c,f5c,f6c,f7c,f8c,f9c,f10c,f11c,outcome
0,1,19114443,0,0,0.007983,1.478699,3.417861,8.390882,2.34476,3.506733,...,2.307094,3.506733,10.840586,-2.526549,-0.75298,-9.904933,16.894056,-1.8609,4.157648,0
1,9,9637197,0,0,0.01176,0.732334,3.263641,8.678399,2.703948,3.506733,...,2.61387,3.506733,10.161281,-1.343855,-0.166689,-8.286829,13.193869,-1.8609,4.157648,0
2,16,152609,0,0,0.003313,-0.742603,3.263641,8.980436,3.735871,3.506733,...,3.735871,3.506733,10.161281,-2.039146,-0.166689,-16.247004,9.850093,-1.8609,4.157648,0
3,33,81790,0,0,0.01197,0.947677,3.263641,8.442494,3.022606,4.013466,...,2.877999,3.803153,10.161281,-0.728742,-0.166689,-23.066055,24.184974,-8.601271,4.157648,0
4,38,3238104,0,0,0.020262,1.527733,3.263641,8.552558,3.162731,3.506733,...,2.703948,3.506733,10.840586,-4.16901,-2.751289,-15.614602,16.250365,-1.8609,4.157648,0


In [9]:
data.columns

Index(['control_ix', 'test_ix', 'control_out', 'test_out', 'distance', 'f0',
       'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
       'f0c', 'f1c', 'f2c', 'f3c', 'f4c', 'f5c', 'f6c', 'f7c', 'f8c', 'f9c',
       'f10c', 'f11c', 'outcome'],
      dtype='object')

In [10]:
features = ['f0','f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11']
target   = 'outcome'

In [11]:
br = pd.DataFrame(data['outcome'].value_counts()).reset_index()

In [12]:
br

Unnamed: 0,index,outcome
0,0,3375295
1,1,5002
2,10,4767
3,11,709


In [13]:
# as we can see the 00 (lost causes) segment is hugely overrepresented 

In [14]:
# sampling strategy.
# balance by undersampling
# binary between sleeping dogs and persuadables 
# convert to regression 01 lowest to 11 highest
# three class: persuadbles, (00,11), sleeping dogs: balance 

In [15]:
# undersampling
br = pd.DataFrame(data['outcome'].value_counts()).reset_index()

sample = pd.concat([data[data.outcome == '00'][features + [target]].sample(n=min(data['outcome'].value_counts()))
                         ,data[data.outcome == '01'][features + [target]].sample(n=min(data['outcome'].value_counts()))
                         ,data[data.outcome == '10'][features + [target]].sample(n=min(data['outcome'].value_counts()))
                         ,data[data.outcome == '11'][features + [target]].sample(n=min(data['outcome'].value_counts()))])

In [46]:
# binary 
br = data[(data.outcome == '01') | (data.outcome == '10')]['outcome'].value_counts()

sample = pd.concat([data[data.outcome == '01'][features + [target]].sample(n=min(data['outcome'].value_counts()))
                   ,data[data.outcome == '10'][features + [target]].sample(n=min(data['outcome'].value_counts()))])



In [47]:
# seed = 7
# test_size = 0.3
# X_trian, X_test, y_train, y_test = train_test_split(sample[features], sample[target], test_size=test_size, random_state=seed)

In [48]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [49]:
xgbc = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='multi:softmax', silent=True, nthread=1, num_class=4)

In [50]:
folds = 4
param_comb = 5

#skf = StratifiedKFold(y=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgbc, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=4, cv=5, verbose=3, random_state=1001 )

# Here we go
#start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(sample[features], sample[target])
#timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4 
[CV] gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4 
[CV] gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4 
[CV] gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4, score=0.658451 -   1.4s
[CV] gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4, score=0.626761 -   1.5s
[CV] gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4, score=0.640845 -   1.6s


  if diff:


[CV] gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4 
[CV]  gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4, score=0.630282 -   1.6s
[CV] gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=2, colsample_bytree=1.0, subsample=1.0, min_child_weight=5, max_depth=4, score=0.627660 -   1.4s
[CV] gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4, score=0.619718 -   1.7s
[CV] gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4, score=0.602113 -   2.0s
[CV] gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3 


  if diff:


[CV]  gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4, score=0.602113 -   2.3s
[CV] gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3 


  if diff:


[CV]  gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4, score=0.619718 -   1.8s
[CV] gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3 


  if diff:


[CV]  gamma=5, colsample_bytree=1.0, subsample=0.6, min_child_weight=10, max_depth=4, score=0.609929 -   1.7s
[CV] gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3 


  if diff:


[CV]  gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3, score=0.602113 -   1.6s
[CV] gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3 


  if diff:


[CV]  gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3, score=0.612676 -   1.6s
[CV] gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3, score=0.598592 -   1.5s
[CV] gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3, score=0.598592 -   1.3s
[CV] gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=1.5, colsample_bytree=0.8, subsample=0.6, min_child_weight=1, max_depth=3, score=0.574468 -   1.8s
[CV] gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4, score=0.630282 -   2.0s
[CV] gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4, score=0.633803 -   1.9s
[CV] gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4, score=0.626761 -   1.7s
[CV] gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4, score=0.605634 -   1.3s
[CV] gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4, score=0.616197 -   1.3s
[CV] gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4, score=0.633803 -   2.5s
[CV] gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4 


  if diff:


[CV]  gamma=0.5, colsample_bytree=1.0, subsample=0.8, min_child_weight=10, max_depth=4, score=0.581560 -   2.4s


  if diff:


[CV]  gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4, score=0.595070 -   1.1s


  if diff:


[CV]  gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4, score=0.598592 -   1.2s


  if diff:


[CV]  gamma=0.5, colsample_bytree=0.6, subsample=1.0, min_child_weight=5, max_depth=4, score=0.609929 -   1.2s


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   10.7s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=1, nthread=1, num_class=4, objective='multi:softmax',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=5, n_jobs=4,
          param_distributions={'gamma': [0.5, 1, 1.5, 2, 5], 'colsample_bytree': [0.6, 0.8, 1.0], 'subsample': [0.6, 0.8, 1.0], 'min_child_weight': [1, 5, 10], 'max_depth': [3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          scoring='accuracy', verbose=3)

In [51]:
test = pd.read_csv('data_000000000007.csv')

In [52]:
test.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,-0.179662,3.263641,9.187297,3.581365,3.506733,10.161281,-3.64503,-0.166689,-11.067974,9.850093,-1.8609,4.157648,1,0,0,0
1,1.569166,3.417861,8.782523,2.839861,3.506733,10.840586,-1.455483,-1.270076,-17.381754,15.228113,-1.8609,4.157648,1,0,1,0
2,-0.27457,3.263641,8.547123,3.735871,3.506733,10.161281,-0.606249,-0.166689,-17.455612,11.700187,-1.8609,4.157648,1,0,0,0
3,1.735415,3.263641,8.739826,3.317236,3.506733,10.161281,-2.491569,-0.166689,-21.01216,19.182694,-1.8609,4.157648,1,0,0,0
4,-1.016393,3.263641,9.08564,3.735871,3.506733,10.161281,-0.946781,-0.166689,-8.89859,9.850093,-1.8609,4.157648,1,0,0,0


In [53]:
y_test = random_search.predict_proba(test[features])

In [54]:
y_test

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [56]:
y_test_pred = random_search.predict(test[features])

  if diff:


In [57]:
y_test_pred

array(['10', '10', '10', ..., '10', '10', '10'], dtype=object)

In [58]:
outcomes = {} #'00', '01', '10', '11']:
    

In [59]:
i = 0 
for pred in y_test_pred:
    if pred not in outcomes:
        outcomes[pred] = np.argmax(y_test[i])
    else:
        pass
    i += 1

In [60]:
outcomes

{'01': 0, '10': 1}

In [None]:
from pylift.eval import UpliftEval


In [None]:
upev = UpliftEval(test['treatment'], test['conversion'], y_test[:,1])

In [None]:
upev.q1_qini

In [None]:
upev.plot(plot_type='qini',show_practical_max=True, show_no_dogs=True)