In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn import linear_model, neighbors, ensemble
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix


from querents import Querent
import utils


data_fp = '/mnt/c/data/b2w'
historic_fp = data_fp + '/past_bids.csv'


In [2]:
historic = pd.read_csv(historic_fp)
historic = historic.fillna({'profit': 0})

train, test = train_test_split(historic, test_size = 0.2)

In [3]:
train_feat = utils.frame_to_features(train)
test_feat = utils.frame_to_features(test)
train_target = train.purchase

res = pd.DataFrame(index = test.index)
res['truth'] = test.purchase

In [5]:
mod = linear_model.LogisticRegression()
mod.fit(train_feat, train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
res['pred'] = mod.predict(test_feat)
res['score'] = mod.predict_proba(test_feat)[:, 1]

In [19]:
res.head()

Unnamed: 0,truth,pred,score
16325,True,False,0.434919
5625,False,False,0.215304
17591,True,False,0.431855
1091,True,False,0.085566
29879,False,False,0.188876


In [20]:
confusion_matrix(res.truth, res.pred, labels = [True, False])

array([[ 342, 2427],
       [ 289, 6942]])

In [21]:
## Look at the range of scores for prediction of True
res.loc[res.pred, :].score.min(), res.loc[res.pred, :].score.max()

(0.50008583322955147, 0.66345431718145076)

In [25]:
## Look at the range of scores for prediction of False
res.loc[~res.pred, 'score'].min(), res.loc[~res.pred, 'score'].max()

(0.06227240768820471, 0.49993959236442664)

In [27]:
## Average profit from those predicted to buy something
test.loc[res.pred, 'profit'].mean()

5.481774960380348

In [28]:
## Average profit from those predicted NOT to buy something
test.loc[~res.pred, 'profit'].mean()

2.631871064147721

The above analysis of our model is useful, but we may be able to get better results by dropping out users we are uncertian about. Unfortunately, the model does not seem to be very certian about anyone... our max score is .66 out or 1.00

In [61]:
test.loc[res.score < 0.25, 'profit'].mean(), test.loc[res.score < 0.25, 'profit'].count()

(1.7326895418122046, 4867)

In [62]:
test.loc[ np.logical_and(0.25 < res.score, res.score < 0.5), 'profit'].mean(), test.loc[ np.logical_and(0.25 < res.score, res.score < 0.5), 'profit'].count()

(3.6039537983118612, 4502)

In [63]:
test.loc[ 0.5 < res.score, 'profit'].mean(), test.loc[ 0.5 < res.score, 'profit'].count()

(5.481774960380348, 631)

In [64]:
pickle.dump(mod, open( data_fp+'/model.p', 'wb'))