In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn import linear_model, neighbors, ensemble
# `sklearn.cross_validation` was removed in scikit-learn 0.20. Prefer its
# replacement and fall back only on releases that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error


from querents import Querent
import utils


# Root directory holding the project's data files.
data_fp = '/mnt/c/data/b2w'
# CSV of historic bids, located directly under the data directory.
historic_fp = ''.join([data_fp, '/past_bids.csv'])


In [2]:
# Load the historic bids; a missing profit is treated as zero profit.
historic = pd.read_csv(historic_fp)
historic = historic.fillna({'profit': 0})

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every number computed from it, is the same on each Restart & Run All.
train, test = train_test_split(historic, test_size = 0.2, random_state = 42)

In [3]:
# Mean profit over the training rows flagged as a purchase.
mean_purchase_amt = train.profit[train.purchase == True].mean()
mean_purchase_amt

10.130785791173304

In [4]:
# Run both splits through the project's feature helper (train first, then test).
train_feat, test_feat = (
    utils.frame_to_features(split) for split in (train, test)
)

# Results frame: shares the test index and starts with the two observed
# columns, in this order; prediction columns are appended to it afterwards.
res = test[['purchase', 'profit']].copy()

In [5]:
# Classifier for the purchase flag.
log_mod = linear_model.LogisticRegression().fit(X = train_feat, y = train.purchase)

# Two regressors fitted on the profit column with the same features.
lin_mod, bayes_mod = (
    estimator().fit(X = train_feat, y = train.profit)
    for estimator in (linear_model.LinearRegression, linear_model.BayesianRidge)
)
# Disabled alternative:
#gbm_mod = ensemble.GradientBoostingRegressor().fit(train_feat, train.profit)

In [17]:
# Spot-check the classifier on one training row.
# Selecting with a list of labels keeps the row two-dimensional
# (1 x n_features); scikit-learn estimators expect 2-D input and reject a
# bare 1-D sample.
# NOTE(review): this still relies on index label 0 being present in
# train_feat - confirm, or switch to a positional lookup.
foo = train_feat.loc[[0]]
bar = log_mod.predict_proba(foo)[0, 1]
bar



0.27983035375775622

In [6]:
# Logistic model: hard class label plus the probability of the second class
# (column 1 of predict_proba).
purchase_prob = log_mod.predict_proba(test_feat)[:, 1]
res['log_class'] = log_mod.predict(test_feat)
res['log_purchase'] = purchase_prob
# Expected profit = purchase probability times the mean profit of a purchase.
res['log_profit'] = res['log_purchase'] * mean_purchase_amt

# Direct profit predictions from the two regressors.
for column, regressor in (('pred_profit', lin_mod), ('bayes_profit', bayes_mod)):
    res[column] = regressor.predict(test_feat)

In [7]:
# Preview the first 30 rows of observed values next to each model's predictions.
res.iloc[:30]

Unnamed: 0,purchase,profit,log_class,log_purchase,log_profit,pred_profit,bayes_profit
26355,True,15.0,False,0.381313,3.882245,3.891602,3.915134
20977,False,0.0,True,0.542515,5.523484,5.124023,5.12509
34246,False,0.0,False,0.081714,0.831951,0.198242,0.215557
5809,False,0.0,False,0.318072,3.238373,3.470703,3.4131
3947,True,11.0,False,0.422203,4.298554,4.233887,4.256306
84,True,9.0,False,0.258034,2.627109,2.849609,2.816658
9210,True,15.0,False,0.334707,3.40774,3.508789,3.532997
16232,False,0.0,False,0.460734,4.690852,4.477051,4.482026
26057,True,7.0,True,0.611455,6.225384,5.694824,5.652383
19111,False,0.0,False,0.454498,4.627361,4.441895,4.446139


In [8]:
# MSE of the logistic-derived expected profit against the observed profit.
mean_squared_error(y_true = res['profit'], y_pred = res['log_profit'])

23.30501447995071

In [9]:
# MSE of the linear-regression profit prediction against the observed profit.
mean_squared_error(y_true = res['profit'], y_pred = res['pred_profit'])

23.371519838142394

In [10]:
# MSE of the Bayesian-ridge profit prediction against the observed profit.
mean_squared_error(y_true = res['profit'], y_pred = res['bayes_profit'])

23.377296380963681

In [None]:
# Confusion matrix of the logistic classifier on the test split.
# `res` has no `truth`/`pred` columns; the observed label is `purchase` and the
# predicted label is `log_class`.
confusion_matrix(res.purchase, res.log_class, labels = [True, False])

In [None]:
## Look at the range of scores for prediction of True
# `res` has no `pred`/`score` columns; the predicted label is `log_class` and
# the purchase probability is `log_purchase`.
# Assumes `log_class` is boolean so it can be used directly as a row mask.
res.loc[res.log_class, 'log_purchase'].min(), res.loc[res.log_class, 'log_purchase'].max()

In [None]:
## Look at the range of scores for prediction of False
# `res` has no `pred`/`score` columns; the predicted label is `log_class` and
# the purchase probability is `log_purchase`.
# Assumes `log_class` is boolean so `~` inverts it element-wise.
res.loc[~res.log_class, 'log_purchase'].min(), res.loc[~res.log_class, 'log_purchase'].max()

In [None]:
## Average profit from those predicted to buy something
# The predicted label is stored in `res.log_class` (there is no `res.pred`).
# `res` shares the test index, so the mask lines up with `test` row for row.
test.loc[res.log_class, 'profit'].mean()

In [None]:
## Average profit from those predicted NOT to buy something
# The predicted label is stored in `res.log_class` (there is no `res.pred`).
# `res` shares the test index, so the mask lines up with `test` row for row.
test.loc[~res.log_class, 'profit'].mean()

The above analysis of our model is useful, but we may be able to get better results by dropping users we are uncertain about. Unfortunately, the model does not seem to be very certain about anyone... our max score is .66 out of 1.00.

In [None]:
# Mean profit and row count for test rows with purchase probability below 0.25.
# The probability column is `log_purchase` (there is no `res.score`).
low_bucket = res.log_purchase < 0.25
test.loc[low_bucket, 'profit'].mean(), test.loc[low_bucket, 'profit'].count()

In [None]:
# Mean profit and row count for test rows with purchase probability in
# [0.25, 0.5). The probability column is `log_purchase` (there is no
# `res.score`). The lower bound is inclusive so a score of exactly 0.25 is
# counted in this bucket.
mid_bucket = np.logical_and(0.25 <= res.log_purchase, res.log_purchase < 0.5)
test.loc[mid_bucket, 'profit'].mean(), test.loc[mid_bucket, 'profit'].count()

In [None]:
# Mean profit and row count for test rows with purchase probability of at
# least 0.5. The probability column is `log_purchase` (there is no
# `res.score`). The bound is inclusive so a score of exactly 0.5 is counted.
high_bucket = res.log_purchase >= 0.5
test.loc[high_bucket, 'profit'].mean(), test.loc[high_bucket, 'profit'].count()

In [None]:
# Persist the fitted model next to the data files.
# NOTE(review): the original dumped an undefined name `mod`. `log_mod` is used
# here because the analysis cells above evaluate the logistic classifier -
# confirm this is the model that should be saved.
# The `with` block closes the file even if pickling fails.
with open(data_fp+'/model.p', 'wb') as model_file:
    pickle.dump(log_mod, model_file)

In [None]:
# Does the classifier predict a purchase for at least one test row?
# The predicted label is stored in `res.log_class` (there is no `res.pred`).
any(res.log_class)