In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max.columns', None)
pd.set_option('display.max_colwidth', None)
%matplotlib inline

In [37]:
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("../data/AdSmartABdatav4.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no,engagement
0,0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0,0
1,1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0,0
2,2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1,1
3,3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0,0
4,4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0,0


In [38]:
df.set_index('auction_id',inplace=True)

In [39]:
df['experiment'] = pd.Categorical(df.experiment)
df['device_make'] = pd.Categorical(df.device_make)
df['browser'] = pd.Categorical(df.browser)

cat_columns = df.select_dtypes(['category']).columns

df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [40]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no,engagement
auction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0008ef63-77a7-448b-bd1e-075f42c55e39,0,1,2020-07-10,8,46,6,2,0,0,0
000eabc5-17ce-4137-8efe-44734d914446,1,1,2020-07-07,10,46,6,2,0,0,0
0016d14a-ae18-4a02-a204-6ba53b52f2ed,2,1,2020-07-05,2,29,6,3,0,1,1
00187412-2932-4542-a8ef-3633901c98d9,3,0,2020-07-03,15,137,6,6,0,0,0
001a7785-d3fe-4e11-a344-c8735acacc2c,4,0,2020-07-03,15,46,6,2,0,0,0


In [41]:
dataX = df.iloc[:,[1,3,4,5,8]].values  
dataY = df.iloc[:, 9].values 

In [42]:
# splitting the data into train test
from sklearn.model_selection import train_test_split
train_ratio = 0.75
x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)


In [44]:
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict

import xgboost as xgb
from sklearn import tree
from sklearn.linear_model import LogisticRegression

import pickle

lrm = LogisticRegression()
dtm=tree.DecisionTreeClassifier(criterion='entropy',max_depth=3)
xgbm = xgb.XGBClassifier(eval_metric='mlogloss')

#fitting our classifiers
lrm.fit(x_train, y_train)
dtm.fit(x_train, y_train)
xgbm.fit(x_train, y_train)

pickle.dump(lrm, open("../models/lrmv1.pkl", 'wb'))
pickle.dump(dtm, open("../models/dtmv1.pkl", 'wb'))
pickle.dump(xgbm, open("../models/xgbmv1.pkl", 'wb'))

#make predictions
predictedlrm = cross_val_predict(lrm, x_test, y_test, cv=10)
predictedclf = cross_val_predict(dtm, x_test, y_test, cv=10)
predictedxgb = cross_val_predict(xgbm, x_test, y_test, cv=10)

#print the rmse and r2 scores
print('lrm RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictedlrm)))
print('dtm RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictedclf)))
print('xgb RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictedxgb)))
print('lrm R2:', r2_score(y_test, predictedlrm))
print('dtm r2:', r2_score(y_test, predictedclf))
print('xgb r2:', r2_score(y_test, predictedxgb))

lrm RMSE: 0.2706793799547229
dtm RMSE: 0.2706793799547229
xgb RMSE: 0.28231714488679166
lrm R2: 0.4253543001358958
dtm r2: 0.4253543001358958
xgb r2: 0.37487866433702177


In [45]:
feature_importance=pd.DataFrame({'feature':['experiment','hour','device_make','platform_os','browser'],'feature_importance':[abs(i) for i in lrm.coef_[0]]})
feature_importance.sort_values('feature_importance',ascending=False)

Unnamed: 0,feature,feature_importance
4,browser,6.830974
3,platform_os,1.888344
0,experiment,0.255923
1,hour,0.011018
2,device_make,0.001258


In [46]:
feature_importances=pd.DataFrame({'features':['experiment','hour','device_make','platform_os','browser'],'feature_importance':dtm.feature_importances_})
feature_importances.sort_values('feature_importance',ascending=False)

Unnamed: 0,features,feature_importance
4,browser,0.975711
2,device_make,0.01795
1,hour,0.006339
0,experiment,0.0
3,platform_os,0.0


In [47]:
feature_importances=pd.DataFrame({'features':['experiment','hour','device_make','platform_os','browser'],'feature_importance':xgbm.feature_importances_})
feature_importances.sort_values('feature_importance',ascending=False)

Unnamed: 0,features,feature_importance
4,browser,0.989931
2,device_make,0.003419
0,experiment,0.003002
1,hour,0.002693
3,platform_os,0.000954
