In [1]:
%matplotlib inline
import numpy as np
import scipy
import pandas
import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
import seaborn as sns
import sklearn as sl
from sklearn import preprocessing
import warnings
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides warnings that may matter; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Print up to 20 columns and allow lines up to 350 characters wide when frames are displayed as text.
pandas.set_option('display.max_columns', 20)
pandas.set_option('display.width', 350)

  import pandas.util.testing as tm


In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.metrics import confusion_matrix

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Seed NumPy's global random generator so that results can be reproduced on a re-run.
# NOTE(review): the train/test split and the random forest further down pass no random_state
# of their own, so they presumably depend on this global seed — confirm before re-running
# cells out of order.
np.random.seed(4684)
# Read data (done in the next cell)

In [7]:
# Load the emails table from a CSV file; the path is relative to the notebook's working directory.
data = pandas.read_csv('./dataset/emails.csv')

In [8]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,clicked
0,8,short_email,generic,9,Thursday,US,3,0
1,33,long_email,personalized,6,Monday,US,0,0
2,46,short_email,generic,14,Tuesday,US,3,0
3,49,long_email,personalized,11,Thursday,US,10,0
4,65,short_email,generic,8,Wednesday,UK,3,0


In [9]:
# Bin the variables according to the rules described above
# Hour: [1, 5] -> night, (5, 13] -> morning, (13, 21] -> afternoon, (21, 24] -> night2
# (include_lowest=True closes the first interval on the left so that hour 1 is kept)
hour_edges = [1, 5, 13, 21, 24]
hour_labels = ['night', 'morning', 'afternoon', 'night2']
data['hour_binned'] = pandas.cut(
    data['hour'],
    bins=hour_edges,
    labels=hour_labels,
    include_lowest=True,
)

In [10]:
# Fold the late-evening bucket 'night2' into 'night', then remove the category that is left unused
hour_merged = data['hour_binned'].replace('night2', 'night')
data['hour_binned'] = hour_merged.cat.remove_unused_categories()

In [11]:
# Bin purchases into half-open intervals (right=False):
#   [0, 1) -> None, [1, 4) -> Low, [4, 8) -> Medium, [8, inf) -> High
# The top edge is open-ended on purpose. With right=False a finite top edge (previously 23)
# excludes the edge itself, so any count at or above it would become NaN, be encoded by
# get_dummies as all zeros, and then be indistinguishable from the 'None' bin.
# Counts below 23 are binned exactly as before.
data['purchase_binned'] = pandas.cut(
    data['user_past_purchases'],
    bins=[0, 1, 4, 8, np.inf],
    include_lowest=True,
    right=False,
    labels=['None', 'Low', 'Medium', 'High'],
)

In [13]:
# Prepare the data for the model: dummy-encode the categorical columns (dropping the first
# level of each), then discard the identifier and the raw columns that were binned above.
unused_columns = ['email_id', 'hour', 'user_past_purchases']
encoded = pandas.get_dummies(data, drop_first=True)
data_dummy = encoded.drop(columns=unused_columns)

In [14]:
# Hold out 34% of the rows as a test set so the model is evaluated on rows it was not fit on.
# NOTE(review): no random_state is passed, so the split presumably follows the global NumPy
# seed set earlier in the notebook — confirm.
train, test = train_test_split(data_dummy, test_size = 0.34)

In [15]:
# Build the model. A random forest is used here, but this personalization approach works with any kind of model.
# Class 1 (clicked) is given a much larger weight than class 0; out-of-bag scoring is enabled
# so the fitted forest exposes OOB estimates for the training rows.
train_features = train.drop('clicked', axis=1)
rf = RandomForestClassifier(n_estimators=50, oob_score=True, class_weight={0: 0.05, 1: 0.95})
rf.fit(train_features, train['clicked'])

RandomForestClassifier(bootstrap=True, class_weight={0: 0.05, 1: 0.95},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=50, n_jobs=None, oob_score=True,
                       random_state=None, verbose=0, warm_start=False)

In [16]:
# Confusion matrix for the training rows based on out-of-bag estimates
# (rows: actual class, columns: predicted class).
oob_click_probability = rf.oob_decision_function_[:, 1]
oob_labels = oob_click_probability.round()
print(pandas.DataFrame(confusion_matrix(train['clicked'], oob_labels, labels=[0, 1])))

       0     1
0  58999  5612
1   1035   321


In [17]:
# Confusion matrix for the held-out test rows (rows: actual class, columns: predicted class).
test_features = test.drop('clicked', axis=1)
test_labels = rf.predict(test_features)
print(pandas.DataFrame(confusion_matrix(test['clicked'], test_labels, labels=[0, 1])))


       0     1
0  30433  2837
1    543   170


In [18]:
# Drop the label (not needed here) and keep one row per distinct feature combination found in the data
data_unique = data_dummy.drop(columns=['clicked']).drop_duplicates()

# Feed the combinations to the fitted model to get a prediction for each of them
predictions = rf.predict_proba(data_unique)

# Keep the second column of predict_proba (probability of class 1) next to the features
data_unique['prediction'] = predictions[:, 1]

In [19]:
# Preview the first three feature combinations together with their predicted probability.
data_unique.head(3)

Unnamed: 0,email_text_short_email,email_version_personalized,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US,hour_binned_morning,hour_binned_afternoon,purchase_binned_Low,purchase_binned_Medium,purchase_binned_High,prediction
0,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0.30329
1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0
2,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0.119575


In [20]:
# Order the combinations from the highest to the lowest prediction
data_unique = data_unique.sort_values(by='prediction', ascending=False)

# A segment is one (country, purchase bin) pair, identified here by its dummy columns.
# drop_duplicates keeps the first row it meets for each segment, which after the sort
# above is the row with the highest prediction.
segment_columns = [
    'user_country_FR', 'user_country_UK', 'user_country_US',
    'purchase_binned_Low', 'purchase_binned_Medium', 'purchase_binned_High',
]
best_segment = data_unique.drop_duplicates(subset=segment_columns).copy()

In [21]:
# Not strictly needed, but the dummy columns make the table hard to read,
# so the original categorical variables are rebuilt by hand.
# Country: the first flag that is set wins (UK, then US, then FR); no flag set means "ES".
country_flags = [
    (best_segment['user_country_UK'] == 1).values,
    (best_segment['user_country_US'] == 1).values,
    (best_segment['user_country_FR'] == 1).values,
]
best_segment['user_country'] = np.select(country_flags, ["UK", "US", "FR"], default="ES")

In [22]:
# Remove the country dummy columns from the segment table
country_dummies = [col for col in data_unique.columns if col.startswith('user_country_')]
best_segment = best_segment.drop(columns=country_dummies)

# Number of purchases: flags are checked High, then Medium, then Low; no flag set means "None"
purchase_flags = [
    (best_segment['purchase_binned_High'] == 1).values,
    (best_segment['purchase_binned_Medium'] == 1).values,
    (best_segment['purchase_binned_Low'] == 1).values,
]
best_segment['purchase_binned'] = np.select(purchase_flags, ["High", "Medium", "Low"], default="None")
purchase_dummies = [col for col in data_unique.columns if col.startswith('purchase_binned_')]
best_segment = best_segment.drop(columns=purchase_dummies)

In [23]:
# Email text: flag set -> "short_email", otherwise "long_email"
is_short_email = best_segment['email_text_short_email'] == 1
best_segment['email_text'] = np.where(is_short_email, "short_email", "long_email")
best_segment = best_segment.drop(columns='email_text_short_email')

# Email version: flag set -> "personalized", otherwise "generic"
is_personalized = best_segment['email_version_personalized'] == 1
best_segment['email_version'] = np.where(is_personalized, "personalized", "generic")
best_segment = best_segment.drop(columns='email_version_personalized')

# Weekday: flags are checked in the order listed; no flag set means "Friday"
weekday_names = ["Monday", "Saturday", "Sunday", "Thursday", "Tuesday", "Wednesday"]
weekday_flags = [(best_segment['weekday_' + day] == 1).values for day in weekday_names]
best_segment['weekday'] = np.select(weekday_flags, weekday_names, default="Friday")
weekday_dummies = [col for col in data_unique.columns if col.startswith('weekday_')]
best_segment = best_segment.drop(columns=weekday_dummies)

In [24]:
# Hour: afternoon is checked before morning; no flag set means "night"
hour_flags = [
    (best_segment['hour_binned_afternoon'] == 1).values,
    (best_segment['hour_binned_morning'] == 1).values,
]
best_segment['hour_binned'] = np.select(hour_flags, ["afternoon", "morning"], default="night")
hour_dummies = [col for col in data_unique.columns if col.startswith('hour_binned_')]
best_segment = best_segment.drop(columns=hour_dummies)

In [25]:
# Display the highest-prediction row kept for each (country, purchase bin) segment, with the decoded columns.
best_segment

Unnamed: 0,prediction,user_country,purchase_binned,email_text,email_version,weekday,hour_binned
483,0.752838,UK,High,short_email,personalized,Sunday,morning
1359,0.746932,US,High,long_email,personalized,Wednesday,night
2560,0.676526,UK,Medium,long_email,personalized,Wednesday,night
7361,0.653233,FR,High,short_email,personalized,Monday,morning
55,0.600676,ES,High,short_email,personalized,Tuesday,morning
144,0.574458,US,Medium,short_email,personalized,Tuesday,afternoon
1807,0.56872,UK,Low,short_email,personalized,Sunday,morning
334,0.565262,FR,Medium,short_email,personalized,Thursday,morning
10352,0.530126,ES,Medium,short_email,personalized,Monday,afternoon
619,0.524689,US,Low,short_email,personalized,Thursday,afternoon


In [26]:
# Number of rows in the full dataset for every (country, purchase bin) pair
segment_keys = ['user_country', 'purchase_binned']
count_segment = data[segment_keys].groupby(segment_keys).size().reset_index(name='counts')

# Turn the counts into proportions; easier to work with later for the weighted average
total_rows = count_segment['counts'].sum()
count_segment['weight'] = count_segment['counts'] / total_rows

# Merge on the columns the two frames share, so the final table also carries counts and weight
best_segment = pandas.merge(best_segment, count_segment).sort_values('prediction', ascending=False)

In [27]:
# Display the segment table again, now including the counts and weight columns added by the merge.
best_segment

Unnamed: 0,prediction,user_country,purchase_binned,email_text,email_version,weekday,hour_binned,counts,weight
0,0.752838,UK,High,short_email,personalized,Sunday,morning,2712,0.027134
1,0.746932,US,High,long_email,personalized,Wednesday,night,8325,0.083292
2,0.676526,UK,Medium,long_email,personalized,Wednesday,night,6622,0.066253
3,0.653233,FR,High,short_email,personalized,Monday,morning,1444,0.014447
4,0.600676,ES,High,short_email,personalized,Tuesday,morning,1422,0.014227
5,0.574458,US,Medium,short_email,personalized,Tuesday,afternoon,20008,0.20018
6,0.56872,UK,Low,short_email,personalized,Sunday,morning,7803,0.078069
7,0.565262,FR,Medium,short_email,personalized,Thursday,morning,3314,0.033157
8,0.530126,ES,Medium,short_email,personalized,Monday,afternoon,3389,0.033907
9,0.524689,US,Low,short_email,personalized,Thursday,afternoon,23364,0.233757


In [28]:
# Error rates come from the test-set confusion matrix (rows: actual class, columns: predicted class)
test_predicted = rf.predict(test.drop('clicked', axis=1))
conf_matrix = pandas.DataFrame(confusion_matrix(test['clicked'], test_predicted, labels=[0, 1]))

# Positive predictive value (ppv), also called precision:
# the proportion of predicted clicks that are actual clicks, TP / (TP + FP)
true_pos = conf_matrix.loc[1, 1]
false_pos = conf_matrix.loc[0, 1]
ppv = true_pos / (true_pos + false_pos)

# Proportion of predicted non-clicks that are actual clicks, FN / (FN + TN).
# The variable keeps the name `fpr`, but this quantity is the false omission rate,
# not the false positive rate.
false_neg = conf_matrix.loc[1, 0]
true_neg = conf_matrix.loc[0, 0]
fpr = false_neg / (false_neg + true_neg)

# Adjusted predicted click-rate for each segment
best_segment['adjusted_prediction'] = best_segment['prediction'] * ppv + (1 - best_segment['prediction']) * fpr

# Weight each segment's adjusted rate by its share of the dataset and compare with the starting click-rate
weighted_click_rate = (best_segment['adjusted_prediction'] * best_segment['weight']).sum()
CTR_comparison = pandas.DataFrame({'predicted_click_rate': [weighted_click_rate],
                                   'old_click_rate': [data['clicked'].mean()]})

In [29]:
# Display the weighted predicted click-rate next to the click-rate observed in the data.
CTR_comparison

Unnamed: 0,predicted_click_rate,old_click_rate
0,0.03741,0.0207
