# Pulling in the Data

In [26]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

offers_recieved = pd.read_csv('for_model.csv')
#offers_recieved = offers_recieved.drop(['id_x', 'id_y','value','person','offer_id','channels','influence_start',
#                                       'influence_end','Unnamed: 0','event','offer_type','index'], axis=1)


transcript = pd.read_json('data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)

#remove top 1% of spending, model will focus too much on these
offers_recieved = offers_recieved[offers_recieved['spending_during_offer'] < offers_recieved['spending_during_offer'].quantile(q=.99)]

offers_recieved.info()
offers_recieved.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17017 entries, 0 to 17188
Data columns (total 23 columns):
Unnamed: 0               17017 non-null int64
offer_type               17017 non-null object
offer_id                 17017 non-null object
person                   17017 non-null object
time                     17017 non-null int64
difficulty               17017 non-null int64
duration                 17017 non-null int64
reward                   17017 non-null int64
web                      17017 non-null int64
email                    17017 non-null int64
mobile                   17017 non-null int64
social                   17017 non-null int64
bogo                     17017 non-null int64
discount                 17017 non-null int64
informational            17017 non-null int64
age                      17017 non-null int64
income                   17017 non-null float64
member_date_int          17017 non-null int64
F                        17017 non-null int64
M           

Unnamed: 0.1,Unnamed: 0,time,difficulty,duration,reward,web,email,mobile,social,bogo,discount,informational,age,income,member_date_int,F,M,typical_spending_m1,num_offers,spending_during_offer
count,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0,17017.0
mean,38294.377211,334.227185,7.704413,6.772404,4.525181,0.794441,1.0,0.971382,0.791855,0.422871,0.464418,0.112711,64.182465,65770.022624,1303.206029,0.356232,0.48193,0.164729,2.067462,2.05627
std,21922.15499,195.430578,3.877208,2.060776,3.413907,0.404121,0.0,0.166736,0.405993,0.49403,0.498747,0.316248,27.261797,18510.039129,403.79846,0.478899,0.499688,0.189353,1.032515,5.721521
min,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.0,30000.0,1.0,0.0,0.0,7.1e-05,1.0,0.0
25%,19256.0,168.0,5.0,5.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,46.0,54000.0,1025.0,0.0,0.0,0.037141,1.0,0.0
50%,38458.0,408.0,10.0,7.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,59.0,65405.0,1459.0,0.0,0.0,0.11707,2.0,0.0
75%,57255.0,504.0,10.0,7.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,75.0,76000.0,1609.0,1.0,1.0,0.23283,3.0,0.0
max,76271.0,576.0,20.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,118.0,120000.0,1823.0,1.0,1.0,2.30802,6.0,31.67


# Training XGBoost Model and Finding the Best Parameter Values

In [27]:
offers_for_model = offers_recieved.drop(['Unnamed: 0','offer_type'],axis=1)

In [28]:
X, y = offers_for_model.iloc[:,:-1],offers_recieved.iloc[:,-1]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,shuffle=True)

X_train_for_model = X_train.drop(['person','offer_id'],axis=1)
X_test_for_model = X_test.drop(['person','offer_id'],axis=1)

data_dmatrix = xgb.DMatrix(data=X_train_for_model,label=y_train)

In [21]:
#mini grid search over max depth and n_estimators to define the best parameter
rmse = []
for i in range(1,25,1):
    for j in range(100,600,100):
        xg = xgb.XGBRegressor(objective="reg:squarederror",colsample_bytree = 0.3, learning_rate = .05,
                    max_depth = i, n_estimators = j,alpha = 2)
        xg.fit(X_train_for_model,y_train)
        test_preds = xg.predict(X_test_for_model)
        train_preds = xg.predict(X_train_for_model)
        test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
        train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
        rmse.append([i,j,train_rmse,test_rmse])
    
rmse = pd.DataFrame(rmse)
rmse.to_csv('rmse.csv')
    

In [29]:
#Using the best parameters, train a model and look at rmse outcomes for train and test
xg = xgb.XGBRegressor(objective="reg:squarederror",colsample_bytree = 0.3, learning_rate = .05,
                    max_depth = 5, n_estimators = 100,alpha = 2)
xg.fit(X_train_for_model,y_train)

preds = xg.predict(X_train_for_model)

train_rmse = np.sqrt(mean_squared_error(y_train, preds))
print("train rmse: %f" % (train_rmse))

test_preds = xg.predict(X_test_for_model)

test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
print("test rmse: %f" % (test_rmse))

train rmse: 5.284352
test rmse: 5.548542


# Making Model Actionable

In [30]:
#I want to make the model actionable by using it to identify which offer is "optimal" for each customer 
#ie, what offer does our model predict will lead to the highest spending by customer

#first some simple data processing to get ready to make these predictions for each customer
profile = X_test.groupby('person').mean()

#same transformations used in data cleaning to get the same columns in portfolio of offers
portfolio['web'] = pd.Series([0 for i in range(0,len(portfolio))])
portfolio['email'] = pd.Series([0 for i in range(0,len(portfolio))])
portfolio['mobile'] = pd.Series([0 for i in range(0,len(portfolio))])
portfolio['social'] = pd.Series([0 for i in range(0,len(portfolio))])


for index, i in enumerate(portfolio['channels']):
    for j in i:
        portfolio[j][index] = 1
        
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['offer_type'])], axis=1)
     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [None]:
def predict_portfolio(profile,portfolio,person,time):
    '''For a given person and portolio of offers, predict spending during offer for each offer in the portfolio.  This helps
    us a way to compare predictions for each type offer so we can decide which offer to provide. 
    inputs:
    profile: the profile dataframe, essentially the test data with one row for each person for faster processing
    portfolio: portfolio of offers with relevant data, formatted like train/test data
    person: the id of the person of interest
    time: time the offer will be made
    returns: 
    dataframe of predicting variables then predicted spending outcomes for each type of offer, an optimal offer is
    identified by highest predicted spending.   
    '''
    person_port = profile[profile.index == person]
    person_port_ref = profile[profile.index == person]
    for i in range(0,len(portfolio)):
        person_port.loc[i,'time'] = time
        person_port.loc[i,'difficulty'] = portfolio.loc[i,'difficulty']
        person_port.loc[i,'duration'] = portfolio.loc[i,'duration']
        person_port.loc[i,'reward'] = portfolio.loc[i,'reward']
        person_port.loc[i,'web'] = portfolio.loc[i,'web']
        person_port.loc[i,'email'] = portfolio.loc[i,'email']
        person_port.loc[i,'mobile'] = portfolio.loc[i,'mobile']
        person_port.loc[i,'social'] = portfolio.loc[i,'social']
        person_port.loc[i,'bogo'] = portfolio.loc[i,'bogo']
        person_port.loc[i,'discount'] = portfolio.loc[i,'discount']
        person_port.loc[i,'informational'] = portfolio.loc[i,'informational']
        person_port.loc[i,'age'] = person_port_ref.loc[person,'age']
        person_port.loc[i,'income'] = person_port_ref.loc[person,'income']
        person_port.loc[i,'member_date_int'] = person_port_ref.loc[person,'member_date_int']
        person_port.loc[i,'M'] = person_port_ref.loc[person,'M']
        person_port.loc[i,'F'] = person_port_ref.loc[person,'F']
        person_port.loc[i,'typical_spending_m1'] = person_port_ref.loc[person,'typical_spending_m1']
        person_port.loc[i,'num_offers'] = person_port_ref.loc[person,'num_offers']
    person_port = person_port.iloc[1:]
    preds = xg.predict(person_port)
    person_port['person'] = person
    person_port['portfolio_option'] = person_port.index
    person_port.reset_index
    person_port['preds'] = preds
    person_port['offer_id'] = portfolio['id']
    person_port['spending_bench'] = person_port['typical_spending_m1'] * person_port['duration'] 
    person_port['optimal'] = pd.Series([1 if i == max(preds) else 0 for i in preds])
    final_df = person_port[['person','time','offer_id','portfolio_option','preds','optimal']]
    
    return final_df

#run this function for each person in the test set
for i in range(0,len(profile.index)):
    time = 0
    person = profile.index[i]
    if i == 0:
        results_by_person = predict_portfolio(profile,portfolio,person,350)
    else:
        results_by_person = results_by_person.append(predict_portfolio(profile,portfolio,person,350))

#send this csv to the marketing department! Use optimal offer to get customers to spend more
results_by_person.to_csv('results_by_person_test.csv')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Business Value 

In [None]:
#Ultimately Starbucks wants to make more money. Now that we know the optimal offer by person and time, can we draw some 
#conclusions about whether customers will spend more money if given such an offer?  Looking at offers in the test set, 
#I look at times the "optimal" offer was provided and when it was not.  We'd hope to see that customers tend to spend 
#more when they given the optimal offer.   

test_set = pd.concat([X_test,y_test],axis=1)
test_set = test_set.reset_index()

optimal_offers = results_by_person[results_by_person['optimal'] == 1]

opt_pred_list = []
preds_index = optimal_offers.columns.get_loc('preds')
for i in range(0,len(test_set)):
    opt_pred = optimal_offers[optimal_offers['person'] == test_set.person[i]].iloc[0,preds_index]
    opt_pred_list.append(opt_pred)

opt_offerid_list = []
offerid_index = optimal_offers.columns.get_loc('offer_id')
for i in range(0,len(test_set)):
    opt_offerid = optimal_offers[optimal_offers['person'] == test_set.person[i]].iloc[0,offerid_index]
    opt_offerid_list.append(opt_offerid)
    
    
test_set['opt_pred'] = pd.Series(opt_pred_list)
opt_pred_index = test_set.columns.get_loc('opt_pred')
test_set['pred'] = test_preds
test_set['optimal_offer_id'] = pd.Series(opt_offerid_list)
test_set['optimal'] = test_set['optimal_offer_id'] == test_set['offer_id']


optimal_true = test_set[test_set['optimal'] == True]
optimal_false = test_set[test_set['optimal'] == False]
pct_diff = np.mean(optimal_true['spending_during_offer'])/np.mean(optimal_false['spending_during_offer'])-1

offers_optimal = len(optimal_true)
print('optimal offer given {}/{} times, {}%'.format(len(optimal_true),len(test_set),(len(optimal_true)/len(test_set))*100))
print('avg spending when optimal offer given: {}'.format(np.mean(optimal_true['spending_during_offer'])))
print('avg spending when optimal offer not given: {}'.format(np.mean(optimal_false['spending_during_offer'])))
print('pct diff: ' + str(pct_diff*100) + '%')

Far from conclusive, but encouraging to see that spending is higher for those given what the model suggests could be the "optimal" offer.  Ultimate test is a more statistically rigourous test, but not bad to see.  Optimal offer is only provided ~10% of the time, likely because there are 10 offers probably being assigned randomly when data was created.  However, this suggests room for improvement and potential business value in using the model.  This approach does not take into account how much someone typically spends without an offer, there may be individuals who sending an offer to might decrease their spending. The following analysis attempts to account for this: