In [1]:
import torch
import torch.nn as nn
import numpy as np
import pickle

In [None]:
!pip install space-bandits
from space_bandits import load_model, BanditAlgorithm

In [5]:
## plug in gdrive to load the data from gdrive
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
import sys
## path to the contextual_dataset_wu.py and toy_problem_wu.py files
path = '/content/drive/MyDrive/Bandit_Problem/'
sys.path.append(path)

from contextual_dataset_wu import ContextualDataset
from toy_problem_wu import generate_dataframe

# Define Models

Wide Model:
*   Input - User ID
*   Output - Expected reward for each action

Deep Model:
*   Input - Context
*   Output - Expected reward for each action

Wide and Deep Model: 

*   Combines output from the Wide model and the Deep model



In [4]:
class Wide_Model(nn.Module):
    def __init__(self, embed_size=100, n_action=2, embed_dim=64):
        ## Learns expected reward for each action given User ID
        ## Uses embeddings to 'memorize' individual users
        ## embed_size - size of the dictionary of embeddings
        ## embed_dim -  size of each embedding vector
        ## n_action - number of possible actions

        super(Wide_Model, self).__init__()
        self.embed_size = embed_size
        self.n_action = n_action
        self.embed_dim = embed_dim
        
        self.embedding = nn.Embedding(self.embed_size, self.embed_dim)
        self.lr = nn.Linear(self.embed_dim, self.n_action)
    
    def forward(self, x):
        ## Input: user ID
        x = self.embedding(x)
        x = self.lr(x)
        return x.squeeze(axis=0)


class Deep_Model(nn.Module):
    def __init__(self, context_size=5, layer_sizes=[50,100], n_action=2):
        ## Learns expected reward for each action given context
        ## layer_sizes (list of integers): defines neural network architecture: n_layers = len(layer_sizes), 
        ## value is per-layer width. (default [50,100])
        super(Deep_Model, self).__init__()
        self.context_size = context_size
        self.layer_sizes = layer_sizes
        self.n_action = n_action

        self.layers = []
        self.build_model()
        self.activation = nn.ReLU()
    
    def build_layer(self, inp_dim, out_dim):
        """Builds a layer in deep model """
        layer = nn.modules.linear.Linear(inp_dim,out_dim)
        nn.init.uniform_(layer.weight)
        name = f'layer {len(self.layers)}'
        self.add_module(name, layer)
        return layer
    
    def build_model(self):
        """
        Defines the actual NN model with fully connected layers.
        """
        for i, layer in enumerate(self.layer_sizes):
            if i==0:
                inp_dim = self.context_size
            else:
                inp_dim = self.layer_sizes[i-1]
            out_dim = self.layer_sizes[i]
            new_layer = self.build_layer(inp_dim, out_dim)
            self.layers.append(new_layer)
        output_layer = self.build_layer(out_dim, self.n_action)
        self.layers.append(output_layer)

    def forward(self, x):
        """forward pass of the neural network"""
        ## Input: context
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i != len(self.layers)-1:
                x = self.activation(x)
        return x.squeeze(axis=0)
  

class Wide_and_Deep_Model(nn.Module):
    def __init__(self, context_size=5, deep_layer_sizes=[50,100], n_action=2, embed_size=100, wide_embed_dim=64):
        super(Wide_and_Deep_Model, self).__init__()
        self.n_action = n_action
        self.context_size = context_size
        self.deep_layer_sizes = deep_layer_sizes
        self.embed_size = embed_size
        self.wide_embed_dim = wide_embed_dim

        self.wide_model = Wide_Model(embed_size=self.embed_size, n_action=self.n_action, embed_dim=self.wide_embed_dim)
        self.deep_model = Deep_Model(context_size=self.context_size, layer_sizes=self.deep_layer_sizes, n_action=self.n_action)
    
    def forward(self, wide_input, deep_input):
        x_wide = self.wide_model(wide_input)
        x_deep = self.deep_model(deep_input)

        x = x_wide + x_deep

        return x.squeeze(-1), x_wide.squeeze(-1), x_deep.squeeze(-1)

# Test the wide and deep model
Three modes: wide, deep, or wide_deep (Use the model_type keyword)

In [11]:
class Test_Wide_Deep_Bandits(BanditAlgorithm):
    def __init__(
        self,
        num_actions,
        num_features,
        wide_embed_size=100,
        wide_embed_dim=64,
        model_type = 'wide_deep', ## model_type = 'wide', 'deep', or 'wide_deep'
        name='test_deep_bandits'):
      
        hparams = {
                    'num_actions':num_actions,
                    'context_dim':num_features,
                    'max_grad_norm':5.0,
        }

        ## Raise error if model_type is not one of the available models
        possible_models = ['deep','wide','wide_deep']
        if model_type not in possible_models:
          raise NameError('model_type must be "deep", "wide", or "wide_deep"')

        self.name = name
        self.model_type = model_type
        self.wide_embed_dim = wide_embed_dim
        self.wide_embed_size = wide_embed_size
        self.hparams = hparams

        ## Initialize model and optimizer depeding on model_type
        if self.model_type == 'deep':
          self.deep_model = Deep_Model(context_size=self.hparams['context_dim'],
                                       n_action=self.hparams['num_actions'])
          self.optim = torch.optim.RMSprop(self.deep_model.parameters())

        if self.model_type == 'wide':
          self.wide_model = Wide_Model(embed_size=self.wide_embed_size, 
                                      n_action=self.hparams['num_actions'], 
                                      embed_dim=self.wide_embed_dim)
          self.optim = torch.optim.RMSprop(self.wide_model.parameters())
        

        if self.model_type == 'wide_deep':
          self.wide_deep_model = Wide_and_Deep_Model(context_size=self.hparams['context_dim'],
                                                    embed_size=self.wide_embed_size, 
                                                    n_action=self.hparams['num_actions'], 
                                                    wide_embed_dim=self.wide_embed_dim) 
          self.optim = torch.optim.RMSprop(self.wide_deep_model.parameters())
        
        self.loss = nn.modules.loss.MSELoss()

        self.t = 0
        self.update_freq_nn = 1
        self.num_epochs = 100
        self.data_h = ContextualDataset(self.hparams['context_dim'],
                                        self.hparams['num_actions'],
                                        intercept=False)
        self.user_dict = []

    def expected_values(self, user_id, context):
        ## Return expected reward for each possible action
        context = torch.tensor(context).float()

        if self.model_type == 'deep':
          x = self.deep_model.forward(context)
        if self.model_type == 'wide':
          x = self.wide_model.forward(user_id)
        if self.model_type == 'wide_deep':
          x, x_wide, x_deep = self.wide_deep_model.forward(user_id, context)
        #return x, x_wide, x_deep
        return x

    def action(self, user_id, context):
        ## Select and action based on expected values of reward
        
        if self.model_type == 'deep':
          vals = self.expected_values(user_id, context)  
        if self.model_type == 'wide':
          vals = self.expected_values(user_id, context)   
        if self.model_type == 'wide_deep':
          vals = self.expected_values(user_id, context)  
        return np.argmax(vals.detach().numpy())
        
    def update(self, user_id, context, action, reward):
        """
        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(user_id, context, action, reward)

        if self.t % self.update_freq_nn == 0:
          self.train(self.data_h, self.num_epochs)
                
    def do_step(self, u, x, y, w, step):

        if self.model_type == 'deep':
          y_hat = self.deep_model.forward(x.float())
        if self.model_type == 'wide':
          y_hat = self.wide_model.forward(u)
        if self.model_type == 'wide_deep':
          y_hat, y_wide, y_deep = self.wide_deep_model(u,x.float())

        y_hat *= w
        ls = self.loss(y_hat, y.float())
        ls.backward()

        clip = self.hparams['max_grad_norm']

        if self.model_type == 'deep':
          torch.nn.utils.clip_grad_norm_(self.deep_model.parameters(), clip)
        if self.model_type == 'wide':
          torch.nn.utils.clip_grad_norm_(self.wide_model.parameters(), clip)
        if self.model_type == 'wide_deep':
          torch.nn.utils.clip_grad_norm_(self.wide_deep_model.parameters(), clip)

        self.optim.step()
        self.optim.zero_grad()

    def train(self, data, num_steps):
        """Trains the network for num_steps, using the provided data.
        Args:
          data: ContextualDataset object that provides the data.
          num_steps: Number of minibatches to train the network for.
        """
        #print("Training {} for {} steps...".format(self.name, num_steps))

        batch_size = 512

        data.scale_contexts()

        for step in range(num_steps):
            #x, y, w = data.get_batch_with_weights(batch_size, scaled=True)
            u, x, y, w = data.get_batch_with_weights(batch_size, scaled=True)

            ## Training at time step 1 will cause problem if scaled=True, 
            ## because standard deviation=0, and scaled_context will equal nan
            if self.t != 1:   
              self.do_step(u, x, y, w, step)
    
    def save(self, path):
        """saves model to path"""
        with open(path, 'wb') as f:
            pickle.dump(self, f)

# Toy problem
The space-bandits toy_problem divides users/customers into two groups

Group 1: age ~25, ARPU ~100, user_id range 0 - 19

Group 2: age ~45, ARPU ~50, user_id range 20 - 39

    There are three action:
    promo 1: low value. 10 dollar if accept
    promo 2: mid value. 25 dollar if accept
    promo 3: high value. 100 dollar if accept

    Both groups are unlikely to accept promo 2.
    Group 1 is more likely to accept promo 1.
    Group 2 is slightly more likely to accept promo 3.

    The optimal choice for group 1 is promo 1; 90% acceptance for
    an expected reward of 9 dollars each.
    Group 2 accepts with 25% rate for expected 2.5 dollar reward

    The optimal choice for group 2 is promo 3; 20% acceptance for an expected
    reward of 20 dollars each.
    Group 1 accepts with 2% for expected reward of 2 dollars.

    The least optimal choice in all cases is promo 2; 10% acceptance rate for both groups
    for an expected reward of 2.5 dollars.

In [6]:
## Get test data from toy_problem
df = generate_dataframe(1000)
context_cols = ['age','ARPU']
action_col = ['action']
reward_col = ['reward']
user_id_col = ['user_id']
df.head()

Unnamed: 0,age,ARPU,action,reward,user_id
0,49.0,81.927246,2,0,33
1,25.0,70.390527,1,0,1
2,56.0,35.528346,2,100,27
3,45.0,74.296467,2,100,32
4,24.0,112.076534,2,0,2


In [7]:
num_actions = df[action_col].nunique()[0]
num_features = len(context_cols)
num_users = df[user_id_col].nunique()[0]
print("Number of actions:", num_actions)
print("Number of features:", num_features)
print("Number of users:", num_users)

Number of actions: 3
Number of features: 2
Number of users: 40


# Train/Save/Load models

In [11]:
#%%time
##CPU times: user 26min 10s, sys: 3.57 s, total: 26min 13s
##Wall time: 26min 16s

### Train models 
#demo_model_wide = Test_Wide_Deep_Bandits(num_actions, num_features, wide_embed_size=num_users, model_type='wide')
#demo_model_deep = Test_Wide_Deep_Bandits(num_actions, num_features, wide_embed_size=num_users, model_type='deep')
#demo_model_wide_deep = Test_Wide_Deep_Bandits(num_actions, num_features, wide_embed_size=num_users, model_type='wide_deep')

#for i in range(1000):
#  test_context = df[context_cols].iloc[i].values
#  test_action = df[action_col].iloc[i].values
#  test_reward = df[reward_col].iloc[i].values
#  test_user_id = df[user_id_col].iloc[i].values

#  demo_model_wide.update(torch.tensor(df[user_id_col].iloc[i].values), test_context,test_action,test_reward)
#  demo_model_deep.update(torch.tensor(df[user_id_col].iloc[i].values), test_context,test_action,test_reward)
#  demo_model_wide_deep.update(torch.tensor(df[user_id_col].iloc[i].values), test_context,test_action,test_reward)


In [12]:
## Save models
#demo_model_wide.save(path+'test_model_wide.pkl')
#demo_model_deep.save(path+'test_model_deep.pkl')
#demo_model_wide_deep.save(path+'test_model_wide_deep.pkl')

In [12]:
## Load existing models
demo_model_wide = load_model(path+'test_model_wide.pkl')
demo_model_deep = load_model(path+'test_model_deep.pkl')
demo_model_wide_deep = load_model(path+'test_model_wide_deep.pkl')

In [9]:
## Context columns means and standard deviations for scaling the input
context_means = df[context_cols].mean().values
context_std = df[context_cols].std().values

In [13]:
models = ['wide', 'deep', 'wide_deep']

for model in models:

  df["exp_reward_action_0"] = 0.0
  df["exp_reward_action_1"] = 0.0
  df["exp_reward_action_2"] = 0.0
  df["best_action"] = 0

  if model =='wide':
    demo_model = demo_model_wide
  elif model == 'deep':
    demo_model = demo_model_deep
  elif model == 'wide_deep':
    demo_model = demo_model_wide_deep


  for i in range(1000):
    test_context = (df[context_cols].iloc[i].values - context_means) / context_std
    test_action = df[action_col].iloc[i].values
    test_reward = df[reward_col].iloc[i].values
    test_user_id = torch.tensor(df[user_id_col].iloc[i].values)

    exp = demo_model.expected_values(test_user_id, test_context)
    action = demo_model.action(test_user_id, test_context)

    df['exp_reward_action_0'].iloc[i] = exp[0].tolist()
    df['exp_reward_action_1'].iloc[i] = exp[1].tolist()
    df['exp_reward_action_2'].iloc[i] = exp[2].tolist()
    df['best_action'].iloc[i] = action.tolist()
  
  if model == 'wide':
    df_wide = df.copy()
  elif model == 'deep':
    df_deep = df.copy()
  elif model == 'wide_deep':
    df_wide_deep = df.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# Model outputs

The wide only network will always predict the same reward for a user regardless of context

In [14]:
df_wide[df_wide['user_id']==14].head()

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
89,26.0,119.05847,1,0,14,8.130596,1.867701,-0.136166,0
98,24.0,98.708003,2,100,14,8.130596,1.867701,-0.136166,0
124,28.0,93.231954,0,10,14,8.130596,1.867701,-0.136166,0
146,21.0,103.779834,0,10,14,8.130596,1.867701,-0.136166,0
342,31.0,111.697954,0,10,14,8.130596,1.867701,-0.136166,0


In [15]:
df_wide[df_wide['user_id']==28].head()

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
21,41.0,49.553875,2,0,28,3.721442,1.723631,20.617413,2
122,43.0,66.869186,1,0,28,3.721442,1.723631,20.617413,2
126,47.0,79.193301,2,0,28,3.721442,1.723631,20.617413,2
178,39.0,33.757376,1,25,28,3.721442,1.723631,20.617413,2
179,43.0,55.434528,2,0,28,3.721442,1.723631,20.617413,2


The deep only network takes the context into account, but does not learn  about individual users

In [16]:
df_deep[df_deep['age']<35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
1,25.0,70.390527,1,0,1,10.857512,7.098053,3.855397,0
4,24.0,112.076534,2,0,2,8.41955,2.633406,0.179958,0
6,20.0,74.115168,1,0,0,8.197994,-0.358547,-1.267184,0
8,22.0,94.259298,0,10,5,9.728422,13.814867,18.065399,2
9,22.0,92.116664,2,0,12,9.602438,-3.929533,-6.655853,0
10,24.0,98.229477,1,0,19,10.135986,18.64201,12.252966,1
11,18.0,96.045678,2,0,15,-5.955736,0.406215,-2.329772,1
12,22.0,105.154348,0,10,7,10.367142,-0.706902,0.567716,0
16,32.0,85.984811,2,0,18,8.404955,-6.638456,-35.877537,0
17,24.0,115.394914,1,0,16,8.956281,2.723162,1.236022,0


In [17]:
df_deep[df_deep['age']>35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
0,49.0,81.927246,2,0,33,-0.486328,3.26346,15.680501,2
2,56.0,35.528346,2,100,27,2.749516,1.455314,3.076756,2
3,45.0,74.296467,2,100,32,-1.83071,18.599018,31.808071,2
5,43.0,12.369001,1,0,22,1.921343,3.302185,-10.224971,1
7,43.0,86.909563,1,0,27,0.314395,14.205988,-7.478526,1
13,48.0,49.963766,1,0,30,5.818552,11.239658,-20.19294,1
14,55.0,74.605336,1,0,35,2.749516,1.455314,3.076756,2
15,46.0,47.395378,2,100,31,1.863938,0.973558,27.716747,2
18,49.0,61.12911,0,0,30,9.827328,9.848066,46.914532,2
19,42.0,64.304893,1,0,29,-3.458362,-13.739792,50.240574,2


The wide and deep network should learn something about both the individual user behavior and the context

In [23]:
df_wide_deep[df_wide_deep['user_id']==11].head()

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
92,28.0,107.778958,1,0,11,10.258002,1.295838,20.597254,2
182,28.0,79.870485,1,25,11,10.294828,5.855971,-0.960586,0
267,29.0,94.653919,2,0,11,2.016341,7.576851,159.078659,2
407,33.0,88.86801,2,0,11,11.952343,-1.420714,-62.12001,0
431,28.0,86.892412,1,0,11,9.859776,6.14391,-16.293882,0


In [20]:
df_wide_deep[df_wide_deep['age']<35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
1,25.0,70.390527,1,0,1,9.62735,-2.58086,6.293558,0
4,24.0,112.076534,2,0,2,9.138901,6.736061,1.990497,0
6,20.0,74.115168,1,0,0,9.373483,6.949791,8.104754,0
8,22.0,94.259298,0,10,5,8.905104,6.601733,11.590864,2
9,22.0,92.116664,2,0,12,11.845016,-1.910176,-2.437893,0
10,24.0,98.229477,1,0,19,9.939683,5.266436,-0.552806,0
11,18.0,96.045678,2,0,15,13.571187,-0.294013,25.748976,2
12,22.0,105.154348,0,10,7,9.043765,-6.391127,12.712336,2
16,32.0,85.984811,2,0,18,7.688446,-2.078303,-0.609743,0
17,24.0,115.394914,1,0,16,9.844912,3.077079,0.917151,0


In [19]:
df_wide_deep[df_wide_deep['age']>35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2,best_action
0,49.0,81.927246,2,0,33,1.366076,10.175884,-20.491436,1
2,56.0,35.528346,2,100,27,1.798753,10.01261,-34.14608,1
3,45.0,74.296467,2,100,32,2.410696,-12.050975,-4.121323,0
5,43.0,12.369001,1,0,22,7.51567,1.919976,125.393532,2
7,43.0,86.909563,1,0,27,1.200537,29.767799,14.386299,1
13,48.0,49.963766,1,0,30,-1.143704,-1.202628,-26.330706,0
14,55.0,74.605336,1,0,35,3.475697,23.755005,17.310415,1
15,46.0,47.395378,2,100,31,4.566599,2.349478,106.977188,2
18,49.0,61.12911,0,0,30,-1.530456,-1.007847,40.307869,2
19,42.0,64.304893,1,0,29,1.950434,7.142441,-15.844952,1
