In [1]:
import torch
import torch.nn as nn
import numpy as np
import pickle

In [None]:
## Install and load BanditAlgorithm from space-bandits

#!pip install space-bandits
from space_bandits import BanditAlgorithm

In [3]:
import sys
## Path to the contextual_dataset_wu.py and toy_problem_wu.py files
## These files are copied from space-bandits and modified to handle/generate user IDs
path = '/content/drive/MyDrive/Fellowship_Deep_and_Wide_Bandit/'
sys.path.append(path)

from contextual_dataset_wu import ContextualDataset
from toy_problem_wu import generate_dataframe

In [38]:
## Get test data from toy_problem
## The toy_problem divides users/customers into two groups
## Group 1 age ~25, ARPU ~100, user_id range 0 - 19
## Group 2 age ~45, ARPU ~50, user_id range 20 - 39

df = generate_dataframe(1000)
context_cols = ['age','ARPU']
action_col = ['action']
reward_col = ['reward']
user_id_col = ['user_id']
df.head()

Unnamed: 0,age,ARPU,action,reward,user_id
0,34.0,75.911225,2,0,9
1,26.0,98.346251,2,0,12
2,43.0,34.373265,0,0,38
3,30.0,113.324396,0,10,19
4,22.0,95.14217,1,0,14


In [5]:
num_actions = df[action_col].nunique()[0]
num_features = len(context_cols)
num_users = df[user_id_col].nunique()[0]
print("Number of actions", num_actions)
print("Number of features", num_features)
print("Number of users", num_users)

Number of actions 3
Number of features 2
Number of users 40


In [7]:
class Wide_Model(nn.Module):
    ## The wide component of the model, takes the user IDs as inputs and learn an embedding of the user IDs
    def __init__(self, n_user=100, n_action=2, embed_dim=64):
        super(Wide_Model, self).__init__()
        self.n_user = n_user
        self.n_action = n_action
        self.embed_dim = embed_dim
        
        self.embedding = nn.Embedding(self.n_user, self.embed_dim)
        self.lr = nn.Linear(self.embed_dim, self.n_action)
    
    def forward(self, x):
        ## input user_id
        x = self.embedding(x)
        x = self.lr(x)
        return x


class Deep_Model(nn.Module):
    ## The deep part of the model, takes the context vector as input
    def __init__(self, context_size=5, layer_sizes=[50,100], n_action=2):
        ## layer_sizes (list of integers): defines neural network architecture: n_layers = len(layer_sizes), 
        ## value is per-layer width. (default [50])
        super(Deep_Model, self).__init__()
        self.context_size = context_size
        self.layer_sizes = layer_sizes
        self.n_action = n_action

        self.layers = []
        self.build_model()
        self.activation = nn.ReLU()
    
    def build_layer(self, inp_dim, out_dim):
        """Builds a layer in deep model """

        layer = nn.modules.linear.Linear(inp_dim,out_dim)
        nn.init.uniform_(layer.weight)
        name = f'layer {len(self.layers)}'
        self.add_module(name, layer)
        return layer
    
    def build_model(self):
        """
        Defines the actual NN model with fully connected layers.
        """
        for i, layer in enumerate(self.layer_sizes):
            if i==0:
                inp_dim = self.context_size
            else:
                inp_dim = self.layer_sizes[i-1]
            out_dim = self.layer_sizes[i]
            new_layer = self.build_layer(inp_dim, out_dim)
            self.layers.append(new_layer)
        output_layer = self.build_layer(out_dim, self.n_action)
        self.layers.append(output_layer)

    def forward(self, x):
        """forward pass of the neural network"""
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i != len(self.layers)-1:
                x = self.activation(x)
        return x
  
class Wide_and_Deep_Model(nn.Module):
    ## Combines the wide and deep model
    ## The wide and deep components both tries to predict the reward for each action, 
    ## this model combines adds output from both the wide and deep components
    def __init__(self, context_size=5, deep_layer_sizes=[50,100], n_action=2, n_user=100, wide_embed_dim=64):
        super(Wide_and_Deep_Model, self).__init__()
        self.n_action = n_action
        self.context_size = context_size
        self.deep_layer_sizes = deep_layer_sizes
        self.n_user = n_user
        self.wide_embed_dim = wide_embed_dim

        self.wide_model = Wide_Model(n_user=self.n_user, n_action=self.n_action, embed_dim=self.wide_embed_dim)
        self.deep_model = Deep_Model(context_size=self.context_size, layer_sizes=self.deep_layer_sizes, n_action=self.n_action)
    
    def forward(self, wide_input, deep_input):
        x_wide = self.wide_model(wide_input)
        x_deep = self.deep_model(deep_input)

        x = x_wide + x_deep

        return x.squeeze(-1)

In [29]:
## Based on the BanditAlgorithm from space-bandits, modified to use the deep and wide model
class Test_Deep_Wide_Bandits(BanditAlgorithm):
    def __init__(
        self,
        num_actions,
        num_features,
        num_users,
        wide_embed_dim=64,
        name='test_deep_Wide_bandits'):
      
        hparams = {
                    'num_actions':num_actions,
                    'context_dim':num_features,
                    'num_users':num_users,
                    'max_grad_norm':5.0,
        }

        self.name = name
        self.wide_embed_dim = wide_embed_dim
        self.hparams = hparams

        self.wide_deep_model = Wide_and_Deep_Model(context_size=self.hparams['context_dim'],
                                                   n_user=self.hparams['num_users'], 
                                                   n_action=self.hparams['num_actions'], 
                                                   wide_embed_dim=self.wide_embed_dim) 
        
        self.optim = torch.optim.RMSprop(self.wide_deep_model.parameters())
        
        self.loss = nn.modules.loss.MSELoss()

        self.t = 0
        self.update_freq_nn = 1
        self.num_epochs = 100
        self.data_h = ContextualDataset(self.hparams['context_dim'],
                                        self.hparams['num_actions'],
                                        intercept=False)
        self.user_dict = []

    def expected_values(self, user_id, context):
        ## context - the context for which to get the expected values for

        context = torch.tensor(context).float()
        x = self.wide_deep_model.forward(user_id, context)
        return x

    def action(self, user_id, context):
        ## Select and action based on expected values of reward
        vals = self.expected_values(user_id, context)
        return np.argmax(vals.detach().numpy())
        
    def update(self, user_id, context, action, reward):
        """
        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(user_id, context, action, reward)

        if self.t % self.update_freq_nn == 0:
          self.train(self.data_h, self.num_epochs)
                
    def do_step(self, u, x, y, w, step):

        y_hat = self.wide_deep_model(u,x.float())
        y_hat *= w
        ls = self.loss(y_hat, y.float())
        ls.backward()

        clip = self.hparams['max_grad_norm']
        torch.nn.utils.clip_grad_norm_(self.wide_deep_model.parameters(), clip)

        self.optim.step()
        self.optim.zero_grad()


    def train(self, data, num_steps):
        """Trains the network for num_steps, using the provided data.
        Args:
          data: ContextualDataset object that provides the data.
          num_steps: Number of minibatches to train the network for.
        """
        #print("Training {} for {} steps...".format(self.name, num_steps))

        batch_size = 512

        data.scale_contexts()

        for step in range(num_steps):
            u, x, y, w = data.get_batch_with_weights(batch_size, scaled=True)
            if self.t != 1:   
              self.do_step(u, x, y, w, step)
    
    def save(self, path):
        """saves model to path"""
        with open(path, 'wb') as f:
            pickle.dump(self, f)

In [30]:
## Make a test model

test_model = Test_Deep_Wide_Bandits(num_actions, num_features, num_users)

In [31]:
%%time
## Iterate through the simulated data from toy_model and update the test model
## Takes about 12 minutes for 100 data points
## Perhaps try to update the model less often to speed things up when implementing on real data

for i in range(999):
  test_context = df[context_cols].iloc[i].values
  test_action = df[action_col].iloc[i].values
  test_reward = df[reward_col].iloc[i].values
  test_user_id = df[user_id_col].iloc[i].values
  test_model.update(torch.tensor(df[user_id_col].iloc[i].values), test_context,test_action,test_reward)

CPU times: user 12min 22s, sys: 2.25 s, total: 12min 24s
Wall time: 12min 27s


In [39]:
## Take a look at the expected reward for each action

df["exp_reward_action_0"] = 0.0
df["exp_reward_action_1"] = 0.0
df["exp_reward_action_2"] = 0.0

## Bandit Algorithm scales the contect vectors before feeding them to the model, 
## so we need to use the mean and std to scale our input
context_means = df[context_cols].mean().values
context_std = df[context_cols].std().values

for i in range(999):
  test_context = (df[context_cols].iloc[i].values - context_means) / context_std # scaled input context vector
  test_action = df[action_col].iloc[i].values
  test_reward = df[reward_col].iloc[i].values
  test_user_id = torch.tensor(df[user_id_col].iloc[i].values) # have to convert user_id to torch.tensor, should modify the model to handle more input format
  exp = test_model.expected_values(test_user_id, test_context)

  df['exp_reward_action_0'].iloc[i] = exp[0][0].tolist()
  df['exp_reward_action_1'].iloc[i] = exp[0][1].tolist()
  df['exp_reward_action_2'].iloc[i] = exp[0][2].tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [40]:
## Take a look at updated data frame
df.head()

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2
0,34.0,75.911225,2,0,9,9.646473,7.264092,-57.292065
1,26.0,98.346251,2,0,12,6.509017,3.498647,-8.759447
2,43.0,34.373265,0,0,38,-0.336221,-4.249358,-73.253906
3,30.0,113.324396,0,10,19,8.796981,1.122566,-19.728703
4,22.0,95.14217,1,0,14,10.446238,-0.084635,4.44153


In [41]:
## Look at one user
df[df['user_id']==9].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2
0,34.0,75.911225,2,0,9,9.646473,7.264092,-57.292065
15,28.0,92.376872,2,0,9,9.086729,-2.186729,-7.674484
18,33.0,113.353909,2,0,9,8.179634,-4.031878,-0.343261
49,33.0,97.449121,2,0,9,9.010342,-4.611232,-2.273803
96,19.0,96.757537,1,0,9,10.117834,1.814423,-3.791076
128,28.0,121.265462,1,0,9,7.813842,7.19015,-37.604416
167,29.0,110.134965,1,0,9,8.975857,-4.216433,-6.94358
182,29.0,84.666049,2,0,9,7.698443,3.743316,-52.273098
251,31.0,100.607396,0,0,9,9.023376,-3.634111,-3.079711
286,25.0,89.651999,2,0,9,9.547598,1.115703,24.297226


In [42]:
## Take a look at different age groups
df[df['age'] > 35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward_action_0,exp_reward_action_1,exp_reward_action_2
2,43.0,34.373265,0,0,38,-0.336221,-4.249358,-73.253906
5,50.0,32.156297,0,0,23,11.632194,2.604059,-78.980095
6,54.0,49.596057,1,0,23,9.793988,1.694585,-40.642929
7,46.0,30.448668,0,0,25,10.398321,11.729246,121.048225
10,48.0,66.26466,1,0,29,3.394415,-2.853061,41.312874
14,47.0,59.046671,1,0,33,3.835258,2.557797,35.571533
24,55.0,30.639434,2,0,34,6.424031,4.460411,30.866501
25,39.0,40.382362,0,0,28,-0.490496,9.242406,3.993109
27,44.0,67.63036,1,0,29,3.330539,16.432341,-81.9179
29,41.0,41.380842,2,100,21,6.243629,-1.516735,-64.782745


In [None]:
## Save model
test_model.save(path+'test_model')