In [1]:
import torch
import torch.nn as nn
import numpy as np
import pickle

In [None]:
## Install and load BanditAlgorithm from space-bandits

#!pip install space-bandits
from space_bandits import BanditAlgorithm

In [3]:
import sys
## Path to the contextual_dataset_wu.py and toy_problem_wu.py files
## These files are copied from space-bandits and modified to handle/generate user IDs
path = '/content/drive/MyDrive/Fellowship_Deep_and_Wide_Bandit/'
sys.path.append(path)

from contextual_dataset_wu import ContextualDataset
from toy_problem_wu import generate_dataframe

In [4]:
## Get test data from toy_problem
## The toy_problem divides users/customers into two groups
## Group 1 age ~25, ARPU ~100, user_id range 0 - 19
## Group 2 age ~45, ARPU ~50, user_id range 20 - 39

df = generate_dataframe(1000)
context_cols = ['age','ARPU']
action_col = ['action']
reward_col = ['reward']
user_id_col = ['user_id']
df.head()

Unnamed: 0,age,ARPU,action,reward,user_id
0,49.0,26.791945,0,0,28
1,52.0,61.973526,1,0,32
2,38.0,83.352944,1,0,35
3,28.0,106.010719,2,0,3
4,27.0,104.157669,0,10,6


In [5]:
num_actions = df[action_col].nunique()[0]
num_features = len(context_cols)
num_users = df[user_id_col].nunique()[0]
print("Number of actions", num_actions)
print("Number of features", num_features)
print("Number of users", num_users)

Number of actions 3
Number of features 2
Number of users 40


In [7]:
class Wide_Model(nn.Module):
    ## The wide component of the model, takes the user IDs as inputs and learn an embedding of the user IDs
    def __init__(self, n_user=100, n_action=2, embed_dim=64):
        super(Wide_Model, self).__init__()
        self.n_user = n_user
        self.n_action = n_action
        self.embed_dim = embed_dim
        
        self.embedding = nn.Embedding(self.n_user, self.embed_dim)
        self.lr = nn.Linear(self.embed_dim, self.n_action)
    
    def forward(self, x):
        ## input user_id
        x = self.embedding(x)
        x = self.lr(x)
        return x


class Deep_Model(nn.Module):
    ## The deep part of the model, takes the context vector as input
    def __init__(self, context_size=5, layer_sizes=[50,100], n_action=2):
        ## layer_sizes (list of integers): defines neural network architecture: n_layers = len(layer_sizes), 
        ## value is per-layer width. (default [50])
        super(Deep_Model, self).__init__()
        self.context_size = context_size
        self.layer_sizes = layer_sizes
        self.n_action = n_action

        self.layers = []
        self.build_model()
        self.activation = nn.ReLU()
    
    def build_layer(self, inp_dim, out_dim):
        """Builds a layer in deep model """

        layer = nn.modules.linear.Linear(inp_dim,out_dim)
        nn.init.uniform_(layer.weight)
        name = f'layer {len(self.layers)}'
        self.add_module(name, layer)
        return layer
    
    def build_model(self):
        """
        Defines the actual NN model with fully connected layers.
        """
        for i, layer in enumerate(self.layer_sizes):
            if i==0:
                inp_dim = self.context_size
            else:
                inp_dim = self.layer_sizes[i-1]
            out_dim = self.layer_sizes[i]
            new_layer = self.build_layer(inp_dim, out_dim)
            self.layers.append(new_layer)
        output_layer = self.build_layer(out_dim, self.n_action)
        self.layers.append(output_layer)

    def forward(self, x):
        """forward pass of the neural network"""
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i != len(self.layers)-1:
                x = self.activation(x)
        return x
  
class Wide_and_Deep_Model(nn.Module):
    ## Combines the wide and deep model
    ## The wide and deep components both tries to predict the reward for each action, 
    ## this model combines adds output from both the wide and deep components
    def __init__(self, context_size=5, deep_layer_sizes=[50,100], n_action=2, n_user=100, wide_embed_dim=64):
        super(Wide_and_Deep_Model, self).__init__()
        self.n_action = n_action
        self.context_size = context_size
        self.deep_layer_sizes = deep_layer_sizes
        self.n_user = n_user
        self.wide_embed_dim = wide_embed_dim

        self.wide_model = Wide_Model(n_user=self.n_user, n_action=self.n_action, embed_dim=self.wide_embed_dim)
        self.deep_model = Deep_Model(context_size=self.context_size, layer_sizes=self.deep_layer_sizes, n_action=self.n_action)
    
    def forward(self, wide_input, deep_input):
        x_wide = self.wide_model(wide_input)
        x_deep = self.deep_model(deep_input)

        x = x_wide + x_deep

        return x.squeeze(-1)

In [29]:
## Based on the BanditAlgorithm from space-bandits, modified to use the deep and wide model
class Test_Deep_Wide_Bandits(BanditAlgorithm):
    def __init__(
        self,
        num_actions,
        num_features,
        num_users,
        wide_embed_dim=64,
        name='test_deep_Wide_bandits'):
      
        hparams = {
                    'num_actions':num_actions,
                    'context_dim':num_features,
                    'num_users':num_users,
                    'max_grad_norm':5.0,
        }

        self.name = name
        self.wide_embed_dim = wide_embed_dim
        self.hparams = hparams

        self.wide_deep_model = Wide_and_Deep_Model(context_size=self.hparams['context_dim'],
                                                   n_user=self.hparams['num_users'], 
                                                   n_action=self.hparams['num_actions'], 
                                                   wide_embed_dim=self.wide_embed_dim) 
        
        self.optim = torch.optim.RMSprop(self.wide_deep_model.parameters())
        
        self.loss = nn.modules.loss.MSELoss()

        self.t = 0
        self.update_freq_nn = 1
        self.num_epochs = 100
        self.data_h = ContextualDataset(self.hparams['context_dim'],
                                        self.hparams['num_actions'],
                                        intercept=False)
        self.user_dict = []

    def expected_values(self, user_id, context):
        ## context - the context for which to get the expected values for

        context = torch.tensor(context).float()
        x = self.wide_deep_model.forward(user_id, context)
        return x

    def action(self, user_id, context):
        ## Select and action based on expected values of reward
        vals = self.expected_values(user_id, context)
        return np.argmax(vals.detach().numpy())
        
    def update(self, user_id, context, action, reward):
        """
        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(user_id, context, action, reward)

        if self.t % self.update_freq_nn == 0:
          self.train(self.data_h, self.num_epochs)
                
    def do_step(self, u, x, y, w, step):

        y_hat = self.wide_deep_model(u,x.float())
        y_hat *= w
        ls = self.loss(y_hat, y.float())
        ls.backward()

        clip = self.hparams['max_grad_norm']
        torch.nn.utils.clip_grad_norm_(self.wide_deep_model.parameters(), clip)

        self.optim.step()
        self.optim.zero_grad()


    def train(self, data, num_steps):
        """Trains the network for num_steps, using the provided data.
        Args:
          data: ContextualDataset object that provides the data.
          num_steps: Number of minibatches to train the network for.
        """
        #print("Training {} for {} steps...".format(self.name, num_steps))

        batch_size = 512

        data.scale_contexts()

        for step in range(num_steps):
            u, x, y, w = data.get_batch_with_weights(batch_size, scaled=True)
            if self.t != 1:   
              self.do_step(u, x, y, w, step)
    
    def save(self, path):
        """saves model to path"""
        with open(path, 'wb') as f:
            pickle.dump(self, f)

In [30]:
## Make a test model

test_model = Test_Deep_Wide_Bandits(num_actions, num_features, num_users)

In [31]:
%%time
## Iterate through the simulated data from toy_model and update the test model
## Takes about 12 minutes for 100 data points
## Perhaps try to update the model less often to speed things up when implementing on real data

for i in range(999):
  test_context = df[context_cols].iloc[i].values
  test_action = df[action_col].iloc[i].values
  test_reward = df[reward_col].iloc[i].values
  test_user_id = df[user_id_col].iloc[i].values
  test_model.update(torch.tensor(df[user_id_col].iloc[i].values), test_context,test_action,test_reward)

CPU times: user 12min 22s, sys: 2.25 s, total: 12min 24s
Wall time: 12min 27s


In [15]:
## Take a look at the expected reward for each action

df["exp_reward_action_0"] = 0.0
df["exp_reward_action_1"] = 0.0
df["exp_reward_action_2"] = 0.0

## Bandit Algorithm scales the contect vectors before feeding them to the model, 
## so we need to use the mean and std to scale our input
context_means = df[context_cols].mean().values
context_std = df[context_cols].std().values

for i in range(999):
  test_context = (df[context_cols].iloc[i].values - context_means) / context_std # scaled input context vector
  test_action = df[action_col].iloc[i].values
  test_reward = df[reward_col].iloc[i].values
  test_user_id = torch.tensor(df[user_id_col].iloc[i].values) # have to convert user_id to torch.tensor, should modify the model to handle more input format
  exp = test_model.expected_values(test_user_id, test_context)

  df['exp_reward0'].iloc[i] = exp[0][0].tolist()
  df['exp_reward1'].iloc[i] = exp[0][1].tolist()
  df['exp_reward2'].iloc[i] = exp[0][2].tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [16]:
## Take a look at updated data frame
df.head()

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward0,exp_reward1,exp_reward2
0,49.0,26.791945,0,0,28,-0.267429,-13.222281,79.720673
1,52.0,61.973526,1,0,32,0.054349,-7.554504,76.584816
2,38.0,83.352944,1,0,35,1.105944,-0.258326,11.20478
3,28.0,106.010719,2,0,3,0.389969,3.679923,-0.385262
4,27.0,104.157669,0,10,6,9.448327,2.127077,2.116056


In [32]:
## Look at one user
df[df['user_id']==9].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward0,exp_reward1,exp_reward2
15,24.0,72.24772,1,0,9,1.150257,1.910009,4.672597
55,20.0,106.602436,0,10,9,1.975041,-1.092273,1.950521
71,29.0,113.006893,0,10,9,1.958928,-1.076452,1.859861
87,22.0,74.939447,1,0,9,1.203977,1.456719,4.187758
91,29.0,92.204726,1,0,9,1.387149,-1.214442,1.861811
96,22.0,110.042198,1,0,9,2.028337,-1.081751,1.972392
226,18.0,125.491731,0,10,9,2.466997,-0.927744,2.527674
245,30.0,90.488324,2,0,9,1.338208,-1.059742,1.99756
296,28.0,71.849013,0,10,9,1.124645,1.973257,4.764408
348,27.0,101.13193,2,0,9,1.646422,-1.143018,1.843474


In [28]:
## Take a look at different age groups
df[df['age'] > 35].head(10)

Unnamed: 0,age,ARPU,action,reward,user_id,exp_reward0,exp_reward1,exp_reward2
0,49.0,26.791945,0,0,28,-0.267429,-13.222281,79.720673
1,52.0,61.973526,1,0,32,0.054349,-7.554504,76.584816
2,38.0,83.352944,1,0,35,1.105944,-0.258326,11.20478
5,53.0,69.406357,1,0,21,1.25318,-9.006681,80.989052
7,57.0,61.802828,1,0,27,6.276136,-10.717211,111.951996
8,51.0,74.160332,0,0,36,0.175296,-8.062878,62.953884
9,44.0,23.357967,1,0,24,0.755086,-6.771363,60.386101
11,44.0,58.471596,1,0,24,0.870913,-7.047223,41.870426
12,54.0,35.010748,2,100,34,-0.675035,-19.812857,127.895554
13,48.0,60.604183,1,0,26,-0.043861,-7.524991,58.324112


In [None]:
## Save model
test_model.save(path+'test_model')