#@authors:
Fritz poka Toukam 


ndjekoua sandjo jean thibaut

In [0]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from copy import copy, deepcopy
import random
from scipy.stats import norm

In [0]:
def random_argmax(rng, list_):
    """ similar to np.argmax but return a random element among max
        when multiple max exists."""
    return rng.choice(np.argwhere(list_ == list_.max()).flatten())

In [0]:
np.set_printoptions(precision=2)

#Environment

In [0]:
class Amazon_environment:
    """ A rating environment with explicit feedback.
        User and items are represented by points in R^k
        items_features  is the number of features that carachterizes each item
        User interest for a given item is modeled by a parametric function
        R_{u,i} = f(u,i) = f(W_u, W_i)
        Example of function include dot product (cosine similarity)
        R_{u,i} = \sum_k w_{u,k} . w_{i,k}
        action: Recommend one item for a given user among those he has never bought before
    """

    def __init__(self, nb_users=30, nb_items=10,items_featues=2, 
                 internal_embedding_size=3,nb_actions = 2,
                 displayed_users_embedding_size=2,
                 displayed_items_embedding_size=2,
                 noise_size=3,
                 seed=None):
        self.nb_users = nb_users
        self.nb_actions = nb_actions
        self.nb_items = nb_items
        self.items_featues = items_featues
        self.internal_embedding_size = internal_embedding_size
        self.displayed_users_embedding_size = displayed_users_embedding_size
        self.displayed_items_embedding_size = displayed_items_embedding_size
        self.noise_size = noise_size
        self._rng = np.random.RandomState(seed)
        
        self.action_size = self.nb_items
        self.sampling_limit = nb_users * nb_items
        self.user_mean = np.ones(self.internal_embedding_size)
        self.user_var = np.ones(self.internal_embedding_size)
        self.item_mean = np.ones(self.internal_embedding_size)
        self.item_var = np.ones(self.internal_embedding_size)
        self.users_embedding = None
        self.items_embedding = None
        self.user_item_history = None
        self.z_cut_points = None
        self.done = False

    def step(self, actions):
        # check if behind done
        if self.done: #self.user_item_history.sum() >= self.sampling_limit:
            print("You are calling step after it return done=True.\n"
                  "You should reset the environment.")

        assert len(actions) == self.nb_actions
        self.actions = actions
        
        # compute potential rewards
        potential_rewards =np.array([self.users_embedding[self.current_user].dot(self.items_embedding[i])*self.items_features_matrix[i][0]
                             for i in np.argwhere(self.user_item_history[self.current_user, :] == 0).flatten()])
        
        optimal_indexes = np.argsort(potential_rewards)[-1:0:-1][:self.nb_actions]
        optimal_return = np.sum(potential_rewards[optimal_indexes]) 

        # map action to item
        #this step is crucial because, since the user only sees the available items, it's action index is  mapped to that list, so in order to get the
        #corresponding index in the history matrix, we need to use the trick below.
        bought_items =[a for a in actions if a in optimal_indexes]
        if bought_items != []:#do this only if there is one item which have been bougth by the user.otherwise, do nothing
          bought_items = np.argwhere(self.user_item_history[self.current_user, :] == 0)[bought_items][:,0]
          # mark item as rated
          self.user_item_history[self.current_user, bought_items] = 1
          self.items_features_matrix[bought_items,1]-=1

        # compute reward R_t
        recommended_items = np.argwhere(self.user_item_history[self.current_user, :] == 0)[actions][:,0]
        self.current_rating = [self._get_user_item_reward(self.current_user, a) for a in recommended_items]
        self.reward = np.sum(self.current_rating)
        
        # check if done
        if self.user_item_history.sum() == self.sampling_limit:
            self.done = True

        # compute next state S_{t+1}
        self._next_state()

        # update action space t+1
        self.action_size = len(self.available_items)

        return self.reward, self.state, self.done, optimal_return

    def init_items_features_matrix(self,nb_items,nb_features):
        price = np.random.randint(low=100,high=1000, size=nb_items)
        disponibility = np.array([10]*nb_items)
        matrix = np.array([price,disponibility]).T
        return matrix

        items_features_matrix = np
    def reset(self, seed=None):
        self._rng = np.random.RandomState(seed)
        self.action_size = self.nb_items
        
        # create users and items embedding matrix
        self.users_embedding = self._rng.normal(loc=self.user_mean,
                                                scale=self.user_var,
                                                size=(self.nb_users, self.internal_embedding_size))
        self.items_embedding = self._rng.normal(loc=self.item_mean,
                                                scale=self.item_var,
                                                size=(self.nb_items, self.internal_embedding_size))

        #create the matrix of items_features
        self.items_features_matrix = self.init_items_features_matrix(self.nb_items,self.items_featues)

        self.user_item_history = np.zeros((self.nb_users, self.nb_items))
        self.done = False

        self._next_state()
        return self.state

    def _get_user_item_reward(self, user, item):
        mean = self.users_embedding[user].dot(self.items_embedding[item])*self.items_features_matrix[item][0] #array of expected reward where each value correspond to one action
        var = 1
        reward = self._rng.normal(loc=mean,scale=var,size= 1)
        return reward

    

    def _get_new_user(self):
        for i in range(10):
            user = self._rng.randint(0, self.nb_users)
            # check it remain at least one item
            if np.sum(self.user_item_history[user, :]) < self.nb_items:
                return user
        return self._rng.choice(np.argwhere(self.user_item_history == 0))[0]

    def _next_state(self):
        # Pick a user
        if self.user_item_history.sum() < self.sampling_limit:
            self.current_user = self._get_new_user()
        else:
            self.current_user = None

        # List available items
        #this method will return an array, where each eleemnt is also an array
        self.available_items = np.argwhere(self.user_item_history[self.current_user, :] == 0)

        self.state = list()
        for i in self.available_items:
            item = i[0]
            # Compute variables
            if self.items_features_matrix[item,1] > 0: #return only items tht are sill available.
             self.state.append([self.current_user, item])


In [0]:
a = [122,1,2,13,234,55,3]
a=np.argsort(a)[-1:0:-1][0:4]

In [108]:
env = Amazon_environment()
env.reset(seed=2020)
reward, next_state, done, optimal_return = env.step(np.array([0,1]))
print('reward: ', reward,optimal_return)

reward:  2136.0745470801758 5203.690554546546


In [0]:
class Random:
    """ Random agent. """
    def __init__(self, nb_arms, seed=None):
        self._nb_arms = nb_arms
        self._rng = np.random.RandomState(seed)
        
    def act(self, context):
        action = self._rng.randint(len(context)) # note that action size is changing
        return action
        
    def update(self, context, action, reward):
        pass
