# Jan 23  IA316  TP  

User : $u\in \mathbb{R}^{k\cdot \#user}$

Item : $i\in \mathbb{R}^{k\cdot \#item}$ 

Reward : $r \in \{1,2,3,4,5\}, r = \frac{<u,i>}{||u||\times||i||} = f(u,i)$

$k$ embedding space dimension

## Rating Environment

In [1]:
import numpy as np
import matplotlib.pyplot as plt


In [15]:
# rating environment, items, rewards
# A client comes. The environment will return a list of items
# return [user_id, [list_item_id]]
# then an agent receives the item_id list and it does an act. 

# 记录推荐历史，推荐过的物品就不再推荐

class Environment:
    """
    rating environment, users, items, rewards
    
    A client comes. The environment will return a list of items
    return [user_id, [list_item_id]]
    rewards = [1,2,3,4,5]
    """
    def __init__(self, k, nb_item, nb_user, seed=None):
        # k embedding dimension
        self._nb_arms = nb_item
        self._rng = np.random.RandomState(seed)
        self._user_feature = self._rng.uniform(0,1, nb_user*k).reshape((nb_user, k))
        self._item_feature = self._rng.uniform(0,1, nb_item*k).reshape((nb_item, k))
        # recommend history != state, state = user_id & list of items that have not been recommended 
        # mask matrix 0-1
        self._recommend_mask = np.ones((nb_user, nb_item),dtype = int) # not yet recommended = 1, recommended = 0
    
        # compute the reward matrix, then rescale and round the reward matrix to 1,2,3,4,5
        self._reward_matrix = self._user_feature @ self._item_feature.T 
        scale_coef = 5/np.amax(self._reward_matrix)
        self._reward_matrix = np.around(scale_coef * self._reward_matrix, decimals = 0) 
        self._reward_matrix = self._reward_matrix.astype(int)
      #  self._optimal_reward = np.max(self._reward)
        
    def step(self):
        # return reward, next state = [user_id, [list item_id]]
        user_to_play = self._rng.choice(self._user_feature.shape[0], 1)[0]
        items_available = np.where(self._recommend_mask[user_to_play] == 1)[0]
        return user_to_play, items_available
        
        
    def update(self, user_id, action):
        # update state according to user's action (choice)
        reward = self._reward_matrix[user_id, action]
        self._recommend_mask[user_id][action] = 0
        next_state = [user_to_play, np.array(items_available, dtype = int)]
        return reward, next_state
        
    def reset(self):
        # first_state   
        self._recommend_mask = np.ones((nb_user, nb_item),dtype = int)
        return first_state

In [9]:
class RandomAgent:
    """
    Random agent
    """
    def __init__(self, _id, nb_item, seed=None):
        self._id = _id
        self._nb_arms = nb_item
        self._rng = np.random.RandomState(seed)
        
    def act(self, state):
        # available choices change as time evolves, state contains available choices
        choice = self._rng.randint(len(state))
        return state[choice]
    
    def getID(self):
        return self._id
       
    def update(self, action, reward):
        pass
    

In [16]:
nb_item = 10
nb_user = 3
nb_iter = 20
embedding_dimension = 2

agents = [RandomAgent(i, nb_item, seed = i) for i in range(nb_user)]
env = Environment(embedding_dimension, nb_item, nb_user, seed = 2020)

for i in range(nb_iter):
    user_to_play, items_available = env.step()
    agent = agents[user_to_play]
    item_chosen = agent.act(items_available)
    print("user_to_play: {}, recommend items {}, choice {}".format(user_to_play, items_available, item_chosen))
    reward, _ = env.update(agent.getID(), item_chosen)
    print("reward:", reward)
    agent.update(item_chosen, reward)

user_to_play: 1, recommend items [0 1 2 3 4 5 6 7 8 9], choice 5
reward: 1
user_to_play: 0, recommend items [0 1 2 3 4 5 6 7 8 9], choice 5
reward: 2
user_to_play: 1, recommend items [0 1 2 3 4 6 7 8 9], choice 9
reward: 2
user_to_play: 2, recommend items [0 1 2 3 4 5 6 7 8 9], choice 8
reward: 1
user_to_play: 2, recommend items [0 1 2 3 4 5 6 7 9], choice 9
reward: 1
user_to_play: 0, recommend items [0 1 2 3 4 6 7 8 9], choice 0
reward: 2
user_to_play: 1, recommend items [0 1 2 3 4 6 7 8], choice 8
reward: 2
user_to_play: 1, recommend items [0 1 2 3 4 6 7], choice 1
reward: 2
user_to_play: 2, recommend items [0 1 2 3 4 5 6 7], choice 6
reward: 0
user_to_play: 2, recommend items [0 1 2 3 4 5 7], choice 3
reward: 1
user_to_play: 1, recommend items [0 2 3 4 6 7], choice 4
reward: 1
user_to_play: 0, recommend items [1 2 3 4 6 7 8 9], choice 4
reward: 3
user_to_play: 2, recommend items [0 1 2 4 5 7], choice 2
reward: 1
user_to_play: 0, recommend items [1 2 3 6 7 8 9], choice 6
reward: 1
us