In [4]:
import pandas as pd
import numpy as np

import sys
sys.path.append('..')  # Add the parent directory to the Python path

from src.users import (
    CheapSeekerUser,
    BrandLoverUser,
    RandomChooserUser,
    ValueOptimizerUser,
    FamiliaritySeekerUser
)

from src.recommenders import (
    RandomRecommender,
    PopularityRecommender
)

from src.env import ShopEnv
from src.utils import (
    load_catalog,
    action_to_indices,
    snake_case_to_camel_case
)
from src.data.encoders import encode_items_with_embeddings, user_to_one_hot
from src.config import Config

## DL

In [2]:
# Project configuration; "user_params" maps each user name to its constructor kwargs.
config = Config()
user_params = config.get("user_params")

# One simulated user per configured name. The class is looked up by name in the
# notebook namespace: snake_case_to_camel_case(name) + 'User'
# (presumably 'cheap_seeker' -> CheapSeekerUser -- TODO confirm against src.utils).
username_to_user = dict(
    (username, globals()[snake_case_to_camel_case(username) + 'User'](username, **kwargs))
    for username, kwargs in user_params.items()
)

# Baseline recommenders, addressable by a short name.
name_to_recommender = dict(
    random=RandomRecommender(),
    popularity=PopularityRecommender(),
)

In [3]:
catalog = load_catalog('../src/data/catalog.csv')
catalog.head()

Unnamed: 0,product_id,name,category,subcategory,price,quality_score,brand,color,popularity,release_date,description,release_days
0,1,Chair TO##,Home,Chair,17.17,0.834,BrandG,White,76,2025-04-30,Send situation town sea media wonder party fee...,20208
1,2,Perfume MX##,Beauty,Perfume,64.88,0.985,BrandL,White,92,2025-04-21,Radio sense leave real knowledge four institut...,20199
2,3,Tennis Racket MS##,Sports,Tennis Racket,23.49,0.717,BrandL,Green,70,2024-06-14,World article describe answer economy enjoy do...,19888
3,4,Novel PB##,Books,Novel,81.27,0.112,BrandI,Black,27,2023-11-12,Dog reflect explain program check letter possi...,19673
4,5,Lamp BA##,Home,Lamp,28.42,0.029,BrandE,Black,17,2025-05-17,Stand guy argue interesting hot magazine any l...,20225


In [5]:
encoded_items = encode_items_with_embeddings(catalog, config.get('catalog')['cat_features'])
encoded_items

(     product_id  category  subcategory   price  quality_score  brand  color  \
 0             1         4            3   17.17          0.834      6      4   
 1             2         0           14   64.88          0.985     11      4   
 2             3         5           19   23.49          0.717     11      2   
 3             4         1           13   81.27          0.112      8      0   
 4             5         4           10   28.42          0.029      4      0   
 ..          ...       ...          ...     ...            ...    ...    ...   
 245         246         1            1  168.41          0.061      1      2   
 246         247         0           12   36.82          0.537     14      1   
 247         248         5           20    9.82          0.555      9      2   
 248         249         0            5   33.48          0.128     12      4   
 249         250         5           19   44.16          0.262      6      5   
 
      popularity  release_days  
 0   

In [6]:
env = ShopEnv(catalog, username_to_user['cheap_seeker'])
state, info = env.reset()

In [8]:
recommender = name_to_recommender['random']
# recommender = name_to_recommender['popularity']
action = recommender.recommend(state, num_recommendations=10)
action

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0])

In [21]:
%load_ext line_profiler

In [33]:
def call_react_n_times(n):
    """Call the 'cheap_seeker' user's ``react`` ``n`` times.

    Every call receives a fresh ``catalog.sample(10)``. Reads the notebook-level
    ``username_to_user`` and ``catalog``; returns nothing.
    """
    for _call in range(n):
        sampled_items = catalog.sample(10)
        username_to_user['cheap_seeker'].react(sampled_items)

%lprun -f username_to_user['cheap_seeker'].react call_react_n_times(100)

Timer unit: 1e-09 s

Total time: 0.420951 s
File: /home/ernest/Projects/simshop/notebooks/../src/users/base.py
Function: react at line 45

Line #      Hits         Time  Per Hit   % Time  Line Contents
    45                                               @profile
    46                                               def react(self, items: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    47                                                   """
    48                                                   Simulate user reactions to a set of recommended items.
    49                                                   
    50                                                   This method implements the core user decision process:
    51                                                   1. Calculate utility scores for all items
    52                                                   2. Determine clicks based on click_threshold
    53                                                   3. Determin

In [None]:
%lprun -f env.step env.step(action)

Timer unit: 1e-09 s

Total time: 0.0143668 s
File: /home/ernest/Projects/simshop/notebooks/../src/env/interaction_env.py
Function: step at line 105

Line #      Hits         Time  Per Hit   % Time  Line Contents
   105                                               def step(self, action: np.ndarray) -> Tuple[dict, float, bool, bool, dict]:
   106                                                   """Execute action, update state, compute reward, and return (obs, reward, done, truncated, info)."""
   107         1        581.0    581.0      0.0          done = False
   108                                           
   109                                                   # TAKE ACTION
   110         1      15338.0  15338.0      0.1          action_indices = np.where(action)[0]
   111         1     874454.0 874454.0      6.1          items_to_show = self.candidates.loc[action_indices] # encoded items
   112         1    1647008.0    2e+06     11.5          items_to_show = self.items.loc[sel

In [23]:
def profile_loop(n):
    """Play ``n`` complete episodes so a profiler can observe ``env.step``.

    Uses the notebook-level ``env`` and ``recommender``. Each episode starts with
    ``env.reset()`` and requests 10 recommendations per step. Nothing is returned.

    Args:
        n: Number of episodes to run.
    """
    for _ in range(n):
        state, info = env.reset()
        done = False
        while not done:
            action = recommender.recommend(state, num_recommendations=10)
            state, reward, terminated, truncated, info = env.step(action)
            # step() reports two separate end-of-episode flags; looking only at
            # the first one would loop forever on an episode that is truncated.
            done = terminated or truncated

In [26]:
0.48761 / 58

0.008407068965517241

In [27]:
%lprun -f env.step profile_loop(100)

Timer unit: 1e-09 s

Total time: 4.50993 s
File: /home/ernest/Projects/simshop/notebooks/../src/env/interaction_env.py
Function: step at line 105

Line #      Hits         Time  Per Hit   % Time  Line Contents
   105                                               def step(self, action: np.ndarray) -> Tuple[dict, float, bool, bool, dict]:
   106                                                   """Execute action, update state, compute reward, and return (obs, reward, done, truncated, info)."""
   107       595     366731.0    616.4      0.0          done = False
   108                                           
   109                                                   # TAKE ACTION
   110       595    2099154.0   3528.0      0.0          action_indices = np.where(action)[0]
   111       595  235560072.0 395899.3      5.2          items_to_show = self.candidates.loc[action_indices] # encoded items
   112       595  504422940.0 847769.6     11.2          items_to_show = self.items.loc[self.

In [None]:
import time


# Measure the wall-clock cost of a single env.step call, averaged over 10 episodes.
times = []
for _episode in range(10):
    state, info = env.reset()
    done = False
    while not done:
        action = recommender.recommend(state, num_recommendations=10)

        # perf_counter() is monotonic and high-resolution, which is what interval
        # timing needs; time.time() follows the system clock and can jump.
        start_time = time.perf_counter()
        state, reward, done, truncated, info = env.step(action)
        times.append(time.perf_counter() - start_time)

        # A truncated episode is finished as well; without this the loop would
        # keep stepping an environment that has already ended.
        done = done or truncated
print("Average step time:", np.mean(times))

Average step time: 0.00363201896349589


In [14]:
info['recommended_items']

Unnamed: 0,product_id,name,category,subcategory,price,quality_score,brand,color,popularity,release_date,description,release_days
0,3,Tennis Racket MS##,Sports,Tennis Racket,23.49,0.717,BrandL,Green,70,2024-06-14,World article describe answer economy enjoy do...,19888
1,56,Biography MP##,Books,Biography,36.15,0.138,BrandL,Yellow,26,2023-08-01,Wrong bad Democrat idea person current world add.,19570
2,105,Table QJ##,Home,Table,16.03,0.652,BrandD,Black,72,2023-08-10,Author traditional provide notice two perform ...,19579
3,109,Novel HP##,Books,Novel,19.2,0.8,BrandK,Green,82,2024-11-16,Note recognize other answer development term i...,20043
4,132,Biography HG##,Books,Biography,27.39,0.637,BrandN,Black,80,2024-08-25,Bill bring reason what southern police second ...,19960
5,190,Board Game RB##,Toys,Board Game,16.16,0.663,BrandB,Black,73,2025-02-04,Industry class poor near study network thought...,20123
6,214,Puzzle JI##,Toys,Puzzle,26.98,0.748,BrandH,Green,72,2023-08-11,Structure become research soldier again would ...,19580
7,219,Cream VU##,Beauty,Cream,16.09,0.347,BrandJ,Yellow,46,2024-02-16,Eight allow fight everyone during because very...,19769
8,220,Cream BM##,Beauty,Cream,37.73,0.265,BrandH,Black,41,2024-04-07,Thousand serve strong radio through send reduce.,19820
9,226,T-Shirt AB##,Clothing,T-Shirt,19.01,0.426,BrandJ,Red,64,2023-07-31,Room five type new administration reflect reac...,19569


In [13]:
info['click_through_rate'], info['buy_through_rate']

(np.float64(0.1), np.float64(0.0))

## RL Recommender

In [None]:
class RLRecommender:
    """Stand-in for a learned recommender; currently picks candidates uniformly at random.

    ``model`` is an Ellipsis placeholder and no model is loaded. ``model_path`` is
    kept on the instance so the constructor argument is not silently dropped.
    """

    def __init__(self, model_path=None):
        self.model_path = model_path
        self.model = ...

    def recommend(self, state: dict, num_recommendations: int=10) -> np.ndarray:
        """Return a binary selection mask over the candidates in ``state``.

        Args:
            state: Observation dict. Only ``state['candidates_num_features'].shape[0]``
                is read, and it is taken as the number of candidates.
            num_recommendations: How many candidates to select. Requests larger
                than the number of candidates are capped at that number.

        Returns:
            A 1-D int array with one entry per candidate: 1 for selected
            positions, 0 elsewhere.

        Raises:
            ValueError: If ``num_recommendations`` is negative (from NumPy).
        """
        # Mocking random behavior for now
        num_candidates = state['candidates_num_features'].shape[0]
        # Sampling without replacement raises when asked for more items than
        # exist, so cap the request at the pool size.
        num_selected = min(num_recommendations, num_candidates)
        action = np.zeros(num_candidates, dtype=int)
        indices = np.random.choice(num_candidates, size=num_selected, replace=False)
        action[indices] = 1
        return action

## Evaluation

### Imports

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.append('..')  # Add the parent directory to the Python path

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

from src.env import ShopEnv
from src.users import (
    CheapSeekerUser,
    BrandLoverUser,
    RandomChooserUser,
    ValueOptimizerUser,
    FamiliaritySeekerUser
)
from src.config import Config
from src.utils import (
    snake_case_to_camel_case,
    load_catalog,
    action_to_indices
)
from src.policies import TopKMultiInputPolicy
from src.recommenders import (
    RandomRecommender,
    PopularityRecommender,
    RLRecommender
)

### Global Variables

In [15]:
catalog_path = "../src/data/catalog.csv"
catalog = load_catalog(catalog_path)
config = Config()
user_params = config.get("user_params")

# Create user mapping: each configured name is resolved to the class called
# snake_case_to_camel_case(name) + 'User' in the notebook namespace and
# instantiated with its configured parameters.
username_to_user = {
    user: globals()[snake_case_to_camel_case(user) + 'User'](user, **params)
    for user, params in user_params.items()
}

name_to_recommender = {
    "random": RandomRecommender(),
    "popularity": PopularityRecommender(),
    "rl": RLRecommender()
}

# User names handed to training through env_params
# (presumably keys of username_to_user -- TODO confirm against the config).
users_subset = [
    "cheap_seeker",
    "brand_lover",
    "random_chooser",
    "value_optimizer",
    "familiarity_seeker"
]

env_params = {
    "catalog": catalog,
    "username_to_user": username_to_user,
    # Single source of truth: reuse the list above rather than repeating it.
    # Copied so mutating one does not silently change the other.
    "users_subset": list(users_subset)
}


In [16]:
rl_recommender = name_to_recommender['rl']
random_recommender = name_to_recommender['random']
popularity_recommender = name_to_recommender['popularity']
# rl_recommender.load_model('../models/ppo_with_all_users.zip')

In [None]:
# Train the RL recommender on the environments described by env_params.
rl_recommender.train(
    env_params=env_params,
    # Same value as the previous literal 10_0000, regrouped by thousands so it
    # reads unambiguously as one hundred thousand.
    total_timesteps=100_000,
    num_recommendations=10,
)

Output()

In [21]:
rl_recommender.evaluate(100)

AttributeError: 'RLRecommender' object has no attribute 'vec_env'

In [10]:
rl_recommender.save_model('../models/ppo_latest.zip')

Saving model to ../models/ppo_latest.zip


In [17]:
rl_recommender.load_model('../models/ppo_latest.zip', policy=TopKMultiInputPolicy)

Loading model from ../models/ppo_latest.zip


In [27]:
# Play one episode as 'cheap_seeker', showing what each recommender would pick
# at every step.
env = ShopEnv(catalog, username_to_user['cheap_seeker'])
state, info = env.reset()
done = False
while not done:
    # Query all three recommenders on the same state.
    random_action = random_recommender.recommend(state, num_recommendations=10)
    popularity_action = popularity_recommender.recommend(state, num_recommendations=10)
    action = rl_recommender.recommend(state, num_recommendations=10)

    # Turn each action into an index list for printing.
    random_action_indices = action_to_indices(random_action)
    popularity_action_indices = action_to_indices(popularity_action)
    action_indices = action_to_indices(action)

    print("Random Action Indices:", random_action_indices)
    print("Popularity Action Indices:", popularity_action_indices)
    print("RL Action Indices:", action_indices)
    print(len(action_indices))

    # Only the RL action is applied to the environment; the two baselines are
    # shown for comparison.
    state, reward, done, truncated, info = env.step(action)
    print("reward:", reward)
    print("done:", done)
    print()
    

Random Action Indices: [0, 3, 5, 10, 17, 24, 28, 34, 36, 45]
Popularity Action Indices: [1, 5, 7, 9, 10, 27, 29, 33, 34, 35]
RL Action Indices: [7, 9, 15, 17, 21, 30, 39, 41, 47, 49]
10
reward: 0.0
done: False

Random Action Indices: [0, 4, 13, 22, 26, 30, 32, 40, 44, 47]
Popularity Action Indices: [5, 7, 11, 19, 21, 24, 32, 36, 45, 46]
RL Action Indices: [8, 9, 11, 17, 18, 20, 22, 44, 45, 47]
10
reward: 0.0
done: False

Random Action Indices: [4, 9, 19, 24, 27, 31, 34, 43, 46, 48]
Popularity Action Indices: [3, 9, 11, 15, 23, 31, 33, 34, 35, 42]
RL Action Indices: [4, 7, 15, 17, 18, 21, 37, 39, 41, 44]
10
reward: 0.7
done: False

Random Action Indices: [10, 11, 15, 19, 34, 35, 36, 38, 41, 49]
Popularity Action Indices: [0, 1, 4, 6, 20, 21, 23, 26, 32, 46]
RL Action Indices: [0, 7, 9, 15, 17, 29, 41, 44, 47, 49]
10
reward: 0.4
done: False

Random Action Indices: [5, 8, 10, 11, 16, 19, 21, 32, 34, 47]
Popularity Action Indices: [0, 2, 9, 10, 16, 18, 21, 27, 44, 47]
RL Action Indices: [0

In [22]:
def evaluate_recommender(recommender, env, num_episodes=100, num_recommendations=10):
    """Run a recommender for several episodes and report the mean episode return.

    Args:
        recommender: Object exposing ``recommend(state, num_recommendations=...)``.
        env: Object exposing ``reset() -> (state, info)`` and
            ``step(action) -> (state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to play.
        num_recommendations: Forwarded to ``recommender.recommend`` at every step.

    Returns:
        The mean, over episodes, of the summed per-step rewards. The same value
        is also printed.
    """
    total_rewards = []
    for _ in range(num_episodes):
        state, info = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = recommender.recommend(state, num_recommendations=num_recommendations)
            state, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            # An episode is over when it terminates OR is truncated; checking
            # only the first flag would never leave a truncated episode.
            done = terminated or truncated
        total_rewards.append(total_reward)
    average_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward

In [21]:
reward.shape

(5,)

## Result Analysis

In [5]:
import pandas as pd
metrics = pd.read_csv('../src/metrics/recommender_comparison.csv')
metrics

Unnamed: 0,Recommender,User,Average Reward
0,random,cheap_seeker,1.861
1,popularity,cheap_seeker,1.843
2,rl,cheap_seeker,1.834
3,random,brand_lover,1.754
4,popularity,brand_lover,1.65
5,rl,brand_lover,1.767
6,random,value_optimizer,1.743
7,popularity,value_optimizer,1.974
8,rl,value_optimizer,1.667
9,random,familiarity_seeker,0.014


## Users

In [1]:
import sys
sys.path.append('..')  # Add the parent directory to the Python path

In [2]:
from src.users import (    
    CheapSeekerUser,
    BrandLoverUser,
    RandomChooserUser,
    ValueOptimizerUser,
    FamiliaritySeekerUser,
    FreshnessLookerUser
)
from src.utils import load_catalog

In [3]:
catalog = load_catalog('../data/catalog.csv')
catalog.head()

Unnamed: 0,product_id,name,category,subcategory,price,quality_score,brand,color,popularity,release_date,description,days_since_release
0,1,Chair TO##,Home,Chair,17.17,0.834,BrandG,White,76,2025-04-30,Send situation town sea media wonder party fee...,75
1,2,Perfume MX##,Beauty,Perfume,64.88,0.985,BrandL,White,92,2025-04-21,Radio sense leave real knowledge four institut...,84
2,3,Tennis Racket MS##,Sports,Tennis Racket,23.49,0.717,BrandL,Green,70,2024-06-14,World article describe answer economy enjoy do...,395
3,4,Novel PB##,Books,Novel,81.27,0.112,BrandI,Black,27,2023-11-12,Dog reflect explain program check letter possi...,610
4,5,Lamp BA##,Home,Lamp,28.42,0.029,BrandE,Black,17,2025-05-17,Stand guy argue interesting hot magazine any l...,58


In [4]:
catalog.describe()

Unnamed: 0,product_id,price,quality_score,popularity,release_date,days_since_release
count,250.0,250.0,250.0,250.0,250,250.0
mean,125.5,40.91148,0.525812,62.796,2024-06-06 10:50:52.800000,402.548
min,1.0,4.01,0.001,12.0,2023-06-07 00:00:00,55.0
25%,63.25,18.5575,0.25975,42.0,2023-12-14 00:00:00,220.25
50%,125.5,30.14,0.5305,65.0,2024-06-11 00:00:00,398.0
75%,187.75,51.83,0.7905,83.0,2024-12-05 18:00:00,578.0
max,250.0,196.45,1.0,110.0,2025-05-20 00:00:00,768.0
std,72.312977,32.573108,0.289574,24.0136,,207.220562


In [7]:
BRANDS = [f"Brand{chr(i)}" for i in range(65, 80)]  # A–O
COLORS = ["White", "Black", "Red", "Blue", "Green", "Yellow"]
# Each weight is a uniform draw in [0, 1) raised to this power, which pushes most
# weights toward 0 and leaves only a few large ones.
power = 5
# Seeded generator so the weights are identical on every Restart & Run All.
rng = np.random.default_rng(42)
brand_weights = {brand: rng.random() ** power for brand in BRANDS}
color_weights = {color: rng.random() ** power for color in COLORS}
color_weights

{'White': 0.1357403731166312,
 'Black': 0.0023929300541853066,
 'Red': 0.03668010326474126,
 'Blue': 0.011758696594294571,
 'Green': 0.35670961655229755,
 'Yellow': 0.22685054364063406}

In [5]:
brand_weights

{'BrandA': 0.5729550570477008,
 'BrandB': 0.004980885074884081,
 'BrandC': 1.516876189441327e-10,
 'BrandD': 0.260225277118719,
 'BrandE': 0.0738844694857421,
 'BrandF': 0.05370536195408048,
 'BrandG': 0.02256994679147764,
 'BrandH': 0.07540303062942973,
 'BrandI': 0.3251875774469237,
 'BrandJ': 0.001179427675238534,
 'BrandK': 0.0014212710996049664,
 'BrandL': 0.10840986357263528,
 'BrandM': 0.0013818673721953775,
 'BrandN': 2.098993484748978e-05,
 'BrandO': 0.2941492051729788}

In [19]:
# cheap_seeker = CheapSeekerUser("user_A", 0.92, 0.95)
# brand_lover = BrandLoverUser("user_B", 0.4, 0.6, brand_weights, color_weights)
# value_optimizer = ValueOptimizerUser("user_C", 0.2, 0.4)
# familiarity_seeker = FamiliaritySeekerUser("user_D", 0.6, 0.8)
# random_chooser = RandomChooserUser("user_E", 0.85, 0.97)
freshness_looker = FreshnessLookerUser("freshness_looker", 0.40, 0.65, decay_rate=0.005)

In [None]:
# No live cell in this section creates `familiarity_seeker`, so build it here to keep
# the cell runnable on a fresh kernel.
# NOTE(review): the arguments are copied from a commented-out constructor call in this
# notebook -- confirm they still match FamiliaritySeekerUser's signature.
familiarity_seeker = FamiliaritySeekerUser("user_D", 0.6, 0.8)

window_size = 15  # number of catalog rows shown to the user per call
start = 0
end = 240
while start < end:
    # Advance by a random 1-9 rows; this is always less than window_size, so
    # consecutive windows overlap.
    step = np.random.randint(1, 10)
    clicked_items, bought_items = familiarity_seeker.react(catalog.iloc[start:start + window_size])
    start += step


In [20]:
# clicked_items, bought_items = cheap_seeker.react(catalog_df[:1000])
# clicked_items, bought_items = brand_lover.react(catalog_df[:1000])
# clicked_items, bought_items = value_optimizer.react(catalog_df[:1000])
# clicked_items, bought_items = familiarity_seeker.react(catalog_df[:1000])
# clicked_items, bought_items = random_chooser.react(catalog_df[:1000])
clicked_items, bought_items = freshness_looker.react(catalog[:1000])

In [21]:
ctr = sum(clicked_items) / len(clicked_items)
btr = sum(bought_items) / len(bought_items)
print(f"CTR: {ctr:.2f}, BTR: {btr:.2f}")

CTR: 0.20, BTR: 0.12


## Env Debugging

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('..')  # Add the parent directory to the Python path

from src.users import (
    CheapSeekerUser,
    BrandLoverUser,
    RandomChooserUser,
    ValueOptimizerUser,
    FamiliaritySeekerUser
)

from src.recommenders import (
    RandomRecommender,
    PopularityRecommender
)

from src.env import ShopEnv
from src.utils import (
    load_catalog,
    username_to_user
)
from src.data.encoders import encode_items_with_embeddings, user_to_one_hot
from src.config import Config

In [2]:
config = Config()
# user_params = config.get("user_params")
# username_to_user = {
#     user: globals()[snake_case_to_camel_case(user) + 'User'](user, **params)
#     for user, params in user_params.items()
# }
name_to_recommender = {
    "random": RandomRecommender(),
    "popularity": PopularityRecommender()
}

In [3]:
catalog = load_catalog('../data/catalog.csv')
catalog.head()

Unnamed: 0,product_id,name,category,subcategory,price,quality_score,brand,color,popularity,release_date,description,days_since_release
0,1,Chair TO##,Home,Chair,17.17,0.834,BrandG,White,76,2025-04-30,Send situation town sea media wonder party fee...,72
1,2,Perfume MX##,Beauty,Perfume,64.88,0.985,BrandL,White,92,2025-04-21,Radio sense leave real knowledge four institut...,81
2,3,Tennis Racket MS##,Sports,Tennis Racket,23.49,0.717,BrandL,Green,70,2024-06-14,World article describe answer economy enjoy do...,392
3,4,Novel PB##,Books,Novel,81.27,0.112,BrandI,Black,27,2023-11-12,Dog reflect explain program check letter possi...,607
4,5,Lamp BA##,Home,Lamp,28.42,0.029,BrandE,Black,17,2025-05-17,Stand guy argue interesting hot magazine any l...,55


In [None]:
env = ShopEnv(catalog, username_to_user['cheap_seeker'])
state, info = env.reset()

In [17]:
state

{'user': array([1, 0, 0, 0, 0], dtype=int8),
 'candidates_cat_features': array([[ 2, 17,  9,  3],
        [ 3, 11,  7,  1],
        [ 2, 17, 13,  5],
        [ 4,  3,  7,  0],
        [ 2, 17,  2,  1],
        [ 3, 16, 14,  1],
        [ 3, 16, 14,  5],
        [ 4, 10,  4,  0],
        [ 1, 13, 13,  2],
        [ 2,  9,  5,  1],
        [ 6, 15, 13,  0],
        [ 5, 20,  0,  1],
        [ 6, 15, 14,  4],
        [ 3, 11, 14,  4],
        [ 3,  7, 13,  2],
        [ 0, 12, 14,  0],
        [ 5, 19,  1,  3],
        [ 6,  6,  6,  3],
        [ 0, 12,  7,  2],
        [ 4, 18,  1,  0],
        [ 5, 20,  5,  5],
        [ 6, 15,  7,  2],
        [ 2,  8,  7,  0],
        [ 4, 10,  6,  0],
        [ 6, 15,  5,  4],
        [ 4, 10,  9,  3],
        [ 2, 17,  8,  5],
        [ 4, 10, 13,  0],
        [ 3, 11,  6,  4],
        [ 4, 18, 12,  4],
        [ 1,  1,  2,  3],
        [ 3,  7,  4,  2],
        [ 6, 15, 12,  4],
        [ 5,  0, 14,  3],
        [ 1, 13,  3,  3],
        [ 4, 18,  

In [14]:
action = name_to_recommender['random'].recommend(state, num_recommendations=10)
state, reward, done, truncated, info = env.step(action)
print("Reward:", reward)
print("Done:", done)

Reward: 2.4000000000000004
Done: True


In [15]:
state

{'user': array([1, 0, 0, 0, 0], dtype=int8),
 'candidates_cat_features': array([[ 3,  7,  4,  2],
        [ 6,  2,  7,  1],
        [ 2,  8,  4,  4],
        [ 2,  9,  7,  1],
        [ 0, 14,  4,  3],
        [ 3, 11,  7,  2],
        [ 5, 20,  0,  4],
        [ 1, 13,  8,  0],
        [ 6, 15,  0,  5],
        [ 1, 13,  3,  3],
        [ 1,  1, 14,  2],
        [ 5,  0,  7,  5],
        [ 0, 14, 12,  4],
        [ 6,  2, 11,  1],
        [ 2, 17,  8,  1],
        [ 4, 18, 13,  3],
        [ 0, 12,  0,  0],
        [ 6,  2,  7,  0],
        [ 4,  3,  1,  4],
        [ 4, 10, 13,  0],
        [ 0, 14, 14,  2],
        [ 3, 16,  4,  2],
        [ 6,  6,  6,  3],
        [ 1, 13, 13,  2],
        [ 5,  0,  9,  3],
        [ 4, 18, 11,  5],
        [ 4, 10,  0,  1],
        [ 6, 15,  0,  0],
        [ 5, 19, 12,  0],
        [ 6, 15,  7,  2],
        [ 0,  5,  7,  2],
        [ 5, 20,  5,  5],
        [ 1,  1,  7,  2],
        [ 4,  3,  7,  0],
        [ 0, 12, 14,  1],
        [ 6, 15, 1

In [8]:
info

{'recommended_items':    product_id             name     category subcategory   price  \
 0          31     Perfume ZP##       Beauty     Perfume   11.40   
 1          36     Perfume ZJ##       Beauty     Perfume   94.44   
 2          62       Chair GO##         Home       Chair   44.33   
 3         102     Perfume UO##       Beauty     Perfume  176.48   
 4         107  Headphones QE##  Electronics  Headphones   42.33   
 5         169       Jeans SD##     Clothing       Jeans   25.30   
 6         184    Cookbook QA##        Books    Cookbook   55.99   
 7         218    Yoga Mat AP##       Sports    Yoga Mat   28.84   
 8         227        Lamp JN##         Home        Lamp   30.15   
 9         241  Basketball CR##       Sports  Basketball   18.14   
 
    quality_score   brand  color  popularity release_date  \
 0          0.752  BrandF  White          97   2024-11-23   
 1          0.825  BrandB  Green          85   2024-03-31   
 2          0.960  BrandK  White          91  

In [32]:
num_episodes = 2
for _ in range(num_episodes):
    state, info = env.reset()
    done = False
    while not done:
        action = name_to_recommender['random'].recommend(state, num_recommendations=10)
        state, reward, done, truncated, info = env.step(action)
        print("Reward:", reward)
        print("Done:", done)
        print()

Reward: 2.1
Done: False

Reward: 1.5
Done: False

Reward: -0.1
Done: False

Reward: 1.2000000000000002
Done: False

Reward: 0.6000000000000001
Done: True

Reward: 0.9000000000000001
Done: False

Reward: -0.1
Done: False

Reward: 1.2000000000000002
Done: False

Reward: 1.5
Done: False

Reward: 1.2000000000000002
Done: False

Reward: 1.2000000000000002
Done: True



In [22]:
env.done

False

In [24]:
info

{}

In [35]:
24_576 / 81

303.4074074074074

In [33]:
state.keys()

dict_keys(['user', 'candidates_cat_features', 'candidates_num_features', 'history_n_last_click_item_cat_features', 'history_n_last_click_item_num_features', 'history_n_last_click_items_mask'])