In [1]:
import gym
from gym import spaces
import numpy as np
import pandas as pd


In [2]:

class ProductRecommendationEnv(gym.Env):
    """Single-step product recommendation environment.

    Each episode draws one user row at random as the observation, the agent
    proposes a slate of product indices, and the episode ends after that one
    step with a scalar reward.

    Parameters
    ----------
    user_features : pandas.DataFrame
        One row per user; a sampled row (flattened) is the observation.
        The observation space is declared as ``Box(0, 1)``, so the values are
        assumed to be numeric and already scaled to [0, 1] -- TODO confirm
        for real data.
    product_features : pandas.DataFrame
        One row per product. Only its length is used here: it sets the number
        of choices in every slot of the action space.
    n_recommendations : int, default 10
        Number of product slots in one action (the slate size).

    Raises
    ------
    ValueError
        If either frame is empty or ``n_recommendations`` is less than 1.
    """

    def __init__(self, user_features, product_features, n_recommendations=10):
        super().__init__()

        # Fail early with a clear message instead of failing later inside
        # DataFrame.sample() or the MultiDiscrete constructor.
        if len(user_features) == 0:
            raise ValueError("user_features must contain at least one user")
        if len(product_features) == 0:
            raise ValueError("product_features must contain at least one product")
        if n_recommendations < 1:
            raise ValueError("n_recommendations must be >= 1")

        self.user_features = user_features  # User states
        self.product_features = product_features  # Product states
        self.n_recommendations = n_recommendations

        # One discrete choice over all products per recommendation slot.
        # NOTE: slots are independent, so one action may repeat a product.
        self.action_space = spaces.MultiDiscrete(
            [len(product_features)] * n_recommendations
        )
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(user_features.shape[1],), dtype=np.float32
        )

        self.state = None

    def reset(self):
        """Start a new episode by sampling one user at random.

        Returns
        -------
        numpy.ndarray
            The sampled user's features, shape ``(n_user_features,)``, cast to
            ``float32`` so it matches the dtype declared by
            ``observation_space``.
        """
        sampled_user = self.user_features.sample(n=1)
        self.state = sampled_user.to_numpy(dtype=np.float32).ravel()
        return self.state

    def step(self, action):
        """Score one slate of recommendations and end the episode.

        Parameters
        ----------
        action : array-like of int
            Product indices, one per recommendation slot.

        Returns
        -------
        tuple
            ``(observation, reward, done, info)``; ``done`` is always True
            because an episode is a single recommendation round, and ``info``
            is an empty dict.
        """
        selected_products = action
        rewards = self.calculate_rewards(selected_products)
        done = True  # End after one recommendation round
        return self.state, rewards, done, {}

    def calculate_rewards(self, selected_products):
        """Return the reward for the recommended slate.

        Placeholder: the result is a uniform random number in [0, 1) and does
        not depend on ``selected_products`` or on the current user.
        TODO: replace with a reward based on user preferences.
        """
        return np.random.rand()  # Placeholder reward

# Example user and product data (synthetic). A local, seeded generator is used
# so that Restart & Run All reproduces the same frames without touching
# NumPy's global random state.
SEED = 42
N_USERS, N_USER_FEATURES = 100, 5
N_PRODUCTS = 10000
N_CATEGORIES = 10

data_rng = np.random.default_rng(SEED)
user_data = pd.DataFrame(data_rng.random((N_USERS, N_USER_FEATURES)))  # uniform in [0, 1)
product_data = pd.DataFrame({
    'id': range(N_PRODUCTS),
    'category': data_rng.integers(0, N_CATEGORIES, N_PRODUCTS),  # high bound is exclusive
    'price': data_rng.random(N_PRODUCTS),
})

env = ProductRecommendationEnv(user_data, product_data)

In [3]:
# Start an episode and show the observation; reset() samples one random row
# of the user-feature frame and returns it flattened.
state = env.reset()
print(state)

[0.44282004 0.40231971 0.70593549 0.89240215 0.15477688]


In [6]:
# Display the observation space declared in __init__: a Box whose length
# equals the number of user-feature columns.
env.observation_space

Box(0.0, 1.0, (5,), float32)

In [7]:
# Display the action space declared in __init__: one discrete product index
# per recommendation slot.
env.action_space

MultiDiscrete([10000 10000 10000 10000 10000 10000 10000 10000 10000 10000])

In [5]:
from stable_baselines3 import PPO

# Build a PPO agent with an MLP policy on the recommendation env, then train.
# NOTE(review): the training log reports 2048 total timesteps although only 10
# are requested here, so the effective minimum appears to be one full rollout.
# Raise total_timesteps well beyond that for any real training run.
model = PPO(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=10)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.513    |
| time/              |          |
|    fps             | 172      |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 2048     |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7fbcf2287970>

In [None]:
# Produce 10 recommendation slates with the trained policy.
# Every episode in this env ends after one step (done is always True), so the
# env is reset after each slate; otherwise the loop would keep stepping a
# finished episode and reuse the same user for all 10 slates.
obs = env.reset()
for _ in range(10):  # Make 10 recommendations
    action, _states = model.predict(obs)
    obs, rewards, done, _info = env.step(action)
    print("Recommended products:", action)
    if done:
        obs = env.reset()  # draw the next user


Recommended products: [5143 2656 8058 8197 5477 5651 1814 3042 9580 5617]
Recommended products: [3958 2760 3248 7390 4950 8258 9961 9557 3500 4737]
Recommended products: [2119 7877 8822 4921 3774 6374 2388 9029 6180 4565]
Recommended products: [9465 6478 7364  457 4401 4003  839   25 2743  205]
Recommended products: [3530 6474 9651 8949 6604 3473  786 1065 1680 3383]
Recommended products: [4650 6539  308 9588 6445 2129 4320 5657 8752  841]
Recommended products: [5510 4323  535  601 7399 8201 8968 5519 3537 2071]
Recommended products: [3746 9868 1863  286  705 8547 1382 9893 3557  879]
Recommended products: [6488 6594 7082 2776 6061 9462 2761 1939 9509  478]
Recommended products: [8032 8421 8469 6839  729 3608 4049 2661 6853 3166]
