In [1]:
import gym
from gym import spaces
import numpy as np
import random

# Simulate fetching products from a database or external source
def fetch_products_from_database():
    """Return the hard-coded product catalog that stands in for a database query.

    Returns:
        A new list of ten dicts, each with an integer ``id`` (0 through 9, in
        order) and the integer ``category_id`` assigned to that product.
    """
    # Category of each product, indexed by product id.
    category_by_product = (0, 1, 0, 1, 2, 0, 2, 1, 0, 2)
    return [
        {'id': product_id, 'category_id': category_id}
        for product_id, category_id in enumerate(category_by_product)
    ]

class CustomEnv(gym.Env):
    """Example gym environment whose action is a fixed-size tuple of product picks.

    Each action is a tuple of ``max_products_per_action`` dicts, every dict
    holding a ``product_id`` and a ``category_id``. The observation is a
    placeholder vector of four floats in ``[0, 1]``.

    NOTE: ``product_id`` and ``category_id`` are independent ``Discrete``
    sub-spaces, so a sampled action can pair a product with a category that
    differs from the one recorded for it in ``self.products``.
    """

    def __init__(self):
        super().__init__()

        # Fetch products dynamically
        self.products = fetch_products_from_database()
        self.num_products = len(self.products)
        # Discrete(n) covers 0..n-1, so the largest category id plus one is
        # the number of categories the space must allow.
        self.num_categories = max(product['category_id'] for product in self.products) + 1
        self.max_products_per_action = 3

        # Define action space as a list of products, each with product_id and category_id
        product_space = spaces.Dict({
            'product_id': spaces.Discrete(self.num_products),     # Product IDs
            'category_id': spaces.Discrete(self.num_categories)    # Category IDs
        })

        # Define a Tuple space to hold multiple products
        self.action_space = spaces.Tuple(
            [product_space] * self.max_products_per_action  # List of products
        )

        # Define observation space (example)
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=float)

        # Current observation vector; initialised here so step() also works
        # when it is called before the first reset().
        self._state = self._initial_state()

    def _initial_state(self):
        """Return a fresh all-zero state matching the observation space."""
        return np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)

    def _get_obs(self):
        """Return a copy of the current state so callers cannot mutate it."""
        return self._state.copy()

    def step(self, action):
        """Apply one action and return ``(obs, reward, done, info)``.

        Args:
            action: An iterable of mappings, each providing ``'product_id'``
                and ``'category_id'`` (the layout produced by
                ``self.action_space.sample()``).

        Returns:
            A 4-tuple of the current observation, the reward (placeholder
            ``0``), the done flag (placeholder ``False``) and an empty info
            dict.
        """
        # action is a tuple of product dictionaries
        for product in action:
            product_id = product['product_id']
            category_id = product['category_id']
            # Implement your logic using product_id and category_id
            print(f"Selected Product ID: {product_id}, Category ID: {category_id}")

        # Read the current observation instead of calling reset(): stepping
        # must not restart the episode or discard accumulated state.
        obs = self._get_obs()
        reward = 0         # Compute your reward here
        done = False       # Determine if the episode is done

        return obs, reward, done, {}

    def reset(self):
        """Reset the environment state and return the initial observation."""
        self._state = self._initial_state()
        return self._get_obs()

    def render(self, mode='human'):
        """Print a placeholder rendering message; ``mode`` is currently unused."""
        # Optional: implement rendering of the environment
        print("Rendering the environment...")

# Build the environment instance used by the demo below
env = CustomEnv()

# Draw one random action from the environment's action space and show it
action_space = env.action_space
sample_action = action_space.sample()
print("Sampled Action:", sample_action)

# Start an episode and show the first observation
obs = env.reset()
print("Initial Observation:", obs)

# Apply the sampled action, then unpack and report the resulting transition
step_result = env.step(sample_action)
obs, reward, done, _ = step_result
print("New Observation:", obs, "Reward:", reward, "Done:", done)


Sampled Action: (OrderedDict([('category_id', 0), ('product_id', 9)]), OrderedDict([('category_id', 0), ('product_id', 7)]), OrderedDict([('category_id', 1), ('product_id', 2)]))
Initial Observation: [0. 0. 0. 0.]
Selected Product ID: 9, Category ID: 0
Selected Product ID: 7, Category ID: 0
Selected Product ID: 2, Category ID: 1
New Observation: [0. 0. 0. 0.] Reward: 0 Done: False
