In [1]:
import sys
import os

# Add the /app/src directory to the Python path
sys.path.append(os.path.abspath('../app/src'))

import gym
from gym import spaces
from gym import Space
import numpy as np
from typing import List
import random
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Now try importing again
from spark.data import loader
from spark.data.models import Customer, Product, Category, Interaction, InteractionType
from spark import utils


In [2]:
from spark.agent.environment import RecommendationEnv

In [3]:

"""
Custom gym environment for product recommendations where states represent customer interactions
actions are products;

Each customer interaction is a state. In the step function, transition of states will be the same 
customer with the new interaction for a product. The transition ends when the customer made a purchase or session ends.

The aim is to maximise rewards that can lead to a purchase.

It is better to have a customer interaction to represent a state rather than a time series of steps leading to a purchase. 
The latter method may have an incomplete where customer exits the application. Also, even if the customer did not purchase,
this information is still valuable for recommendations. Hence every state is a customer interaction.

An addition meta data for each customer will be stored to understand the context of the user. For example, if a product is 
purchase many times, this may factor into preference of the states.
---
Personalization: By incorporating the user ID, the model can tailor recommendations specifically to individual users, allowing 
it to learn unique user preferences and behaviors over time.

VS

Overfitting: If the model learns too much from the user ID directly, it might overfit to individual user patterns, potentially 
missing out on broader trends that could be useful for all users.

SOLUTION
Use Embeddings: Instead of directly using user IDs, consider using an embedding layer that transforms the user ID into a dense vector representation. This approach reduces dimensionality while capturing user-specific features.

Combine Features: Use user ID embeddings in conjunction with other features like user demographics, interaction history, and product attributes. This can create a more holistic view of user preferences.

Regularization: Implement techniques like dropout or weight regularization to mitigate overfitting when using user IDs or their embeddings.

Batch Normalization: Use batch normalization to stabilize learning, especially if the user ID leads to a wide range of outputs.
----
"""
class RecommendationEnv(gym.Env):
    def __init__(self, users:List[Customer], products:List[Product], top_k:int):
        super().__init__()
        
        self.users = users                  # list of users as states
        self.products = products            # products as actions, potential recommendations
        self.top_k = top_k                  # number of recommendations
        self.user_idx = 0           # index of users list, not user_id
        self.current_step = 0               # step is also the interactions list index
        self.categories = loader.load_categories()
        
        # get unique category data
            
        # # number of products as actions
        # self.action_space = self.action_space = spaces.Dict({
        #     'product_idx': spaces.Box(low=0, high=len(self.products), shape=(top_k,), dtype=np.uint8),
        #     'price': spaces.Box(low=0, high=100000, shape=(top_k,), dtype=np.float32),
        #     'category_idx': spaces.Box(low=0, high=len(self.categories), shape=(top_k,), dtype=np.uint8)
        # })
        
        self.action_space = spaces.MultiDiscrete([len(products)] * 10) 
        
        # number of customers as states
        # states are derived from customer profiles and interactioms
        # Users list will keep track of unique users
        # States include subset of features including product, interaction, ratings, and time in one-hot-encoding format
        # States exclude user_ids for policy network generalisation. But internal users list will be used as reference        
        self.observation_space = spaces.Dict({
            'pref_prod': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.float32),
            # 'pref_cat': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.float32),
            'buys': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'views': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'likes': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'ratings': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'product': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'interaction': spaces.Box(low=0, high=1, shape=(len(list(InteractionType)),), dtype=np.uint8),
            'rating': spaces.Discrete(5)
            }) 
            ## add more features like time, ignored recommendtions, engagement etc
            
    # @property
    # def user_observation_space(self):
    #     obs_space = {'pref_prod': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.float32),
    #                 #  'pref_cat': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.float32),
    #                  'buys': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
    #                  'views': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
    #                  'likes': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
    #                  'ratings': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8)}
        
    #     return obs_space
        
    def reset(self):
        self.user_idx = np.random.randint(len(self.users)) # may run throught users one by one
        user = self.users[self.user_idx]
        self.current_step = 0
        return self._get_observation(user) # get current user features as states

    def step(self, rec_products):
        """ randomly interacting with product to mimick real user unpredictable behavious """
        self.current_step += 1
        user = self.users[self.user_idx]
        
        reward = 0
        done = False
        
        # simulate selected recommended product and interaction
        selected_product = random.choice(rec_products)
        random_act = random.choice(list(InteractionType)) # generate random interaction
        random_rating = 0
        
        if random_act == InteractionType.NONE:
            reward = -1 # no interaction, customers not interested in recommendations
        elif random_act ==  InteractionType.VIEW:
            reward = 1
        elif random_act ==  InteractionType.LIKE:
            reward = 3
        elif random_act ==  InteractionType.BUY:
            reward = 20
        elif random_act ==  InteractionType.RATE:
            # generate rating, reward 1-2 is negative 3 neutral and 5 positive
            random_rating = random.randint(0, 5)
            reward = (random_rating - 3) * 2                
        elif random_act ==  InteractionType.SESSION_START:
            reward = 0
        elif random_act ==  InteractionType.SESSION_CLOSE:
            done = True
            reward = -1 # TODO: check if engament is too short
        else:
            reward = 0
        
        # generate random interaction
        new_interaction = Interaction(self.current_step, datetime.now(), user.idx, selected_product, random_act, random_rating)
        # reward = self._calculate_reward(user, product)
        
        return self._update_observation(new_interaction), reward, done, {}

    def _update_observation(self, interaction:Interaction):   
        # update user data     
        user = self.users[self.user_idx]   
        pid = interaction.product_idx    
         
        if interaction == InteractionType.VIEW:
            user.views[pid] += 1
        elif interaction == InteractionType.LIKE:
            user.likes[pid] += 1
        elif interaction == InteractionType.BUY:
            user.buys[pid] += 1
        elif interaction == InteractionType.RATE:
            user.rates[pid] = interaction.value
          
        # update observation based on new data  
        obs = {
                'pref_prod': self._get_product_preferences(user),
                # 'pref_cat': [], # TODOL calculate pref cat
                'buys': utils.normalise(user.buys),
                'views': utils.normalise(user.views),
                'likes': utils.normalise(user.likes),
                'ratings': user.ratings,
                'product': utils.one_hot_encode(pid, len(self.products)),
                'interaction': np.zeros(len(list(InteractionType))),
                'rating': 0 
                # 'interaction': self._get_interaction_observation(interaction),
                # 'rating': interaction.value if interaction.type == InteractionType.RATE else 0 
            }
        
        return obs

    def _get_observation(self, user:Customer): 
        
        obs = {
                'pref_prod': self._get_product_preferences(user),
                # 'pref_cat': [], # TODOL calculate pref cat
                'buys': utils.normalise(user.buys),
                'views': utils.normalise(user.views),
                'likes': utils.normalise(user.likes),
                'ratings': user.ratings,
                'product': np.zeros(len(self.products)),
                'interaction': np.zeros(len(list(InteractionType))),
                'rating': 0 
            }
        
        return obs
        
    def _get_interaction_observation(self, interaction:Interaction):
        idx = list(InteractionType).index(interaction.type)
        size = len(InteractionType)
        
        return utils.one_hot_encode(idx, size)
    
    # calculate preferences based on past interactions
    def _get_product_preferences(self, user:Customer):
        view_prefs = user.views / 20
        purchase_prefs = user.buys
        like_prefs = user.likes / 15

        rating_prefs = user.ratings.copy()
        rating_prefs[rating_prefs > 0] -= 2
        
        product_prefs = view_prefs + purchase_prefs + like_prefs+ rating_prefs
        
        return product_prefs# calculate preferences based on past interactions
    
    # def _get_category_preferences(self, user:Customer):
    #     view_prefs = user.views / 20
    #     purchase_prefs = user.buys
    #     like_prefs = user.likes / 15

    #     rating_prefs = user.ratings.copy()
    #     rating_prefs[rating_prefs > 0] -= 2
        
    #     product_prefs = view_prefs + purchase_prefs + like_prefs+ rating_prefs
        
    #     return product_prefs

    def render(self, mode='human'):
        # user_id = self.current_user
        # user_state = self.user_states[user_id]
        
        # print(f"Current User ID: {user_id}")
        # print("User Interaction Summary:")
        # print(f"Views: {user_state}")  # Assuming views are stored in user_state
        # print(f"Buys: {[i for i in range(self.num_products) if user_state[i] > 1]}")  # Modify as needed

        # # Print the current state (user's views and buys)
        # print(f"Current State (Views/Buys): {user_state}")

        # Print the last action taken (recommended product)
        if hasattr(self, 'last_action'):
            print(f"Recommended Product ID (Last Action): {self.last_action}")
        else:
            print("No product recommended yet.")

        # Optionally, print the reward received for the last action
        if hasattr(self, 'last_reward'):
            print(f"Reward for Last Action: {self.last_reward}")

        print("-----")

    def close(self):
        pass

In [4]:
list(InteractionType)

[<InteractionType.NONE: 'none'>,
 <InteractionType.VIEW: 'view'>,
 <InteractionType.LIKE: 'like'>,
 <InteractionType.BUY: 'buy'>,
 <InteractionType.RATE: 'rate'>,
 <InteractionType.EXIT: 'exit'>,
 <InteractionType.SESSION_START: 'session_start'>,
 <InteractionType.SESSION_CLOSE: 'session_close'>]

In [5]:
products = loader.load_products()
customers = loader.load_customers(include_interactions=True)

In [6]:
for customer in customers:
    customer.views = np.zeros(len(products), dtype=np.int8)
    customer.likes = np.zeros(len(products), dtype=np.int8)
    customer.buys = np.zeros(len(products), dtype=np.int8)
    customer.ratings = np.zeros(len(products), dtype=np.int8)
    for interaction in customer.interactions:
        i_type = interaction.type.value    
        product_idx = interaction.product_idx  
        # print(f"customer {customer.idx} interaction {type} product {product_idx}")
        if i_type == InteractionType.VIEW.value:
            customer.views[product_idx] += 1
            # print(f"customer {customer.idx} view", customer.views)
        elif i_type == InteractionType.LIKE.value:
            customer.likes[product_idx] += 1
            # print(f"customer {customer.idx} like", customer.likes)
        elif i_type == InteractionType.BUY.value:
            customer.buys[product_idx] += 1
            # print(f"customer {customer.idx} buy", customer.buys)
        elif i_type == InteractionType.RATE.value:
            customer.ratings[product_idx] = interaction.value
            # print(f"customer {customer.idx} rate", customer.rates)

In [7]:
len(InteractionType)

8

In [8]:
random.choice(list(InteractionType))

<InteractionType.LIKE: 'like'>

In [9]:
# test the env
from gym.wrappers import FlattenObservation
env = RecommendationEnv(customers, products, top_k=10)

In [10]:

# env.test_simiulate_interaction(products[:10])

In [11]:

# env = FlattenObservation(env)
# env.observation_space
random.choice(list(InteractionType))

<InteractionType.VIEW: 'view'>

In [12]:

# print(type(env.observation_space))  # Output: <class 'int'>

# def get_interaction_observation(interaction:Interaction):
#     idx = list(InteractionType).index(interaction.type)
#     size = len(InteractionType)
#     # print(idx)
    
def get_interaction_observation(interaction:Interaction):
        idx = list(InteractionType).index(interaction.type)
        size = len(InteractionType)
        return utils.one_hot_encode(idx, size)
    
get_interaction_observation(customers[0].interactions[0])

array([0, 0, 0, 1, 0, 0, 0, 0], dtype=uint8)

In [13]:
from stable_baselines3 import PPO
# policy_kwargs = dict(
#     net_arch=[dict(pi=[256, 256], vf=[256, 256])],
#     activation_fn=nn.ReLU,
# )

model = PPO(
    policy="MultiInputPolicy",
    env=env,
    verbose=1,
    learning_rate=3e-4,
    batch_size=64,
    n_steps=10,
    gamma=0.99,
    gae_lambda=0.95,
    ent_coef=0.0,
    # tensorboard_log=tensorboard_log_dir,
    # policy_kwargs=policy_kwargs,
)

model.learn(total_timesteps=50)  # Adjust based on your needs

  if not hasattr(tensorboard, "__version__") or LooseVersion(
  np.bool8: (False, True),
2024-11-05 22:05:50.894097: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  np.bool8: (False, True),
  from jax import xla_computation as _xla_computation


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=10 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.5      |
|    ep_rew_mean     | -1       |
| time/              |          |
|    fps             | 273      |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 10       |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.6       |
|    ep_rew_mean          | -0.8      |
| time/                   |           |
|    fps                  | 125       |
|    iterations           | 2         |
|    time_elapsed         | 0         |
|    total_timesteps      | 20        |
| train/                  |           |
|    approx_kl            | 0.0939722 |
|    clip_fraction        | 0.45      |
|    clip_range           | 0.2       |
|    entropy_loss         | -56.9     |
|    explained_variance   | 0.165     |
|    learning_rate        | 0.0003    |
|    loss           

<stable_baselines3.ppo.ppo.PPO at 0x7f8208c34040>

In [14]:
obs = env.reset()
obs

{'pref_prod': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [38]:
random.randint(0, 5)

4

In [15]:
obs = env.reset()
for _ in range(10):  # Make 10 recommendations
    action, _states = model.predict(obs)
    obs, rewards, done, _ = env.step(action)
    print("Recommended products:", action)

Recommended products: [110 293 162 141 190 159 165 264  52 153]
Recommended products: [162  79 200 200 129  18  68 153 261  90]
Recommended products: [ 69  95 283 198  57  81 235  66 257 147]
Recommended products: [250 176 259 232 109  74 239  23 146  96]
Recommended products: [101  51 244 175 137 194  72  24 142 254]
Recommended products: [248 162 103 273 241  74  13  51 132 125]
Recommended products: [207 176 255 170  98   1 140 215 105   6]
Recommended products: [149 185 203 207 134 223 143  75 240 202]
Recommended products: [ 41  57  77  15  14 258 192  76  77  84]
Recommended products: [ 56 163  44 284 291   7 121 280 256  10]


In [39]:
obs

{'pref_prod': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [16]:
# Using Stable Baselines DQN
from stable_baselines3 import DQN

# Intialize the model with the ML Policy and Environment
model = DQN("MlpPolicy", env, verbose=2, tensorboard_log="Weights/tic_tac_toe_weights_sb3")

# Start training
model.learn(total_timesteps=500000)

#Save the trained model
model.save(f"{weights_path}/Tic_Tac_Toe_trained_dqn_model")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




AssertionError: The algorithm only supports (<class 'gymnasium.spaces.discrete.Discrete'>,) as action spaces but MultiDiscrete([295 295 295 295 295 295 295 295 295 295]) was provided