## Update and install packages

In [3]:
# Install system packages (swig, cmake, ffmpeg, freeglut3-dev, xvfb) and the Python
# dependencies, including everything listed in ../requirements.txt.
# Only needed the first time the notebook runs in a fresh environment; comment these
# lines out afterwards to avoid re-installing on every run.
!apt-get update && apt-get install swig cmake -y
!apt-get update && apt-get install ffmpeg freeglut3-dev xvfb -y
!pip install box2d-py
!pip install moviepy
!pip install -r ../requirements.txt
!pip install ipywidgets


Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [128 kB]      
Get:2 http://archive.ubuntu.com/ubuntu focal InRelease [265 kB]                
Get:3 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1276 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease [128 kB]
Get:5 http://archive.ubuntu.com/ubuntu focal-backports InRelease [128 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal/multiverse amd64 Packages [177 kB]
Get:7 http://archive.ubuntu.com/ubuntu focal/main amd64 Packages [1275 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [4109 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal/restricted amd64 Packages [33.4 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal/universe amd64 Packages [11.3 MB]
Get:11 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [4090 kB]
Get:12 http://security.ubuntu.com/ubuntu focal-security/multiverse amd64 Packages [30.9 kB]
Get:13 http://archiv

## Import Libraries

In [4]:
# Import required libraries and modules
import os
import signal
import subprocess
import gymnasium as gym
import stable_baselines3
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import torch
from pathlib import Path
import base64
from IPython import display as ipythondisplay
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.logger import configure
import tensorboard


import sys
import os

# Add the /app/src directory to the Python path
sys.path.append(os.path.abspath('../app/src'))

import gymnasium as gym
from gymnasium import spaces
from gymnasium import Space
import numpy as np
from typing import List
import random
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import csv

# Now try importing again
from spark.data import loader
from spark.data.models import Customer, Product, Category, Interaction, InteractionType
from spark import utils


In [5]:
# Print the installed gymnasium and Stable-Baselines3 versions
# (the f-string `=` specifier prints both the expression and its value)
print(f"{gym.__version__=}")
print(f"{stable_baselines3.__version__=}")


gym.__version__='1.0.0'
stable_baselines3.__version__='2.4.0a11'


## Settings

### Tuning parameters

In [15]:
# Hyper-parameters for training.
# NOTE(review): param_clip_range is not referenced anywhere else in the visible notebook
# (it is not passed to the A2C constructor) - confirm whether it is still needed.
param_clip_range = 0.2
param_learning_rate = 0.001      # optimiser learning rate
param_gamma = 0.995              # discount factor
param_gae_lambda = 0.95          # GAE lambda
param_n_steps = 2048             # environment steps collected per update
param_ent_coef = 0.01            # entropy bonus coefficient
param_total_timesteps = 500000   # total environment steps to train for

# Checkpoint ten times over the run. Integer division keeps the interval an int
# (the checkpoint callback declares `save_interval: int`); the max() guard avoids a
# zero interval when the total is smaller than 10.
save_interval = max(1, param_total_timesteps // 10)

### Other settings

In [16]:
# Identifiers for the environment: display name and the prefix used in paths/files
env_name = 'Spark'
env_prefix = 'spark'

# Output tree: <base_dir>/output/<env_prefix>/{logs,models,videos}
base_dir = '.'
output_dir = os.path.join(base_dir, 'output')
env_dir = os.path.join(output_dir, env_prefix)
logs_dir, models_dir, videos_dir = (
    os.path.join(env_dir, leaf) for leaf in ('logs', 'models', 'videos')
)

# Create every output directory up front (no error if it already exists)
for _out_dir in (logs_dir, models_dir, videos_dir):
    os.makedirs(_out_dir, exist_ok=True)

# TensorBoard run name and the prefix used for saved model files
tb_log_name = 'A2C'
model_prefix = 'a2c'
model_name_final = f"{model_prefix}_model_final"

# Echo the resolved directories, one per line
print(logs_dir, models_dir, videos_dir, sep="\n")

./output/spark/logs
./output/spark/models
./output/spark/videos


## Callbacks and Directory Setup

In [17]:
# class to manage display
class Display:
    """Run a display-server command (e.g. Xvfb) in the background.

    Args:
        command: Command line to launch; it is split on whitespace and passed to
            ``subprocess.Popen``.
        display: Value written to the ``DISPLAY`` environment variable once the
            process has been started. Defaults to ``':1'``.
    """

    def __init__(self, command: str, display: str = ':1'):
        self.command = command
        self.display = display
        # Set by start(); stays None until the process has been launched.
        self.process = None

    def start(self):
        """Launch the command and point ``DISPLAY`` at the configured display."""
        self.process = subprocess.Popen(self.command.split())
        os.environ['DISPLAY'] = self.display

    def terminate(self):
        """Terminate the background process; a no-op if it was never started."""
        if self.process is not None:
            self.process.terminate()
        
# callback for saving model at regular intervals
class SaveOnIntervalCallback(BaseCallback):
    """Save the model each time another ``save_interval`` timesteps have elapsed.

    A checkpoint is written as soon as ``num_timesteps`` crosses a multiple of
    ``save_interval`` rather than only when it lands exactly on one, so checkpoints
    are not skipped when the timestep counter advances by more than one per call.

    Args:
        save_interval: Number of timesteps between checkpoints; must be positive.
        save_path: Directory the checkpoint files are written to.
        verbose: When > 0, print the path of every checkpoint.

    Raises:
        ValueError: If ``save_interval`` is not positive.
    """

    def __init__(self, save_interval: int, save_path: str, verbose=1):
        super().__init__(verbose)
        if save_interval <= 0:
            raise ValueError(f"save_interval must be positive, got {save_interval}")
        self.save_interval = save_interval
        self.save_path = save_path
        # Number of whole intervals already accounted for.
        self._intervals_done = 0

    def _on_training_start(self) -> None:
        # Count from the timestep the model is already at, so resuming a run does not
        # trigger an immediate checkpoint on the first step.
        self._intervals_done = self.model.num_timesteps // self.save_interval

    def _on_step(self) -> bool:
        intervals_elapsed = self.num_timesteps // self.save_interval
        if intervals_elapsed > self._intervals_done:
            self._intervals_done = intervals_elapsed
            # File name embeds the notebook-level `model_prefix` and the current timestep.
            save_file = os.path.join(self.save_path, f'{model_prefix}_model_{self.num_timesteps}')
            self.model.save(save_file)
            if self.verbose > 0:
                print(f'Saving model to {save_file}.zip')
        return True

        
# Custom callback to log rewards for plotting
class RewardLoggingCallback(BaseCallback):
    """Record the per-step reward and per-episode length of the first environment.

    Rewards are kept in ``self.rewards`` and also appended, one row per step, to a CSV
    file with the columns ``Timestep`` and ``Reward``. Completed episode lengths are
    kept in ``self.episode_lengths``.

    Args:
        log_file: Path of the CSV file. It is created (or truncated) and given its
            header row when the callback is constructed.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, log_file="training_rewards_dqn.csv", verbose=1):
        super().__init__(verbose)
        self.rewards = []
        self.episode_lengths = []  # length of every finished episode
        self.current_episode_length = 0
        self.log_file = log_file

        # Start a fresh log containing only the header row
        with open(self.log_file, mode="w", newline="") as file:
            csv.writer(file).writerow(["Timestep", "Reward"])

    def _on_step(self) -> bool:
        reward = self.locals.get("rewards")
        done = self.locals.get("dones")

        if reward is not None:
            reward_value = reward[0]  # only the first environment is tracked
            self.rewards.append(reward_value)

            # Append immediately so the log survives an interrupted run
            with open(self.log_file, mode="a", newline="") as file:
                csv.writer(file).writerow([self.num_timesteps, reward_value])

        if done is not None:
            # Count every step, including the one that ends the episode; previously the
            # terminal step was skipped and every recorded length was one too short.
            self.current_episode_length += 1
            if done[0]:
                self.episode_lengths.append(self.current_episode_length)
                self.current_episode_length = 0

        return True

## Custom Feature Extractor

In [18]:
# Feature extraction from frames as observations / states
class CustomANN(BaseFeaturesExtractor):
    """Fully connected feature extractor: input -> 256 -> 128 -> ``features_dim``.

    The input width is taken from ``observation_space.shape[0]``, so the observation
    space must expose a one-dimensional ``shape``.
    """

    def __init__(self, observation_space, features_dim=128):
        super().__init__(observation_space, features_dim)

        # Stack Linear+ReLU pairs for the hidden layers, then a final projection
        # whose width must equal features_dim.
        layers = []
        width_in = observation_space.shape[0]
        for width_out in (256, 128):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        layers.append(nn.Linear(width_in, features_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, observations):
        """Map a batch of observations to feature vectors."""
        features = self.net(observations)
        return features
    

## Custom environment

In [19]:

"""
Custom gym environment for product recommendations where states represent customer interactions
actions are products;

Each customer interaction is a state. In the step function, transition of states will be the same 
customer with the new interaction for a product. The transition ends when the customer made a purchase or session ends.

The aim is to maximise rewards that can lead to a purchase.

It is better to let a single customer interaction represent a state rather than a time series of steps leading to a purchase. 
The latter approach may produce an incomplete sequence when the customer exits the application. Also, even if the customer did not purchase,
this information is still valuable for recommendations. Hence every state is a customer interaction.

Additional metadata for each customer will be stored to capture the context of the user. For example, if a product is 
purchased many times, this may factor into the preferences encoded in the states.
---
Personalization: By incorporating the user ID, the model can tailor recommendations specifically to individual users, allowing 
it to learn unique user preferences and behaviors over time.

VS

Overfitting: If the model learns too much from the user ID directly, it might overfit to individual user patterns, potentially 
missing out on broader trends that could be useful for all users.

SOLUTION
Use Embeddings: Instead of directly using user IDs, consider using an embedding layer that transforms the user ID into a dense vector representation. This approach reduces dimensionality while capturing user-specific features.

Combine Features: Use user ID embeddings in conjunction with other features like user demographics, interaction history, and product attributes. This can create a more holistic view of user preferences.

Regularization: Implement techniques like dropout or weight regularization to mitigate overfitting when using user IDs or their embeddings.

Batch Normalization: Use batch normalization to stabilize learning, especially if the user ID leads to a wide range of outputs.
----
"""
class RecommendationEnv(gym.Env):
    """Simulated product-recommendation environment.

    An episode follows one customer, chosen at random in ``reset``. At every step the
    agent proposes ``top_k`` product indices; the environment picks one of them and an
    interaction type at random, weighted by the customer's recorded history, and maps
    the interaction type to a reward. The episode ends when the simulated interaction
    is ``InteractionType.SESSION_CLOSE``.

    Every customer must already carry ``views``, ``likes``, ``buys`` and ``ratings``
    arrays with one entry per product. Simulated VIEW/LIKE/BUY/RATE interactions are
    written back into those arrays, so the customer objects passed in are mutated and
    the changes persist across episodes.

    Args:
        users: Customers to sample episodes from.
        products: Product catalogue; actions are indices into this list.
        top_k: Number of products recommended per step.
    """

    def __init__(self, users:List[Customer], products:List[Product], top_k:int):
        super().__init__()

        self.users = users                  # customers episodes are sampled from
        self.products = products            # products as actions, potential recommendations
        self.top_k = top_k                  # number of recommendations per step
        self.user_idx = 0                   # index into the users list, not a user id
        self.current_step = 0               # step counter, also used as the interaction index
        self.categories = loader.load_categories()

        # One product index per recommendation slot (previously hard-coded to 10 slots).
        self.action_space = spaces.MultiDiscrete([len(products)] * top_k)

        # Observations exclude user ids so the policy can generalise across customers.
        # NOTE(review): the declared bounds/dtypes ([0, 1], uint8) may not match the
        # values actually produced (ratings go up to 5 and the preference sums are not
        # clipped to [0, 1]) - confirm against utils.normalise and the policy's needs.
        self.observation_space = spaces.Dict({
            'pref_prod': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.float32),
            'pref_cat': spaces.Box(low=0, high=1, shape=(len(self.categories),), dtype=np.float32),
            'buys': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'views': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'likes': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'ratings': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'product': spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=np.uint8),
            'interaction': spaces.Box(low=0, high=1, shape=(len(list(InteractionType)),), dtype=np.uint8),
            'rating': spaces.Discrete(6)
            })
            ## add more features like time, ignored recommendations, engagement etc

    def reset(self, seed=None, options=None):
        """Start a new episode for a randomly chosen customer.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        # Let the parent class handle seeding of self.np_random
        super().reset(seed=seed)

        # The customer is drawn from NumPy's global RNG, which `seed()` seeds.
        self.user_idx = np.random.randint(len(self.users))
        self.current_step = 0
        return self._get_observation(self.users[self.user_idx]), {}

    def step(self, rec_products):
        """Simulate the customer's reaction to the recommended products.

        Args:
            rec_products: Sequence of recommended product indices (the action).

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``truncated`` is
            always ``False`` and ``info`` is always empty.
        """
        self.current_step += 1
        user = self.users[self.user_idx]

        reward = 0
        done = False
        rating = 0

        # Pick which recommended product the customer reacts to, and how
        selected_pid, interaction_type = self._simulate_interaction(rec_products)

        if interaction_type == InteractionType.NONE:
            reward = -1  # recommendations ignored
        elif interaction_type == InteractionType.VIEW:
            reward = 3
        elif interaction_type == InteractionType.LIKE:
            reward = 10
        elif interaction_type == InteractionType.BUY:
            reward = 50
        elif interaction_type == InteractionType.RATE:
            # Random rating in 0..5; the reward is the rating minus one (-1..4).
            # NOTE(review): the original comment described 1-2 as negative and 3 as
            # neutral, which does not match `rating - 1` - confirm the intended offset.
            rating = random.randint(0, 5)
            reward = rating - 1
        elif interaction_type == InteractionType.SESSION_START:
            reward = 0
        elif interaction_type == InteractionType.SESSION_CLOSE:
            done = True
            reward = 0  # TODO: check if engagement is too short
        else:
            reward = 0

        new_interaction = Interaction(self.current_step, datetime.now(), user.idx, selected_pid, interaction_type, rating)

        # Remembered so that render() has something to report
        self.last_action = rec_products
        self.last_reward = reward

        return self._update_observation(new_interaction), reward, done, False, {}

    @staticmethod
    def _increment(counts, idx):
        """Add one to ``counts[idx]``, saturating instead of wrapping for integer arrays."""
        dtype = getattr(counts, "dtype", None)
        if dtype is not None and np.issubdtype(dtype, np.integer):
            if counts[idx] >= np.iinfo(dtype).max:
                return
        counts[idx] += 1

    def _build_observation(self, user:Customer, product, interaction, rating):
        """Assemble the observation dict from the customer's history plus the latest event."""
        return {
            'pref_prod': self._get_product_preferences(user),
            'pref_cat': self._get_category_preferences(user),
            'buys': utils.normalise(user.buys),
            'views': utils.normalise(user.views),
            'likes': utils.normalise(user.likes),
            'ratings': user.ratings,
            'product': product,
            'interaction': interaction,
            'rating': rating
        }

    def _update_observation(self, interaction:Interaction):
        """Record ``interaction`` in the current customer's history and return the new observation."""
        user = self.users[self.user_idx]
        pid = interaction.product_idx

        # Dispatch on the interaction's *type*. Comparing the Interaction object itself
        # against the enum never matched, so the history was never updated.
        if interaction.type == InteractionType.VIEW:
            self._increment(user.views, pid)
        elif interaction.type == InteractionType.LIKE:
            self._increment(user.likes, pid)
        elif interaction.type == InteractionType.BUY:
            self._increment(user.buys, pid)
        elif interaction.type == InteractionType.RATE:
            # `ratings` is the array every other method reads
            user.ratings[pid] = interaction.value

        return self._build_observation(
            user,
            product=utils.one_hot_encode(pid, len(self.products)),
            interaction=self._get_interaction_observation(interaction),
            rating=interaction.value if interaction.type == InteractionType.RATE else 0,
        )

    def _get_observation(self, user:Customer):
        """Initial observation for ``user``: history only, no product or interaction yet."""
        return self._build_observation(
            user,
            product=np.zeros(len(self.products), dtype=np.uint8),
            interaction=np.zeros(len(list(InteractionType)), dtype=np.uint8),
            rating=0,
        )

    def _get_interaction_observation(self, interaction:Interaction):
        """One-hot encode the interaction's type by its position in ``InteractionType``."""
        idx = list(InteractionType).index(interaction.type)
        size = len(InteractionType)

        return utils.one_hot_encode(idx, size)

    def _get_product_preferences(self, user:Customer):
        """Per-product preference score derived from the customer's past interactions.

        Score = views / 20 + buys + likes / 15 + (rating - 2 for rated products).
        """
        view_prefs = user.views / 20
        purchase_prefs = user.buys
        like_prefs = user.likes / 15

        # Shift given ratings by -2; unrated products (0) stay at 0
        rating_prefs = user.ratings.copy()
        rating_prefs[rating_prefs > 0] -= 2

        product_prefs = view_prefs + purchase_prefs + like_prefs + rating_prefs

        return product_prefs

    def _get_category_preferences(self, user:Customer):
        """Per-category score: the positive product preferences summed per category, divided by 5."""
        prod_prefs = self._get_product_preferences(user)
        cat_prefs = np.zeros(len(self.categories), np.float32)

        for idx, prod_pref in enumerate(prod_prefs):
            if prod_pref > 0:
                product = self.products[idx]
                cat_idx = product.category.idx
                cat_prefs[cat_idx] += prod_pref  # accumulate favoured products of this category

        cat_prefs = cat_prefs / 5  # scale down

        return cat_prefs

    def _simulate_interaction(self, product_ids):
        """Randomly choose a recommended product and how the customer interacts with it.

        Products are weighted by the customer's product preference plus the preference
        for the product's category; interaction types are weighted by how often the
        customer viewed/liked/bought/rated the chosen product. When no positive weight
        exists, the choice is uniform.

        Returns:
            ``(selected_product_id, selected_interaction_type)``.
        """
        user = self.users[self.user_idx]

        # --- choose the product -------------------------------------------------
        num_products = len(product_ids)
        product_prefs = self._get_product_preferences(user)
        category_prefs = self._get_category_preferences(user)

        # Float scores: an unsigned 8-bit buffer truncated fractional preferences to 0
        # and could not represent negative ones.
        prod_scores = np.zeros(num_products, dtype=np.float64)
        for slot, pid in enumerate(product_ids):
            cid = self.products[pid].category.idx
            prod_scores[slot] = product_prefs[pid] + category_prefs[cid]
        # Probabilities cannot be negative; disliked products get zero weight
        prod_scores = np.clip(prod_scores, 0.0, None)

        score_total = prod_scores.sum()
        if score_total > 0:
            product_probs = prod_scores / score_total
        else:
            product_probs = np.full((num_products,), 1.0 / num_products)

        selected_product_id = np.random.choice(product_ids, p=product_probs)

        # --- choose the interaction type ----------------------------------------
        inter_types = list(InteractionType)
        inter_scores = np.zeros(len(inter_types), dtype=np.float64)

        for idx, inter_type in enumerate(inter_types):
            if inter_type == InteractionType.VIEW:
                inter_scores[idx] = user.views[selected_product_id]
            elif inter_type == InteractionType.LIKE:
                inter_scores[idx] = user.likes[selected_product_id]
            elif inter_type == InteractionType.BUY:
                inter_scores[idx] = user.buys[selected_product_id]
            elif inter_type == InteractionType.RATE:
                inter_scores[idx] = user.ratings[selected_product_id]
        inter_scores = np.clip(inter_scores, 0.0, None)

        if inter_scores.sum() > 0:
            inter_scores[inter_scores == 0] = 1  # baseline weight for types without history
            inter_probs = inter_scores / inter_scores.sum()
        else:
            inter_probs = np.full((len(inter_types),), 1.0 / len(inter_types))

        # Draw an index so the returned value is always the enum member itself
        choice_idx = np.random.choice(len(inter_types), p=inter_probs)
        selected_interaction_type = inter_types[choice_idx]

        return selected_product_id, selected_interaction_type

    def seed(self, seed=None):
        """Seed the environment RNG, Python's ``random`` and NumPy's global RNG.

        Returns:
            A one-element list containing the seed that was used.
        """
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        random.seed(seed)
        np.random.seed(seed)
        return [seed]

    def render(self, mode='human'):
        """Print the most recent action and reward, if a step has been taken."""
        if hasattr(self, 'last_action'):
            print(f"Recommended Product ID (Last Action): {self.last_action}")
        else:
            print("No product recommended yet.")

        if hasattr(self, 'last_reward'):
            print(f"Reward for Last Action: {self.last_reward}")

        print("-----")

    def close(self):
        """Nothing to release."""
        pass

## Load data and initialise the recommendation environment

In [20]:
# Load the product catalogue and the customers; interactions are requested because
# the per-customer history vectors are built from `customer.interactions`.
products = loader.load_products()
customers = loader.load_customers(include_interactions=True)

In [21]:
# Build per-customer history vectors (one entry per product) from the recorded
# interactions: view/like/buy counts and the latest rating given to each product.
n_products = len(products)
for customer in customers:
    # int32 rather than int8: an 8-bit counter silently wraps to a negative value
    # once a product has been interacted with more than 127 times.
    customer.views = np.zeros(n_products, dtype=np.int32)
    customer.likes = np.zeros(n_products, dtype=np.int32)
    customer.buys = np.zeros(n_products, dtype=np.int32)
    customer.ratings = np.zeros(n_products, dtype=np.int32)
    for interaction in customer.interactions:
        # Compare on the enum *values*, as the original cell did
        i_type = interaction.type.value
        product_idx = interaction.product_idx
        if i_type == InteractionType.VIEW.value:
            customer.views[product_idx] += 1
        elif i_type == InteractionType.LIKE.value:
            customer.likes[product_idx] += 1
        elif i_type == InteractionType.BUY.value:
            customer.buys[product_idx] += 1
        elif i_type == InteractionType.RATE.value:
            # A later rating of the same product overwrites the earlier one
            customer.ratings[product_idx] = interaction.value

In [22]:
# Create the environment over the loaded customers and products (top_k=10), then seed
# the env RNG, Python's `random` and NumPy's global RNG so runs are repeatable.
env = RecommendationEnv(customers, products, top_k=10)
env.seed(100)

[100]

## Initialize the A2C agent with specified parameters

In [23]:

# A2C agent on the dict-observation environment. 'MultiInputPolicy' is used because the
# observation space is a spaces.Dict; hyper-parameters come from the tuning cell and
# TensorBoard event files are written under logs_dir.
model = A2C(
    env=env,
    policy='MultiInputPolicy',
    verbose=0,
    learning_rate=param_learning_rate, 
    n_steps=param_n_steps,
    ent_coef=param_ent_coef,
    gamma=param_gamma,
    gae_lambda=param_gae_lambda,
    # policy_kwargs={'features_extractor_class': CustomANN},
    tensorboard_log=logs_dir
)


# # Configure TensorBoard logger
# new_logger = configure(logs_dir, ["tensorboard"])
# model.set_logger(new_logger)

## Model training

In [24]:
# display = Display("Xvfb :1 -screen 0 1024x768x24")
# display.start()

In [None]:
# Checkpoint the model every `save_interval` timesteps during training
save_callback = SaveOnIntervalCallback(save_interval, models_dir)

# Log per-step rewards and episode lengths; the plotting cells further down read
# `reward_callback.rewards` and `reward_callback.episode_lengths`, so this callback
# must be created and passed to learn() for the notebook to run top to bottom.
reward_callback = RewardLoggingCallback(
    log_file=os.path.join(logs_dir, f"training_rewards_{model_prefix}.csv")
)

model.learn(
    total_timesteps=param_total_timesteps,
    progress_bar=True,
    callback=[save_callback, reward_callback],
    tb_log_name=tb_log_name,
)

# Save the final model after training completes
final_model_path = os.path.join(models_dir, model_name_final)
model.save(final_model_path)

## Results rendering

### Logging

In [None]:
# Moving average of rewards
def plot_moving_average_rewards(rewards, window=50):
    """Plot the moving average of per-step rewards.

    Args:
        rewards: Sequence of per-step rewards.
        window: Number of steps averaged per point. It is clamped to the number of
            rewards available, because a window longer than the data does not yield
            a meaningful 'valid' convolution.

    Prints a message and plots nothing when ``rewards`` is empty.
    """
    rewards = np.asarray(rewards, dtype=float)
    if rewards.size == 0:
        print("No rewards recorded; nothing to plot.")
        return

    window = max(1, min(int(window), rewards.size))
    moving_avg_rewards = np.convolve(rewards, np.ones(window) / window, mode="valid")

    plt.figure(figsize=(10, 5))
    plt.plot(moving_avg_rewards, label="Moving Average Reward (Window = {})".format(window))
    plt.xlabel("Timesteps")
    plt.ylabel("Moving Average Reward")
    plt.title("Moving Average of Reward vs. Timesteps")
    plt.legend()
    plt.show()


# Cumulative reward plot
def plot_cumulative_rewards(rewards):
    """Plot the running total of ``rewards`` against the timestep index."""
    running_total = np.asarray(rewards).cumsum()

    plt.figure(figsize=(10, 5))
    plt.plot(running_total, label="Cumulative Reward")
    # Axis labels and title, applied in order
    for annotate, text in (
        (plt.xlabel, "Timesteps"),
        (plt.ylabel, "Cumulative Reward"),
        (plt.title, "Cumulative Reward vs. Timesteps"),
    ):
        annotate(text)
    plt.legend()
    plt.show()


# Plot episode lengths
def plot_episode_lengths(episode_lengths):
    """Plot the length of each episode against the episode index."""
    series_label = "Episode Length"

    plt.figure(figsize=(10, 5))
    plt.plot(episode_lengths, label=series_label)
    # Axis labels and title, applied in order
    for annotate, text in (
        (plt.xlabel, "Episodes"),
        (plt.ylabel, series_label),
        (plt.title, "Episode Length vs. Episodes"),
    ):
        annotate(text)
    plt.legend()
    plt.show()

In [None]:
# Plot moving average of rewards
plot_moving_average_rewards(reward_callback.rewards)

# Plot cumulative rewards
plot_cumulative_rewards(reward_callback.rewards)

# Plot episode lengths
plot_episode_lengths(reward_callback.episode_lengths)

### Video recording functions

In [12]:
# Functions to record videos of the agent playing and display the videos

def show_videos(video_path="", prefix=""):
    """Embed every ``<prefix>*.mp4`` file found in ``video_path`` into the notebook output."""

    def _video_tag(clip):
        # Inline the file as a base64 data URI so the video is stored in the output
        encoded = base64.b64encode(clip.read_bytes()).decode('ascii')
        return '''<video alt="{0}" autoplay
                      loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{1}" type="video/mp4" />
            </video>'''.format(clip, encoded)

    clips = Path(video_path).glob(f"{prefix}*.mp4")
    markup = "<br>".join(_video_tag(clip) for clip in clips)
    ipythondisplay.display(ipythondisplay.HTML(data=markup))


def record_video(env_id, model, video_length=3000, prefix="", video_folder=videos_dir):
    """Roll out ``model`` in a fresh environment and record the run as a video.

    Args:
        env_id: Environment id passed to ``gym.make`` (created with
            ``render_mode='rgb_array'``).
        model: Object exposing ``predict(obs)``; its actions drive the rollout.
        video_length: Number of steps to record.
        prefix: File-name prefix for the recorded video.
        video_folder: Directory the video is written to.
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode='rgb_array')])
    try:
        eval_env = VecVideoRecorder(
            eval_env,
            video_folder=video_folder,
            record_video_trigger=lambda step: step == 0,  # start recording at the first step
            video_length=video_length,
            name_prefix=prefix,
        )
        obs = eval_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Always close, so the environment (and recorder, if it was created) is
        # released even when the rollout raises part-way through.
        eval_env.close()


In [13]:
# Display the video
import os

def get_model_identifiers(models_dir):
    """List the identifier part of every saved model file in ``models_dir``.

    A model file is any entry whose name starts with ``<model_prefix>_model_``; its
    identifier is everything after that marker (e.g. ``'50000.zip'``, ``'final.zip'``).
    Stripping the marker by length keeps this correct when the prefix or the
    identifier itself contains underscores.

    Returns:
        List of identifier strings, in directory-listing order.
    """
    marker = f'{model_prefix}_model_'
    return [name[len(marker):] for name in os.listdir(models_dir) if name.startswith(marker)]

def find_key_identifiers(identifiers):
    """Pick the earliest, middle and last identifiers in training order.

    Identifiers whose stem (the text before the first ``.``) is a number are ordered
    numerically, so ``'50000.zip'`` sorts before ``'100000.zip'``; all other
    identifiers (e.g. ``'final.zip'``) come after the numeric ones. The list is sorted
    in place.

    Args:
        identifiers: Non-empty list of identifier strings.

    Returns:
        Tuple ``(earliest, middle, final)``.

    Raises:
        IndexError: If ``identifiers`` is empty.
    """
    if not identifiers:
        raise IndexError("no model identifiers to choose from")

    def _training_order(identifier):
        stem = str(identifier).split('.', 1)[0]
        if stem.isdigit():
            return (0, int(stem), identifier)
        return (1, 0, identifier)

    identifiers.sort(key=_training_order)
    earliest = identifiers[0]
    final = identifiers[-1]
    middle = identifiers[len(identifiers) // 2]
    return earliest, middle, final

def view(models_dir, video_length=2000):
    """Record and display rollouts of the earliest, middle and last saved models.

    Args:
        models_dir: Directory containing the saved ``<model_prefix>_model_*`` files.
        video_length: Number of steps recorded per video.
    """
    print("MODEL DIR", models_dir)
    identifiers = get_model_identifiers(models_dir)
    earliest, middle, final = find_key_identifiers(identifiers)

    stages = ["beginning", "middle", "end"]

    # Record videos at the beginning, middle, and end of training
    for stage, identifier in zip(stages, [earliest, middle, final]):
        model_path = os.path.join(models_dir, f'{model_prefix}_model_{identifier}')
        print("MODEL PATH", model_path)
        # The notebook trains and saves A2C models; DQN is never imported here.
        model = A2C.load(model_path)
        record_video(env_name, model, video_length=video_length, prefix=f'{model_prefix}-{env_prefix}-{stage}')

    # Display the videos from the folder record_video writes to by default
    for stage in stages:
        show_videos(videos_dir, prefix=f'{model_prefix}-{env_prefix}-{stage}')



### Recording videos

In [12]:
# record video using background display
# display = Display("Xvfb :1 -screen 0 1024x768x24")
# display.start()

The XKEYBOARD keymap compiler (xkbcomp) reports:
> Internal error:   Could not resolve keysym XF86AudioPreset
> Internal error:   Could not resolve keysym XF86MonBrightnessCycle
> Internal error:   Could not resolve keysym XF86WWAN
> Internal error:   Could not resolve keysym XF86RFKill
> Internal error:   Could not resolve keysym XF86Keyboard
> Internal error:   Could not resolve keysym XF86RotationLockToggle
> Internal error:   Could not resolve keysym XF86FullScreen
Errors from xkbcomp are not fatal to the X server


In [14]:

# view(models_dir, video_length=3000)

MODEL DIR ./output/cartpole/models
MODEL PATH ./output/cartpole/models/dqn_model_1200000.zip
Saving video to /root/RL/A2/output/cartpole/videos/dqn-cartpole-beginning-step-0-to-step-3000.mp4
Moviepy - Building video /root/RL/A2/output/cartpole/videos/dqn-cartpole-beginning-step-0-to-step-3000.mp4.
Moviepy - Writing video /root/RL/A2/output/cartpole/videos/dqn-cartpole-beginning-step-0-to-step-3000.mp4



The XKEYBOARD keymap compiler (xkbcomp) reports:                 
> Internal error:   Could not resolve keysym XF86AudioPreset
> Internal error:   Could not resolve keysym XF86MonBrightnessCycle
> Internal error:   Could not resolve keysym XF86WWAN
> Internal error:   Could not resolve keysym XF86RFKill
> Internal error:   Could not resolve keysym XF86Keyboard
> Internal error:   Could not resolve keysym XF86RotationLockToggle
> Internal error:   Could not resolve keysym XF86FullScreen
Errors from xkbcomp are not fatal to the X server


Moviepy - Done !
Moviepy - video ready /root/RL/A2/output/cartpole/videos/dqn-cartpole-beginning-step-0-to-step-3000.mp4
MODEL PATH ./output/cartpole/models/dqn_model_3000000.zip
Saving video to /root/RL/A2/output/cartpole/videos/dqn-cartpole-middle-step-0-to-step-3000.mp4
Moviepy - Building video /root/RL/A2/output/cartpole/videos/dqn-cartpole-middle-step-0-to-step-3000.mp4.
Moviepy - Writing video /root/RL/A2/output/cartpole/videos/dqn-cartpole-middle-step-0-to-step-3000.mp4



The XKEYBOARD keymap compiler (xkbcomp) reports:                 
> Internal error:   Could not resolve keysym XF86AudioPreset
> Internal error:   Could not resolve keysym XF86MonBrightnessCycle
> Internal error:   Could not resolve keysym XF86WWAN
> Internal error:   Could not resolve keysym XF86RFKill
> Internal error:   Could not resolve keysym XF86Keyboard
> Internal error:   Could not resolve keysym XF86RotationLockToggle
> Internal error:   Could not resolve keysym XF86FullScreen
Errors from xkbcomp are not fatal to the X server


Moviepy - Done !
Moviepy - video ready /root/RL/A2/output/cartpole/videos/dqn-cartpole-middle-step-0-to-step-3000.mp4
MODEL PATH ./output/cartpole/models/dqn_model_final.zip
Saving video to /root/RL/A2/output/cartpole/videos/dqn-cartpole-end-step-0-to-step-3000.mp4
Moviepy - Building video /root/RL/A2/output/cartpole/videos/dqn-cartpole-end-step-0-to-step-3000.mp4.
Moviepy - Writing video /root/RL/A2/output/cartpole/videos/dqn-cartpole-end-step-0-to-step-3000.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /root/RL/A2/output/cartpole/videos/dqn-cartpole-end-step-0-to-step-3000.mp4


The XKEYBOARD keymap compiler (xkbcomp) reports:
> Internal error:   Could not resolve keysym XF86AudioPreset
> Internal error:   Could not resolve keysym XF86MonBrightnessCycle
> Internal error:   Could not resolve keysym XF86WWAN
> Internal error:   Could not resolve keysym XF86RFKill
> Internal error:   Could not resolve keysym XF86Keyboard
> Internal error:   Could not resolve keysym XF86RotationLockToggle
> Internal error:   Could not resolve keysym XF86FullScreen
Errors from xkbcomp are not fatal to the X server


In [None]:
# display.terminate()