<a href="https://colab.research.google.com/github/diegomrodrigues/deep_rl/blob/main/Cart_Pole_e_Greedy_Strategy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m61.4/101.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [2]:
# Authenticate with the Hugging Face Hub via the interactive login widget.
# NOTE(review): presumably required so the HFSummaryWriter used in the training
# cell can push logs to the Hub repo -- confirm. This step needs manual input,
# so the notebook is not fully reproducible from code alone.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
from huggingface_hub import HFSummaryWriter

# Configure the GPU *before* anything initializes the TensorFlow runtime:
# device visibility and memory growth can only be changed while the devices are
# still uninitialized, and tf.test.gpu_device_name() (used below for reporting)
# initializes them.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Use only the first GPU and let its memory allocation grow on demand.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # Raised when the runtime was already initialized with different
        # settings (e.g. the cell is re-run in a live kernel).
        print(e)

# Report what TensorFlow can see. The device name is queried once and reused.
print("Num GPUs Available: ", len(gpus))
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    print("Please install GPU version of TF")

# Strategy Interface
class Strategy:
    """Base class for action-selection strategies.

    A concrete strategy supplies two things: how to pick an action
    (``select_action``) and how the exploration rate evolves
    (``update_epsilon``). Both methods raise ``NotImplementedError`` here.
    """

    # Single source for the error text raised by the abstract methods.
    _ABSTRACT_MSG = "This method needs to be implemented by subclasses."

    def __init__(self, action_space):
        # Stored as given; subclasses decide how to interpret it.
        self.action_space = action_space

    def select_action(self, model, state, epsilon):
        """Return the action to take in ``state``; must be overridden."""
        raise NotImplementedError(self._ABSTRACT_MSG)

    def update_epsilon(self, epsilon):
        """Return the next exploration rate; must be overridden."""
        raise NotImplementedError(self._ABSTRACT_MSG)

# Epsilon-Greedy Strategy
class EpsilonGreedyStrategy(Strategy):
    """Epsilon-greedy action selection with multiplicative epsilon decay.

    Args:
        action_space: Number of discrete actions (passed to
            ``np.random.randint`` when exploring).
        epsilon_min: Floor below which ``update_epsilon`` never decays.
        epsilon_decay: Multiplicative factor applied on each
            ``update_epsilon`` call while epsilon is above the floor.
    """

    def __init__(self, action_space, epsilon_min=0.1, epsilon_decay=0.995):
        super().__init__(action_space)
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    def select_action(self, model, state, epsilon):
        """Pick a random action with probability ``epsilon``, else the greedy one.

        Args:
            model: Object exposing ``predict(batch, verbose=0)`` that returns
                the Q-values for a batch of states.
            state: A single (unbatched) state; lists/tuples are accepted.
            epsilon: Current exploration probability.

        Returns:
            The selected action index as a plain ``int``.
        """
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_space)
        # Add a leading batch axis because the model predicts on batches.
        qs = model.predict(np.asarray(state)[np.newaxis], verbose=0)
        return int(np.argmax(qs))

    def update_epsilon(self, epsilon):
        """Return the decayed epsilon, never dropping below ``epsilon_min``.

        A value that is already at or below the floor is returned unchanged.
        """
        if epsilon <= self.epsilon_min:
            return epsilon
        # Clamp so the final decay step cannot undershoot the floor.
        return max(self.epsilon_min, epsilon * self.epsilon_decay)

# Agent definition
class Agent:
    """Online one-step Q-learning agent backed by a small dense Keras network.

    The agent fits its network on a single transition after every step (no
    replay buffer, no target network) and delegates action selection and the
    exploration schedule to ``strategy``.

    Args:
        action_space: Number of discrete actions (size of the output layer).
        strategy: Object providing ``select_action(model, state, epsilon)``
            and ``update_epsilon(epsilon)``.
        gamma: Discount factor applied to the bootstrapped next-state value.
        state_size: Length of the state vector fed to the network.
        hidden_units: Width of the single hidden layer.
    """

    def __init__(self, action_space, strategy, gamma=0.99, state_size=4, hidden_units=256):
        self.action_space = action_space
        self.strategy = strategy
        self.gamma = gamma
        # These must be set before create_model(), which reads them.
        self.state_size = state_size
        self.hidden_units = hidden_units
        self.model = self.create_model()
        self.epsilon = 1.0  # Initial exploration rate

    @staticmethod
    def _as_batch(state):
        """Return ``state`` as an array with a leading batch axis of size 1."""
        return np.asarray(state)[np.newaxis]

    def create_model(self):
        """Build and compile the Q-network (state vector -> one Q-value per action)."""
        # The hidden Dense layer already outputs (batch, units), so no Flatten
        # layer is needed between it and the output layer.
        model = Sequential([
            Dense(self.hidden_units, activation='relu', input_shape=(self.state_size,)),
            Dense(self.action_space, activation='linear')
        ])
        model.compile(optimizer=Adam(), loss='mse')
        return model

    def act(self, state):
        """Choose an action for ``state`` using the strategy and current epsilon."""
        return self.strategy.select_action(self.model, state, self.epsilon)

    def update(self, state, action, reward, next_state, done):
        """Fit the network on one transition and advance the epsilon schedule.

        The regression target for ``action`` is ``reward`` plus, when the
        episode is not done, ``gamma * max_a Q(next_state, a)``. The targets of
        all other actions are the network's own current predictions, so only
        the taken action contributes to the loss.

        Returns:
            The loss reported by the single ``fit`` epoch.
        """
        state_batch = self._as_batch(state)
        target = reward
        if not done:
            next_q = self.model.predict(self._as_batch(next_state), verbose=0)
            target += self.gamma * np.max(next_q)
        target_f = self.model.predict(state_batch, verbose=0)
        target_f[0][action] = target
        history = self.model.fit(state_batch, target_f, epochs=1, verbose=0)
        # Epsilon is updated once per training step, not once per episode.
        self.epsilon = self.strategy.update_epsilon(self.epsilon)
        return history.history['loss'][0]

# Environment definition
class Environment:
    """Thin wrapper around a gym environment with a stable legacy-style API.

    Regardless of the installed gym/gymnasium version, ``reset`` returns only
    the observation and ``step`` returns ``(next_state, reward, done, info)``.
    """

    def __init__(self, env_name):
        self.env = gym.make(env_name)

    def reset(self):
        """Reset the wrapped environment and return the initial observation."""
        result = self.env.reset()
        # Newer APIs return (observation, info); older ones return the
        # observation alone.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def step(self, action):
        """Advance one step and return ``(next_state, reward, done, info)``.

        When the wrapped environment reports ``terminated`` and ``truncated``
        separately (5-tuple API), they are merged into a single ``done`` flag.
        """
        result = self.env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, info = result
            return next_state, reward, bool(terminated or truncated), info
        next_state, reward, done, info = result
        return next_state, reward, done, info

def simple_optimal_action(state):
    """Return the heuristic reference action (0 or 1) for a 4-element state.

    Only the last two state components (``theta`` and ``theta_dot``) are
    used; the first two are ignored. The sign of ``theta_dot`` decides the
    action, and ``theta`` acts purely as a tie-breaker when ``theta_dot``
    is exactly zero.
    """
    _, _, theta, theta_dot = state
    if theta_dot > 0:
        return 1
    if theta_dot < 0:
        return 0
    # theta_dot is neither positive nor negative: fall back to the angle.
    return 0 if theta > 0 else 1

def _run_episode(env, agent):
    """Play one episode, training the agent after every step.

    Returns:
        Tuple ``(total_reward, total_loss, steps, optimal_actions_count)``,
        where ``optimal_actions_count`` is the number of steps on which the
        agent's action matched ``simple_optimal_action``.
    """
    state = env.reset()
    done = False
    total_reward = 0
    total_loss = 0
    steps = 0
    optimal_actions_count = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Compare against the heuristic reference policy for the same state.
        if action == simple_optimal_action(state):
            optimal_actions_count += 1

        total_loss += agent.update(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        steps += 1

    return total_reward, total_loss, steps, optimal_actions_count


# Main function to execute the training
def main(episodes=1000, repo_id="diegomrodrigues/cartpole_rl"):
    """Train the epsilon-greedy agent on CartPole-v1 and log per-episode metrics.

    Args:
        episodes: Number of training episodes to run.
        repo_id: Hugging Face Hub repository the summary writer logs to.
    """
    writer = HFSummaryWriter(repo_id=repo_id, commit_every=2)
    try:
        env = Environment('CartPole-v1')
        n_actions = env.env.action_space.n
        strategy = EpsilonGreedyStrategy(n_actions)
        agent = Agent(n_actions, strategy)

        for episode in tqdm(range(episodes), desc="Training Progress"):
            total_reward, total_loss, steps, optimal_actions_count = _run_episode(env, agent)

            average_reward = total_reward / steps if steps > 0 else 0
            percent_optimal_action = (optimal_actions_count / steps) * 100 if steps > 0 else 0

            writer.add_scalar('Total Reward', total_reward, episode)
            writer.add_scalar('Loss', total_loss, episode)
            writer.add_scalar('Epsilon', agent.epsilon, episode)
            writer.add_scalar('Average Reward', average_reward, episode)
            writer.add_scalar('Percentage Optimal Action', percent_optimal_action, episode)

            if episode % 10 == 0:
                tqdm.write(f"Episode: {episode + 1}, Total Loss: {total_loss}, Total Reward: {total_reward}, Average Reward: {average_reward}, Percent Optimal Actions: {percent_optimal_action}, Epsilon: {agent.epsilon}")
    finally:
        # Always close the writer, even when training fails or is interrupted
        # (e.g. by KeyboardInterrupt in the notebook).
        writer.close()

# Entry point: start training when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Num GPUs Available:  1
Default GPU Device: /device:GPU:0


Training Progress:   0%|          | 1/1000 [00:05<1:28:14,  5.30s/it]

Episode: 1, Total Loss: 11.653076231479645, Total Reward: 22.0, Average Reward: 1.0, Percent Optimal Actions: 63.63636363636363, Epsilon: 0.8955869907338783


Training Progress:   1%|          | 11/1000 [00:56<1:05:00,  3.94s/it]

Episode: 11, Total Loss: 48.625279664993286, Total Reward: 16.0, Average Reward: 1.0, Percent Optimal Actions: 43.75, Epsilon: 0.4982051627146237


Training Progress:   2%|▏         | 21/1000 [01:32<53:32,  3.28s/it]

Episode: 21, Total Loss: 78.5553617477417, Total Reward: 11.0, Average Reward: 1.0, Percent Optimal Actions: 18.181818181818183, Epsilon: 0.4982051627146237


Training Progress:   3%|▎         | 31/1000 [02:15<1:15:12,  4.66s/it]

Episode: 31, Total Loss: 60.268036454916, Total Reward: 12.0, Average Reward: 1.0, Percent Optimal Actions: 25.0, Epsilon: 0.4982051627146237


Training Progress:   4%|▍         | 41/1000 [02:53<57:47,  3.62s/it]

Episode: 41, Total Loss: 42.31730563938618, Total Reward: 14.0, Average Reward: 1.0, Percent Optimal Actions: 42.857142857142854, Epsilon: 0.4982051627146237


Training Progress:   5%|▌         | 51/1000 [03:27<59:22,  3.75s/it]  

Episode: 51, Total Loss: 48.10290256241569, Total Reward: 13.0, Average Reward: 1.0, Percent Optimal Actions: 38.46153846153847, Epsilon: 0.4982051627146237


Training Progress:   6%|▌         | 61/1000 [04:18<1:03:52,  4.08s/it]

Episode: 61, Total Loss: 130.6578200161457, Total Reward: 15.0, Average Reward: 1.0, Percent Optimal Actions: 46.666666666666664, Epsilon: 0.4982051627146237


Training Progress:   7%|▋         | 71/1000 [04:58<1:26:21,  5.58s/it]

Episode: 71, Total Loss: 112.9042411223054, Total Reward: 38.0, Average Reward: 1.0, Percent Optimal Actions: 68.42105263157895, Epsilon: 0.4982051627146237


Training Progress:   8%|▊         | 81/1000 [05:57<57:56,  3.78s/it]

Episode: 81, Total Loss: 78.86002748087049, Total Reward: 11.0, Average Reward: 1.0, Percent Optimal Actions: 9.090909090909092, Epsilon: 0.4982051627146237


Training Progress:   9%|▉         | 91/1000 [06:29<48:25,  3.20s/it]

Episode: 91, Total Loss: 36.64923255564645, Total Reward: 12.0, Average Reward: 1.0, Percent Optimal Actions: 41.66666666666667, Epsilon: 0.4982051627146237


Training Progress:  10%|█         | 101/1000 [07:07<48:51,  3.26s/it]

Episode: 101, Total Loss: 104.75164231657982, Total Reward: 13.0, Average Reward: 1.0, Percent Optimal Actions: 38.46153846153847, Epsilon: 0.4982051627146237


Training Progress:  11%|█         | 107/1000 [07:32<1:02:54,  4.23s/it]


KeyboardInterrupt: 