# Trains CartPole agent with REINFORCE

**Please expand the cells to view the code!**

### Description
This notebook trains a policy network on the CartPole-v1 environment using the REINFORCE policy-gradient algorithm.

## How to Run:
1. **Setup**: Ensure all dependencies are installed, including PyTorch, Gym, and additional libraries for data handling and visualization.
2. **Hyperparameters Setup**: Configure the learning rate and other parameters at the start of the script.
3. **Execution**: Run the script to train the model. Training progress can be monitored through the console output, which includes each episode's reward and the total number of timesteps so far. A model checkpoint is saved every 100 episodes.

### References
Kang, C., 2021. REINFORCE on CartPole-v0 [Online]. Chan`s Jupyter. Available from: https://goodboychan.github.io/python/reinforcement_learning/pytorch/udacity/2021/05/12/REINFORCE-CartPole.html [Accessed 8 May 2024].

Paszke, A., n.d. Reinforcement Learning (DQN) Tutorial — PyTorch Tutorials 1.8.0 documentation [Online]. pytorch.org. Available from: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html [Accessed 8 May 2024].

Perkins, H., 2022a. youtube-rl-demos/vizdoom at vizdoom13 · hughperkins/youtube-rl-demos [Online]. GitHub. Available from: https://github.com/hughperkins/youtube-rl-demos/blob/vizdoom13/vizdoom/ [Accessed 8 May 2024].

Perkins, H., 2022b. youtube-rl-demos/vizdoom/vizdoom_011.py at vizdoom13 · hughperkins/youtube-rl-demos [Online]. GitHub. Available from: https://github.com/hughperkins/youtube-rl-demos/blob/vizdoom13/vizdoom/vizdoom_011.py [Accessed 8 May 2024].

In [None]:
import numpy as np
import gym

import torch
import json
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim, distributions
import cv2

import random                 # Handling random number generation
import time                   # Handling time calculation
from skimage import transform # Help us to preprocess the frames

from collections import deque # Ordered collection with ends
import matplotlib.pyplot as plt  # Display graphs

import warnings                  # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore')
import os


### MODEL HYPERPARAMETERS
learning_rate = 0.001    # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 100000  # Total episodes for training

# Gradient-norm clipping is currently disabled.
# clip_norm = 0.1

# Checkpoints and the training log are both keyed by the learning rate.
model_folder = f"./models/reinforce/lr/LR{learning_rate}"
# exist_ok=True: re-running with the same learning rate must not raise
# FileExistsError just because the folder is already there.
os.makedirs(model_folder, exist_ok=True)

log_path = f"./logs/reinforce/lr/LR{learning_rate}.txt"
# open(log_path, 'w') does not create missing parent directories, so make
# sure the log directory exists before the file is opened.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

env = gym.make('CartPole-v1')

class Net(nn.Module):
    """Small fully connected network: 4 inputs -> 32 hidden units -> 2 outputs."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(in_features=4, out_features=32)
        self.layer2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, state):
        """Return the raw (un-normalised) output of the second layer.

        Works on a single state or on a batch of states; the result has one
        row of two scores per input row. No softmax is applied here.
        (The original comment labels the two outputs as left/right.)
        """
        hidden = torch.relu(self.layer1(state))
        return self.layer2(hidden)
    
# Build the network and an Adam optimiser over all of its parameters.
model = Net()
opt = optim.Adam(model.parameters(), lr=learning_rate)




In [None]:
# NOTE: same architecture as the Net defined in the previous cell;
# presumably repeated so this cell can be run on its own.
class Net(nn.Module):
    """Small fully connected network: 4 inputs -> 32 hidden units -> 2 outputs."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(in_features=4, out_features=32)
        self.layer2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, state):
        """Return the raw (un-normalised) output of the second layer.

        Works on a single state or on a batch of states; the result has one
        row of two scores per input row. No softmax is applied here.
        (The original comment labels the two outputs as left/right.)
        """
        hidden = torch.relu(self.layer1(state))
        return self.layer2(hidden)
    
# REINFORCE training loop.
#
# For each episode: roll out the current policy until the environment ends
# the episode, then take one gradient step on
#     loss = -sum_t log pi(a_t | s_t) * R
# where R is the undiscounted total reward of the episode (every step is
# weighted by the same R; no return-to-go, discounting or baseline is used).
# One JSON line per episode is appended to the log file, and the model is
# checkpointed every 100 episodes. Training stops after `total_episodes`
# episodes or once more than 200000 environment steps have been taken.

# Fresh policy network and optimizer for this training run.
model = Net()
opt = optim.Adam(params=model.parameters(), lr=learning_rate)

total_steps = 0  # environment steps taken across all episodes

# The context manager guarantees the log file is closed even if training
# is interrupted or raises.
with open(log_path, 'w') as out_f:
    for episode in range(total_episodes):

        action_log_probs = []  # log pi(a_t | s_t) for every step of this episode
        episode_reward = 0

        state, _ = env.reset()

        done = False
        while not done:
            # Sample an action from the policy for the current state.
            tensor_state = torch.from_numpy(state).float().unsqueeze(0)
            action_logits = model(tensor_state)
            # Building the distribution from logits avoids a separate
            # softmax followed by a log inside log_prob.
            m = distributions.Categorical(logits=action_logits)
            action = m.sample()
            action_log_probs.append(m.log_prob(action))

            # Apply the sampled action to the environment.
            state, reward, terminated, truncated, _ = env.step(action.item())
            total_steps += 1
            episode_reward += reward

            # The episode is over when the environment terminates OR when
            # it is truncated (e.g. by a time limit). Ignoring `truncated`
            # would keep stepping an episode the environment considers over.
            done = terminated or truncated

        # Policy-gradient loss: each step's log-probability weighted by the
        # total episode reward, negated for gradient descent.
        policy_loss = -(torch.cat(action_log_probs) * episode_reward).sum()

        opt.zero_grad()
        policy_loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        opt.step()

        print(f"Episode {episode}     |      Reward: {episode_reward}     |      Total timesteps: {total_steps}")

        out_f.write(json.dumps({
            'episode': episode,
            'reward': episode_reward,
            'total_steps': total_steps
            }) + '\n')

        # Flush after every episode so the log can be followed during training.
        out_f.flush()

        if episode % 100 == 0:
            # Saves the whole module object (not just its state_dict).
            torch.save(model, f'{model_folder}/E_{episode}.pt')
            print('====== Model saved ======')

        # Global step budget for the whole run.
        if total_steps > 200000:
            break

env.close()

