# Reinforcement Learning applied to Flappy Bird

## Install dependencies

In [None]:
%%capture
# NOTE: %%capture silences everything this cell prints, so a failed install is
# hidden as well; remove it temporarily when debugging the setup.
# System packages installed through apt (presumably OpenGL bindings, video
# encoding and the X virtual framebuffer backing the virtual display started
# in the next cell -- TODO confirm all three are still required).
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
# Python packages: the virtual-display wrapper, a pinned pyglet and the
# Flappy Bird Gymnasium environment used throughout this notebook.
!pip install pyvirtualdisplay
!pip install pyglet==1.5.1 flappy-bird-gymnasium

In [None]:
# Start a headless virtual display (nothing is shown on screen: visible=0).
from pyvirtualdisplay import Display

display_options = {"visible": 0, "size": (1400, 900)}
virtual_display = Display(**display_options)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7dd42d659d50>

In [None]:
# Install the requirements file published for unit 4 of the Hugging Face
# deep-rl-class. The URL tracks that repository's `main` branch, so the set of
# installed packages may change over time.
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt

Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 1))
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-3fkbmn9h
  Running command git clone --filter=blob:none --quiet https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-3fkbmn9h
  Resolved https://github.com/ntasfi/PyGame-Learning-Environment.git to commit 3dbe79dc0c35559bb441b9359948aabf9bb3d331
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/simoninithomas/gym-games (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 2))
  Cloning https://github.com/simoninithomas/gym-games to /tmp/pip-req-build-zhiw3w93
  Running command git clone --filter=blob:none --quiet https://github.com/simoninithomas/gym-games /tmp/pip-req-build-zhiw3w

## Setup Environment

In [None]:
import numpy as np

from collections import deque

%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym

# Hugging Face Hub

In [None]:
import gymnasium

# Build the Flappy Bird environment with the lidar observation switched off.
env_options = {"use_lidar": False}
env = gymnasium.make("FlappyBird-v0", **env_options)

In [None]:
# Read the problem dimensions from the environment's observation / action spaces.
obs_space, act_space = env.observation_space, env.action_space
s_size, a_size = obs_space.shape[0], act_space.n

# Report the observation space together with one randomly drawn observation.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
random_observation = obs_space.sample()
print("Sample observation", random_observation)

# Report the action space together with one randomly drawn action.
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
random_action = act_space.sample()
print("Action Space Sample", random_action)

_____OBSERVATION SPACE_____ 

The State Space is:  12
Sample observation [ 0.30045418 -0.48864902 -0.68948919  0.75645834 -0.19624315  0.9058895
 -0.21913452 -0.56275998 -0.43122534  0.79568474 -0.72862018 -0.22510945]

 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


**Features** — the 12 values that make up each observation when `use_lidar=False`:
- the last pipe's horizontal position
- the last top pipe's vertical position
- the last bottom pipe's vertical position
- the next pipe's horizontal position
- the next top pipe's vertical position
- the next bottom pipe's vertical position
- the next next pipe's horizontal position
- the next next top pipe's vertical position
- the next next bottom pipe's vertical position
- player's vertical position
- player's vertical velocity
- player's rotation

## Model Building

In [None]:
# Device used by the notebook when instantiating models.
device = "cuda" if torch.cuda.is_available() else "cpu"


class Policy(nn.Module):
    """Stochastic policy network: state vector -> probabilities over discrete actions.

    The network is a three-layer MLP (``s_size -> h_size -> 2*h_size -> a_size``)
    with ReLU activations on the two hidden layers and a softmax on the output.

    Args:
        s_size: Number of features in a state vector.
        a_size: Number of discrete actions.
        h_size: Width of the first hidden layer (the second is twice as wide).
    """

    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)

    def forward(self, x):
        """Return action probabilities for ``x``.

        ``x`` may be a single state of shape ``(s_size,)`` or a batch of shape
        ``(batch, s_size)``; the softmax is always taken over the last (action)
        dimension, so both layouts are supported.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # dim=-1 equals dim=1 for batched input and also works for 1-D input.
        return F.softmax(x, dim=-1)

    def act(self, state):
        """Sample an action for a single ``state``.

        Args:
            state: One state as a NumPy array (or any array-like accepted by
                ``torch.as_tensor``) of length ``s_size``.

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a CPU tensor of shape ``(1,)`` that is still attached
            to the autograd graph, so it can be used to build a policy loss.
        """
        # Follow the module's own device rather than the notebook-level global,
        # so the policy keeps working after being moved with ``.to(...)``.
        module_device = next(self.parameters()).device
        state = torch.as_tensor(
            state, dtype=torch.float32, device=module_device
        ).unsqueeze(0)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

  and should_run_async(code)


**Play around with Hyperparameters**

In [None]:
# Training configuration for the Flappy Bird agent; the state / action sizes
# come from the environment inspected earlier in the notebook.
flappybird_hyperparameters = dict(
    h_size=64,
    n_training_episodes=30000,
    n_evaluation_episodes=10,
    max_t=10000,
    gamma=0.99,
    lr=1e-4,
    env_id="FlappyBird-v0",
    state_space=s_size,
    action_space=a_size,
)
flappybird_hyperparameters

{'h_size': 64,
 'n_training_episodes': 30000,
 'n_evaluation_episodes': 10,
 'max_t': 10000,
 'gamma': 0.99,
 'lr': 0.0001,
 'env_id': 'FlappyBird-v0',
 'state_space': 12,
 'action_space': 2}

In [None]:
# Instantiate the policy network on the selected device ...
flappybird_policy = Policy(
    s_size=flappybird_hyperparameters["state_space"],
    a_size=flappybird_hyperparameters["action_space"],
    h_size=flappybird_hyperparameters["h_size"],
).to(device)

# ... and the Adam optimizer that updates its parameters.
flappybird_optimizer = optim.Adam(
    params=flappybird_policy.parameters(),
    lr=flappybird_hyperparameters["lr"],
)

## Training the model

In [None]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every,
              environment=None):
    """Train ``policy`` with REINFORCE (Monte-Carlo policy gradient).

    For every episode the policy is rolled out for at most ``max_t`` steps, the
    discounted return of each step is computed, the returns are standardised,
    and one gradient step is taken on ``sum(-log_prob_t * return_t)``.

    Args:
        policy: Object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a differentiable tensor of shape ``(1,)``.
        optimizer: Optimizer over the policy's parameters.
        n_training_episodes: Number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        gamma: Discount factor applied to future rewards.
        print_every: Print the running average score every this many episodes.
        environment: Gymnasium-style environment (``reset() -> (obs, info)``,
            ``step(a) -> (obs, reward, terminated, truncated, info)``). When
            omitted, the notebook-level ``env`` is used, as before.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    # Fall back to the notebook-level environment so existing calls keep working.
    if environment is None:
        environment = env

    eps = np.finfo(np.float32).eps.item()
    scores_deque = deque(maxlen=100)  # window for the printed running average
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state, _ = environment.reset()
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = environment.step(action)
            rewards.append(reward)
            # An episode is over when it terminates OR is truncated; ignoring
            # truncation would keep stepping an environment that needs a reset.
            if terminated or truncated:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # With max_t == 0 nothing was collected, so there is nothing to learn from.
        if saved_log_probs:
            # Discounted returns, built backwards: G_t = r_t + gamma * G_{t+1}.
            returns = deque()
            running_return = 0.0
            for step_reward in reversed(rewards):
                running_return = gamma * running_return + step_reward
                returns.appendleft(running_return)

            returns = torch.tensor(list(returns), dtype=torch.float32)
            # std() of a single element is NaN, which would poison the weights;
            # treat it as zero so a one-step episode yields a zero update.
            spread = returns.std() if returns.numel() > 1 else returns.new_zeros(())
            returns = (returns - returns.mean()) / (spread + eps)

            policy_loss = torch.cat(
                [-log_prob * disc_return
                 for log_prob, disc_return in zip(saved_log_probs, returns)]
            ).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [None]:
# Train the Flappy Bird policy, reporting the running average every 1000 episodes.
scores = reinforce(
    policy=flappybird_policy,
    optimizer=flappybird_optimizer,
    n_training_episodes=flappybird_hyperparameters["n_training_episodes"],
    max_t=flappybird_hyperparameters["max_t"],
    gamma=flappybird_hyperparameters["gamma"],
    print_every=1000,
)

Episode 1000	Average Score: -4.49
Episode 2000	Average Score: 3.96
Episode 3000	Average Score: 4.21
Episode 4000	Average Score: 4.18
Episode 5000	Average Score: 4.41
Episode 6000	Average Score: 4.92
Episode 7000	Average Score: 5.01
Episode 8000	Average Score: 5.26
Episode 9000	Average Score: 4.93
Episode 10000	Average Score: 5.46
Episode 11000	Average Score: 5.88
Episode 12000	Average Score: 6.31
Episode 13000	Average Score: 6.59
Episode 14000	Average Score: 7.48
Episode 15000	Average Score: 8.06
Episode 16000	Average Score: 8.62
Episode 17000	Average Score: 8.52
Episode 18000	Average Score: 8.18
Episode 19000	Average Score: 8.38
Episode 20000	Average Score: 8.15
Episode 21000	Average Score: 8.77
Episode 22000	Average Score: 8.98
Episode 23000	Average Score: 9.67
Episode 24000	Average Score: 9.06
Episode 25000	Average Score: 9.14
Episode 26000	Average Score: 9.58
Episode 27000	Average Score: 10.31
Episode 28000	Average Score: 9.59
Episode 29000	Average Score: 9.87
Episode 30000	Average

In [None]:

# Write the trained policy object to disk in the current working directory.
model_path = "model.pt"
torch.save(flappybird_policy, model_path)

# Exporting the hyperparameters is currently disabled:
# json.dump(flappybird_hyperparameters, open("hyperparameters.json", 'w'))