<a href="https://colab.research.google.com/github/ezzeddinegasmi/DRL_comparative_study/blob/main/Copie_de_Dim_soir_6_Avr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Configuration for Colab

In [38]:
import sys

# True when the ``google.colab`` package has already been imported into this
# interpreter; used below as the "running on Colab" signal.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # IPython shell escape (only valid inside a notebook): install the pinned
    # gymnasium release.
    # NOTE(review): later cells import ``gym`` as well as ``gymnasium`` -
    # confirm which package the environment code is meant to use.
    !pip install gymnasium==1.0.0



## import module

In [39]:
import random
from typing import List, Tuple

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from torch.distributions import Normal

In [40]:
from cpu_thread import cpu_thread
from gpu_thread import gpu_thread
import os
import argparse
import torch.multiprocessing as mp
mp.set_sharing_strategy('file_system')


def str2bool(v):
    """Convert a command-line token into a ``bool`` (used as an argparse ``type``).

    Real ``bool`` values are returned untouched. Any other value is lower-cased
    and matched against a fixed list of affirmative and negative spellings.

    Raises:
        argparse.ArgumentTypeError: if the value matches neither list.
    """
    if isinstance(v, bool):
        return v
    affirmative = ('yes', 'true', 't', 'y', '1')
    negative = ('no', 'false', 'f', 'n', '0')
    token = v.lower()
    if token in affirmative:
        return True
    if token in negative:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


# Command-line interface of the training script. Every flag takes an explicit
# boolean-like value that is parsed by str2bool (e.g. ``--render False``).
parser = argparse.ArgumentParser(description='PPO training')
parser.add_argument(
    '--load',
    dest='load',
    type=str2bool,
    default=False,
    help=('Whether or not to load pretrained weights. '
          'You must have started an alread trained net for it to work'),
)
parser.add_argument(
    '--render',
    dest='render',
    type=str2bool,
    default=True,
    help='Show the game running in a separate process. This slows the training a bit',
)
parser.add_argument(
    '--gif',
    dest='gif',
    type=str2bool,
    default=False,
    help='Create a gif of a game every once in a while',
)


def main(args):
    """Launch the training: one network process plus game-playing processes.

    A manager provides the three objects shared by all processes: a dict used
    to send network outputs back to the players, a queue of finished games
    waiting to be trained on, and a queue of observations waiting for an
    action. The first pool task runs ``gpu_thread``; when ``args.render`` is
    set the next one runs a rendering ``cpu_thread``; every remaining pair of
    cores gets a game-generating ``cpu_thread``.

    Args:
        args: parsed command-line namespace with boolean ``load``, ``render``
            and ``gif`` attributes.

    Raises:
        AssertionError: if ``gif`` is requested without ``render``.
        SystemExit: if the user declines to overwrite an existing checkpoint.
    """
    import sys

    assert args.render or not args.gif, 'If you want to display a gif, you must set render to true'
    if args.load is False and os.path.isfile('./model/breakout.pt'):
        # Training from scratch overwrites the saved checkpoint: ask first and
        # keep asking until the answer is recognised.
        while True:
            load = input('Are you sure you want to erase the previous training? (y/n) ')
            if load.lower() in ('y', 'yes', '1'):
                break
            elif load.lower() in ('n', 'no', '0'):
                sys.exit()

    # create shared variables between all the processes
    manager = mp.Manager()
    # used to send the results of the net
    common_dict = manager.dict()
    # a queue of batches to be fed to the training net
    mem_queue = manager.Queue(1500 * mp.cpu_count())
    # a queue of operations pending
    process_queue = manager.Queue(mp.cpu_count()-1)

    def report_failure(exc):
        # The AsyncResult objects are never read, so without this callback an
        # exception escaping a worker function would vanish silently.
        print('worker failed: {!r}'.format(exc), flush=True)

    with mp.Pool() as pool:
        # NOTE: _processes is a private Pool attribute holding the pool size.
        workers: int = pool._processes

        def core_pair(first):
            # Keep only core indices below the pool size (assumed to equal the
            # number of usable cores - TODO confirm for restricted containers);
            # fall back to every core when the requested pair does not exist.
            pair = [core for core in (first, first + 1) if core < workers]
            return pair or list(range(workers))

        try:
            print(f"Running pool with {workers//2} workers")
            pool.apply_async(
                gpu_thread,
                (args.load, mem_queue, process_queue, common_dict, core_pair(0)),
                error_callback=report_failure,
            )
            if args.render:
                pool.apply_async(
                    cpu_thread,
                    (2 if args.gif else 1, mem_queue, process_queue, common_dict, core_pair(2)),
                    error_callback=report_failure,
                )
            # Cores 0-1 belong to the network process and, when rendering,
            # cores 2-3 to the renderer; the rest play games.
            first_free = 4 if args.render else 2
            for i in range(first_free, workers, 2):
                pool.apply_async(
                    cpu_thread,
                    (0, mem_queue, process_queue, common_dict, core_pair(i)),
                    error_callback=report_failure,
                )

            # Wait for children to finish
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            # The workers ignore SIGINT and loop forever, so a plain join()
            # would never return: stop them explicitly before waiting.
            pool.terminate()
            pool.join()


if __name__ == "__main__":
    # Read the --load / --render / --gif flags from the command line.
    args = parser.parse_args()
    # Select the 'spawn' start method before main() creates any process.
    mp.set_start_method('spawn')
    main(args)

ModuleNotFoundError: No module named 'cpu_thread'

In [None]:
import os
import gym
import time
import random
import matplotlib.pyplot as plt
from matplotlib import animation
from parameters import parameters


def sample(action):
    """Draw one index at random according to a discrete probability vector.

    A uniform number in [0, 1) is drawn and the first index whose cumulative
    probability exceeds it is returned.

    Args:
        action: sequence of per-action probabilities, expected to sum to ~1.

    Returns:
        ``(index, probability)`` of the selected entry.

    Raises:
        ValueError: if ``action`` is empty.
    """
    tresh = random.random()
    p = 0.
    fallback = None
    for i, prob in enumerate(action):
        p += prob
        if p > tresh:
            return i, prob
        # Remember the last entry that carries probability mass (or the first
        # entry when none does) in case the cumulative sum never passes the draw.
        if fallback is None or prob > 0:
            fallback = (i, prob)
    if fallback is None:
        raise ValueError('cannot sample from an empty probability vector')
    # Rounding can leave the total slightly below the draw; returning an entry
    # here avoids handing ``None`` to callers that unpack the result.
    return fallback


def process_frame(frame):
    """Divide a frame by 255 and move its last axis to the front.

    Assumes ``frame`` is a 3-axis array with the channel axis last (TODO
    confirm against the environment); the result has axes ordered (2, 0, 1).
    """
    scaled = frame / 255
    return scaled.transpose((2, 0, 1))


def process_reward(reward):
    """Scale a raw reward down by a factor of one hundred."""
    scale = 0.01
    return scale * reward


def generate_game(env, pid, process_queue, common_dict):
    """Play one full episode and return its training data.

    For every step the processed frame is pushed on ``process_queue`` tagged
    with ``pid``; the function then polls ``common_dict`` until an entry for
    ``pid`` appears, removes it and samples an action from it. A step on which
    ``info['ale.lives']`` drops below the tracked count has its reward
    replaced by -1. Once the episode ends the (scaled) score is printed and
    the rewards are turned in place into discounted returns.

    Returns:
        ``(observation_list, reward_list, action_list, prob_list)``: the
        processed frames, the discounted returns, the chosen action indices
        and the probability each chosen action had when it was sampled.
    """
    frames, returns, chosen_actions, chosen_probs = [], [], [], []
    lives_left = 5
    raw_frame = env.reset()
    finished = False
    while not finished:
        frame = process_frame(raw_frame)
        frames.append(frame)
        # Ask the network process for a policy and wait for its answer.
        process_queue.put((pid, frame))
        while pid not in common_dict:
            time.sleep(0.0001)
        policy = common_dict[pid]
        del common_dict[pid]
        action, prob = sample(policy)
        raw_frame, reward, finished, info = env.step(action)
        if info['ale.lives'] < lives_left:
            # Losing a life overrides whatever reward the step produced.
            lives_left = info['ale.lives']
            reward = -1
        chosen_actions.append(action)
        chosen_probs.append(prob)
        returns.append(process_reward(reward))
    print("Score: {:4.0f}".format(100*sum(returns)))
    # Walk backwards so each step accumulates the discounted reward that follows it.
    for step in reversed(range(len(returns) - 1)):
        returns[step] += returns[step + 1] * parameters.GAMMA
    return frames, returns, chosen_actions, chosen_probs


def play(env, pid, process_queue, common_dict):
    """Play and render episodes forever using the current network outputs.

    Each step sends the processed frame to ``process_queue`` tagged with
    ``pid``, waits for the matching entry in ``common_dict``, samples an action
    from it and renders the environment. An episode is abandoned after 1999
    steps and a new one is started. This function never returns.
    """
    step_cap = 1999
    while True:
        frame = env.reset()
        finished = False
        steps_played = 0
        while not finished and steps_played < step_cap:
            steps_played += 1
            process_queue.put((pid, process_frame(frame)))
            while pid not in common_dict:
                time.sleep(0.0001)
            policy = common_dict[pid]
            del common_dict[pid]
            action, _ = sample(policy)
            frame, _, finished, _ = env.step(action)
            env.render()


def play_to_gif(env, pid, process_queue, common_dict):
    """Play and render episodes forever, saving some of them as gifs.

    Waits until ``common_dict`` holds an ``'epoch'`` entry. Before each
    episode, if that epoch is a multiple of 25 the episode is recorded: its
    frames are collected with ``env.render(mode='rgb_array')`` and written by
    ``display_frames_as_gif`` as ``'Episode <epoch>.gif'``. Other episodes are
    only rendered. This function never returns.
    """
    while 'epoch' not in common_dict:
        time.sleep(0.001)
    recording = False
    recorded_epoch = 0
    while True:
        if not recording and common_dict['epoch'] % 25 == 0:
            recording = True
            recorded_epoch = common_dict['epoch']
        frame = env.reset()
        captured = []
        finished = False
        while not finished:
            process_queue.put((pid, process_frame(frame)))
            while pid not in common_dict:
                time.sleep(0.0001)
            policy = common_dict[pid]
            del common_dict[pid]
            action, _ = sample(policy)
            frame, _, finished, _ = env.step(action)
            if recording:
                captured.append(env.render(mode='rgb_array'))
            else:
                env.render()
        if recording:
            display_frames_as_gif(captured, 'Episode {}.gif'.format(recorded_epoch))
            recording = False


def display_frames_as_gif(frames, name):
    """Save a list of frames as an animated gif in the ``gifs/`` directory.

    The figure is sized so that one frame pixel maps to one figure pixel at
    72 dpi. The animation is first saved with matplotlib's default writer; if
    that fails it is saved again with ``PillowWriter`` at 40 fps.

    Args:
        frames: sequence of image arrays, all drawn with ``imshow``.
        name: file name of the gif, relative to ``gifs/``.

    Nothing is written when ``frames`` is empty.
    """
    if len(frames) == 0:
        return
    # The output directory is not created anywhere else, so make sure it exists.
    os.makedirs('gifs', exist_ok=True)
    target = 'gifs/' + name

    figure = plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
    try:
        patch = plt.imshow(frames[0])
        plt.axis('off')
        plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(figure, animate, frames=len(frames), interval=33)
        try:
            anim.save(target)
        except Exception:
            # Default writer unavailable or unable to produce a gif: retry with Pillow.
            anim.save(target, writer=animation.PillowWriter(fps=40))
    finally:
        # This runs repeatedly in a long-lived process; release the figure so
        # one is not leaked per saved gif.
        plt.close(figure)


def cpu_thread(render, memory_queue, process_queue, common_dict, workers):
    """Entry point of a game-playing worker process.

    Args:
        render: 1 to play with on-screen rendering, 2 to render and
            periodically save gifs, any other value to generate training
            games without rendering.
        memory_queue: queue receiving one
            ``(observations, rewards, actions, probs)`` tuple per finished game.
        process_queue: queue of ``(pid, observation)`` requests for the
            network process.
        common_dict: shared dict through which the network process answers.
        workers: list of CPU core indices this process should be pinned to.

    SIGINT is ignored in this process. Any exception is printed with its
    traceback instead of being propagated.
    """
    import signal
    import traceback
    import psutil

    signal.signal(signal.SIGINT, signal.SIG_IGN)
    env = None
    try:
        try:
            psutil.Process().cpu_affinity(workers)
        except (AttributeError, ValueError, psutil.Error) as e:
            # Pinning is an optimisation only: an unsupported platform or a
            # core index that does not exist must not stop the worker.
            print('could not pin process {} to core(s) {}: {}'.format(os.getpid(), workers, e), flush=True)
        env = gym.make('Breakout-v0')
        pid = os.getpid()
        print('process started with pid: {} on core(s) {}'.format(pid, workers), flush=True)
        if render == 1:
            play(env, pid, process_queue, common_dict)
        elif render == 2:
            play_to_gif(env, pid, process_queue, common_dict)
        else:
            while True:
                observation_list, reward_list, action_list, prob_list = generate_game(env, pid, process_queue, common_dict)
                memory_queue.put((observation_list, reward_list, action_list, prob_list))
    except Exception:
        # Print the full traceback: the bare message is rarely enough to debug
        # a failure that happened in a child process.
        print(traceback.format_exc(), flush=True)
    finally:
        if env is not None:
            env.close()

In [None]:
import os
import torch
from model import PPO
import torch.optim as optim
from parameters import parameters


def process_observations(observation, model):
    """Run ``model`` on ``observation`` without gradients and return a NumPy array.

    The module is invoked through ``model(...)`` rather than ``model.forward``
    so that any hooks registered on it are honoured. The output is moved to
    the CPU and stripped of its size-1 dimensions.

    Expects ``model`` to return a single tensor (for ``PPO`` that is the case
    in evaluation mode).
    """
    with torch.no_grad():
        action = model(observation)
    return action.cpu().squeeze().numpy()


def destack_process(model, process_queue, common_dict):
    """Answer the action requests currently waiting in ``process_queue``.

    When the queue is non-empty the model is switched to evaluation mode and
    as many requests as the queue reports are served: each ``(pid, obs)`` is
    turned into a one-element batch on the configured device and the model's
    output is stored in ``common_dict`` under ``pid``.
    """
    if process_queue.qsize() <= 0:
        return
    model.eval()
    # A bounded ``for`` (not ``while``) so requests that keep arriving cannot
    # hold this function forever.
    for _ in range(process_queue.qsize()):
        requester, frame = process_queue.get(True)
        batch = torch.Tensor(frame).unsqueeze(0).to(parameters.DEVICE)
        common_dict[requester] = process_observations(batch, model)


def destack_memory(memory_queue, observations, rewards, actions, probs):
    """Move finished games from ``memory_queue`` into the training tensors.

    Games are appended while the queue is non-empty and the observation buffer
    holds at most ``parameters.MAXLEN`` entries. A ``RuntimeError`` raised
    while appending is printed and stops the transfer.

    Returns:
        ``(memory_full, observations, rewards, actions, probs)`` where
        ``memory_full`` is True only when a ``RuntimeError`` interrupted the
        transfer, and the four tensors are the (possibly extended) buffers.
    """
    device = parameters.DEVICE
    memory_full = False
    while memory_queue.qsize() > 0 and len(observations) <= parameters.MAXLEN:
        try:
            game_frames, game_returns, game_actions, game_probs = memory_queue.get(True)
            observations = torch.cat((observations, torch.Tensor(game_frames).to(device)))
            rewards = torch.cat((rewards, torch.Tensor(game_returns).to(device)))
            actions = torch.cat((actions, torch.LongTensor(game_actions).to(device)))
            probs = torch.cat((probs, torch.Tensor(game_probs).to(device)))
        except RuntimeError as e:
            print(e)
            memory_full = True
            break
    return memory_full, observations, rewards, actions, probs


def run_epoch(epochs, model, optimizer, observations, rewards, actions, probs):
    """Optimise the model on the collected samples.

    The samples are reshuffled ``parameters.EPOCH_STEPS`` times and consumed in
    full batches of ``parameters.BATCH_SIZE`` (a trailing partial batch is
    dropped). While ``epochs`` is 10 or less only the critic loss is
    back-propagated; afterwards the actor and critic losses are summed. The
    losses of the last batch of each pass are printed, scaled by 1000.

    Args:
        epochs: number of the current epoch, used to pick the loss to optimise.
        model: network exposing ``loss(observations, rewards, actions, probs)``.
        optimizer: optimiser updating ``model``.
        observations, rewards, actions, probs: aligned sample tensors.
    """
    model.train()
    batch_size = parameters.BATCH_SIZE
    n_samples = len(probs)
    if n_samples < batch_size:
        # No full batch can be formed: without this guard the loss variables
        # would be unbound when the summary line is printed.
        print('Not enough samples to train: {} available, {} needed'.format(n_samples, batch_size))
        return
    for _ in range(parameters.EPOCH_STEPS):
        perm = torch.randperm(n_samples)
        for start in range(0, n_samples - batch_size + 1, batch_size):
            batch = perm[start:start + batch_size]
            optimizer.zero_grad()
            lossactor, losscritic = model.loss(observations[batch], rewards[batch], actions[batch], probs[batch])
            if epochs > 10:
                (lossactor + losscritic).backward()
            else:
                losscritic.backward()
            optimizer.step()
        print('Loss actor: {0:7.3f}  Loss critic: {1:7.3f}'.format(1000 * lossactor.item(), 1000 * losscritic.item()))


def _empty_buffers():
    """Return empty (observations, rewards, actions, probs) tensors on the configured device."""
    device = parameters.DEVICE
    return (torch.Tensor([]).to(device),
            torch.Tensor([]).to(device),
            torch.LongTensor([]).to(device),
            torch.Tensor([]).to(device))


def _save_checkpoint(model, optimizer, epochs):
    """Write the model state, optimiser state and epoch counter to ./model/breakout.pt."""
    os.makedirs('./model', exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epochs': epochs
    }, './model/breakout.pt')


def gpu_thread(load, memory_queue, process_queue, common_dict, workers):
    """Entry point of the process that owns the network.

    It is the only process that evaluates or trains the model. In an endless
    loop it pulls finished games from ``memory_queue``, answers the action
    requests waiting in ``process_queue`` and, once more than
    ``parameters.MAXLEN`` observations are buffered (or buffering was
    interrupted), runs a training epoch, saves a checkpoint and publishes the
    epoch number under ``common_dict['epoch']``.

    Args:
        load: when true, resume from ``./model/breakout.pt``.
        memory_queue: queue of finished games produced by the players.
        process_queue: queue of ``(pid, observation)`` action requests.
        common_dict: shared dict used to answer requests and publish the epoch.
        workers: list of CPU core indices this process should be pinned to.

    SIGINT is ignored in this process. Any exception is printed with its
    traceback; the state is then saved, provided it was fully initialised.
    """
    import signal
    import traceback
    import psutil

    signal.signal(signal.SIGINT, signal.SIG_IGN)
    model = None
    optimizer = None
    epochs = 0
    initialised = False
    try:
        try:
            psutil.Process().cpu_affinity(workers)
        except (AttributeError, ValueError, psutil.Error) as e:
            # Pinning is an optimisation only; keep going without it.
            print('could not pin process {} to core(s) {}: {}'.format(os.getpid(), workers, e), flush=True)
        print('process started with pid: {} on core(s) {}'.format(os.getpid(), workers), flush=True)
        model = PPO()
        model.to(parameters.DEVICE)
        optimizer = optim.RMSprop(model.parameters(), lr=parameters.LEARNING_RATE)
        if load:
            # map_location lets a checkpoint written on one device be resumed on another.
            checkpoint = torch.load('./model/breakout.pt', map_location=parameters.DEVICE)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            epochs = checkpoint['epochs']
        initialised = True
        observations, rewards, actions, probs = _empty_buffers()
        common_dict['epoch'] = epochs
        while True:
            memory_full, observations, rewards, actions, probs = \
                destack_memory(memory_queue, observations, rewards, actions, probs)
            destack_process(model, process_queue, common_dict)
            if len(observations) > parameters.MAXLEN or memory_full:
                epochs += 1
                print('-' * 60 + '\n        epoch ' + str(epochs) + '\n' + '-' * 60)
                run_epoch(epochs, model, optimizer, observations, rewards, actions, probs)
                observations, rewards, actions, probs = _empty_buffers()
                _save_checkpoint(model, optimizer, epochs)
                common_dict['epoch'] = epochs
    except Exception:
        print(traceback.format_exc(), flush=True)
        if not initialised:
            # Nothing worth saving yet: writing now could replace a good
            # checkpoint with an untrained or half-loaded model.
            return
        print('saving before interruption', flush=True)
        _save_checkpoint(model, optimizer, epochs)

In [None]:
import torch


class Parameters:
    """Hyper-parameters and runtime settings shared by the training code."""

    def __init__(self):
        # Device on which tensors and the model are placed.
        if torch.cuda.is_available():
            self.DEVICE = "cuda:0"
        else:
            self.DEVICE = "cpu"

        # Optimisation schedule.
        self.LEARNING_RATE = 1e-4
        self.BATCH_SIZE = 32    # samples per optimiser step
        self.EPOCH_STEPS = 10   # passes over the buffered samples per epoch
        self.MAXLEN = 1000      # buffered observations that trigger an epoch

        # Return and loss shaping.
        self.GAMMA = 0.98           # discount applied to future rewards
        self.LOSS_CLIPPING = 0.15   # half-width of the probability-ratio clamp
        self.ACTOR_COEFF = 1.       # weight of the actor loss


# Single shared instance imported by the other modules.
parameters = Parameters()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from parameters import parameters


def to_categorical(y, num_classes):
    """1-hot encode class indices.

    Args:
        y: class index or indices (anything usable to index rows of a tensor).
        num_classes: number of classes, i.e. the length of each encoded row.

    Returns:
        Rows of the ``num_classes`` identity matrix selected by ``y``, on
        ``parameters.DEVICE``.
    """
    # Build the identity matrix directly on the target device so that index
    # tensors already living on that device can be used; a CPU matrix cannot
    # be indexed with a CUDA tensor.
    return torch.eye(num_classes, device=parameters.DEVICE)[y]


class PPO(nn.Module):
    """Convolutional actor-critic network with a clipped-surrogate (PPO) loss.

    Three convolution / max-pool / LeakyReLU stages feed two separate
    fully-connected heads: the actor, which outputs a probability for each of
    the 4 actions, and the critic, which outputs one value per observation.
    """

    # Flattened size of the convolutional output expected by both heads; with
    # these layers it corresponds to e.g. a 3 x 210 x 160 input (24 x 23 x 17).
    N_FEATURES = 9384
    N_ACTIONS = 4

    def __init__(self):
        super().__init__()
        self.vision = nn.Sequential(
            nn.Conv2d(3, 6, 3),
            nn.MaxPool2d(2, 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(6, 16, 3),
            nn.MaxPool2d(2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(16, 24, 5),
            nn.MaxPool2d(2),
            nn.LeakyReLU(0.1),
        )

        self.actor = nn.Sequential(
            nn.Linear(self.N_FEATURES, 120),
            nn.LeakyReLU(0.1),
            nn.Linear(120, 84),
            nn.LeakyReLU(0.1),
            nn.Linear(84, self.N_ACTIONS)
        )

        self.critic = nn.Sequential(
            nn.Linear(self.N_FEATURES, 120),
            nn.LeakyReLU(0.1),
            nn.Linear(120, 84),
            nn.LeakyReLU(0.1),
            nn.Linear(84, 1)
        )

    def _features(self, x):
        """Return the flattened convolutional features, one row per observation."""
        return self.vision(x).view(-1, self.N_FEATURES)

    def forward(self, x):
        """Compute the policy, and in training mode also the value estimate.

        Returns:
            In evaluation mode, the action probabilities of shape (batch, 4).
            In training mode, a tuple of those probabilities and the critic
            output of shape (batch,).
        """
        features = self._features(x)
        actor = self.actor(features).softmax(-1)
        if self.training:
            # squeeze(-1) keeps the batch axis even for a single observation.
            return actor, self.critic(features).squeeze(-1)
        return actor

    def loss(self, observations, rewards, actions, old_prob):
        """Compute the actor and critic losses for a batch.

        Args:
            observations: batch of network inputs.
            rewards: discounted return observed for each sample.
            actions: index (integer tensor) of the action taken for each sample.
            old_prob: probability of that action when it was sampled.

        Returns:
            ``(lossactor, losscritic)``: the negated, clipped surrogate
            objective weighted by ``parameters.ACTOR_COEFF``, and the mean
            squared error between predicted and observed returns.
        """
        # Evaluate both heads explicitly so the result does not depend on the
        # module being in training mode.
        features = self._features(observations)
        prob_distribution = self.actor(features).softmax(-1)
        reward_predicted = self.critic(features).squeeze(-1)

        # Probability the current policy gives to each action actually taken.
        # gather() replaces a one-hot mask built from a CPU identity matrix,
        # which cannot be indexed with action tensors stored on the GPU.
        taken = actions.to(prob_distribution.device).view(-1, 1)
        new_prob = prob_distribution.gather(1, taken).squeeze(1)
        r = (new_prob + 1e-10) / (old_prob + 1e-10)

        # The advantage is detached so the actor loss does not train the critic.
        advantage = (rewards - reward_predicted).detach()
        clipped = torch.clamp(r,
                              min=(1. - parameters.LOSS_CLIPPING),
                              max=(1. + parameters.LOSS_CLIPPING))
        lossactor = - parameters.ACTOR_COEFF \
                    * torch.mean(torch.min(r * advantage, clipped * advantage))
        losscritic = F.mse_loss(reward_predicted, rewards)
        return lossactor, losscritic

In [None]:
Concept :

Forward :

Découper l'écran en ~AxA parties

Pour chaque, faire tourner un classifier sur B classes (AxAxB)
+ des "coordonnées" (probablement un vecteur peu contraint de AxAxC)

Sampler aléatoirement le link en dur choisi, on obtient AxA items

Embedding B -> D (AxAxD)

Coordonnées concat Embedding (AxAx(C+D))

Actor critic après quelques couches linéaires + Leaky




Difficultés à prévoir :

Classification à plotter pour vérifier qu'il merde pas trop
Chercher une façon de s'assurer une certaine distance entre les objets (?)

Pistes d'améliorations :

Chercher un moyen de découvrir les objets autrement que par AxA :

RCNN Actor/Critic
ce serait mieux si parfaitement exécuté (ce qui reste à PoC, mais me paraît impossible)

Prédire direct coordonnées + classification + vecteur
comment s'assurer que les coordonnées soient "dans le bon ordre" ? En donnant une seule conv peut-être
comment s'assurer de l'unicité des classes ? Probablement un énorme problème

Chaque classe cherche ses coordonnées + vecteur + "importance" elle-même
comment linker 2 fois la même classe ? Probablement pas un énorme problème
comment s'assurer de l'unicité des classes ? Probablement un énorme problème


RCNN actor/critic :

CNN

ROI en mode actor/critic (PoC ?) Je peux me tromper, mais les anchors ne peuvent pas marcher en A/C

ROI pooling

coord (~bbox recentering) classification

actor/critic


RPN est impossible je pense car :
- pas de IoU à feed en A/C
- pas de continuité avec un nombre de pixel entier pour la bbox

Fast RCNN au lieu de Faster RCNN ?
ce serait lent mais probablement pas déconnant. Voir comment le ROI fonctionne.

In [None]:
Pytorch multiprocessing PPO implementation playing Breakout
How it works
The optimization is a standard PPO implementation; however, the point was to push the limits of what a limited computer could do in reinforcement learning. Thus I use multiple processes to play the game and gather experiences. However, if multiple processes try to access a single GPU, most of the computation time will be lost to each process waiting for its turn on the GPU, rather than actually playing the game, resulting in a very limited speedup between multiprocessed and non-multiprocessed algorithms. Furthermore, it required the net to be copied to multiple processes, which was very VRAM-consuming.

This algorithm works differently:

multiple processes play the game
a single process has access to the gpu
when a playing process requires the gpu, it sends the operation to execute to the gpu process, and the gpu process sends back the result
This way, the training can be around twice as fast for a computer with a single GPU compared to a naive multiprocessed PPO

Requirements
Pytorch
Numpy
gym (Atari)
a few standard libraries such as argparse, time, os
There is no guarantee this will work in python 2, or without a GPU
around 2Gb of RAM for each core of your CPU with the recommended number of workers
How to begin the training
Clone this repository: git clone https://github.com/CSautier/Breakout
Launch the game in a shell: python Breakout.py
If you'd prefer a faster training, you can deactivate the visualization: python Breakout.py --render False
Useful resources
https://openai.com/
https://arxiv.org/pdf/1707.06347.pdf
Feel free to use as much of this code as you want but mention my github if you found this useful.
For more information, you can contact me on my github.

In [None]:
/model/*.pt
__pycache__/
.*/

In [None]:

+18
-13


Original file line number	Diff line number	Diff line change
@@ -2,31 +2,36 @@

## How it works

The optimization is a standard PPO implementation, however the point was to push the limits of what a limited computer could do in reinforcement learning. Thus I use multiple processes to play the game and gather experiences. However, as my previous experiences taught me, if multiples processes try to access a single gpu, most of the computation time will be lost waiting for their turn on the gpu, rather than actually playing the game, resulting in a very limited speedup between multiprocessed and not multiprocessed algorithms. Furthermore it necessitated the net to be copied on multiple processes, wich was very VRAM consuming.
This algorithm works differently.
The optimization is a standard PPO implementation, however the point was to push the limits of what a limited computer
could do in reinforcement learning. Thus I use multiple processes to play the game and gather experiences. However, if
multiples processes try to access a single gpu, most of the computation time will be lost to each process waiting for
their turn on the gpu, rather than actually playing the game, resulting in a very limited speedup between multiprocessed
and not multiprocessed algorithms. Furthermore it necessitated the net to be copied on multiple processes, wich was very
VRAM consuming. \
\
This algorithm works differently:
* multiple processes play the game
* a single process has access to the gpu
* when a playing process requires the gpu, it sends the operation to execute to the gpu process, and the gpu process sends back the result
* when a playing process requires the gpu, it sends the operation to execute to the gpu process, and the gpu process
sends back the result

This way, the speed limitation will go one step further, on your gpu or ram most likely
This way, the training can be around twice as fast for a computer with a single GPU compared to a naive multiprocessed
PPO

## Requirements

* Pytorch (gpu highly recommanded)
* Pytorch
* Numpy
* gym (Atari)
* a few standard libraries such as argparse, time, os (you most likely already have them)
there might be a few modifications to make to run it in python 2
# Before any execution
Be careful, this requires a lot of ram, especially with many processes, so keep your ram in check and kill the program before it freezes your computer if necessary
* a few standard libraries such as argparse, time, os
* There is no guarantee this will work in python 2, or without a GPU
* around 2Gb of RAM for each core of your CPU with the recommended number of workers

## How to begin the training

* Clone this repository: `git clone https://github.com/CSautier/Breakout`
* Launch the game in bash: `python Breakout.py`
* Launch the game in a shell: `python Breakout.py`
* If you'd prefer a faster training, you can deactivate the visualization: `python Breakout.py --render False`

## Useful resources

‎breakout.py
+72
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,72 @@
from cpu_thread import cpu_thread
from gpu_thread import gpu_thread
import os
import argparse
import torch.multiprocessing as mp
mp.set_sharing_strategy('file_system')
def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description='PPO training')
parser.add_argument('--load', default=False,
                    help='Whether or not to load pretrained weights. '
                         'You must have started an alread trained net for it to work',
                    dest='load', type=str2bool)
parser.add_argument('--render', default=True, help='Show the game running in a separate process. '
                                                   'This slows the training a bit',
                    dest='render', type=str2bool)
parser.add_argument('--gif', default=False, help='Create a gif of a game every once in a while',
                    dest='gif', type=str2bool)
def main(args):
    assert args.render or not args.gif, 'If you want to display a gif, you must set render to true'
    if args.load is False and os.path.isfile('./model/breakout.pt'):
        while True:
            load = input('Are you sure you want to erase the previous training? (y/n) ')
            if load.lower() in ('y', 'yes', '1'):
                break
            elif load.lower() in ('n', 'no', '0'):
                import sys
                sys.exit()
    # create shared variables between all the processes
    manager = mp.Manager()
    # used to send the results of the net
    common_dict = manager.dict()
    # a queue of batches to be fed to the training net
    mem_queue = manager.Queue(1500 * mp.cpu_count())
    # a queue of operations pending
    process_queue = manager.Queue(mp.cpu_count()-1)
    with mp.Pool() as pool:
        try:
            workers: int = pool._processes
            print(f"Running pool with {workers//2} workers")
            pool.apply_async(gpu_thread, (args.load, mem_queue, process_queue, common_dict, [0, 1]))
            if args.render:
                pool.apply_async(cpu_thread, (2 if args.gif else 1, mem_queue, process_queue, common_dict, [2, 3]))
            for i in range(2*(1+args.render), workers, 2):
                pool.apply_async(cpu_thread, (0, mem_queue, process_queue, common_dict, [i, i+1]))
            # Wait for children to finish
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            pool.join()
if __name__ == "__main__":
    args = parser.parse_args()
    mp.set_start_method('spawn')
    main(args)
‎cpu_thread.py
+141
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,141 @@
import os
import gym
import time
import random
import matplotlib.pyplot as plt
from matplotlib import animation
from parameters import parameters
def sample(action):
    tresh = random.random()
    p = 0.
    for i, prob in enumerate(action):
        p += prob
        if p > tresh:
            return i, prob
def process_frame(frame):
    return (frame/255).transpose((2, 0, 1))
def process_reward(reward):
    return 0.01*reward
def generate_game(env, pid, process_queue, common_dict):
    observation = env.reset()
    done = False
    reward_list = []
    action_list = []
    prob_list = []
    observation_list = []
    frame_count = 0
    while not done:
        observation = process_frame(observation)
        observation_list.append(observation)
        process_queue.put((pid, observation))
        while pid not in common_dict:
            time.sleep(0.0001)
        action_prob = common_dict[pid]
        del common_dict[pid]
        action, prob = sample(action_prob)
        observation, reward, done, info = env.step(action)
        action_list.append(action)
        prob_list.append(prob)
        reward_list.append(process_reward(reward))
        frame_count += 1
    print("Score: {:4.0f}".format(100*sum(reward_list)))
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += reward_list[i + 1] * parameters.GAMMA  # compute the discounted obtained reward for each step
    return observation_list, reward_list, action_list, prob_list
def play(env, pid, process_queue, common_dict):
    while True:
        counter = 0
        observation = env.reset()
        done = False
        while not done:
            counter += 1
            if counter >= 2000:
                break
            process_queue.put((pid, process_frame(observation)))
            while pid not in common_dict:
                time.sleep(0.0001)
            action_prob = common_dict[pid]
            del common_dict[pid]
            action = sample(action_prob)[0]
            observation, _, done, _ = env.step(action)
            env.render()
def play_to_gif(env, pid, process_queue, common_dict):
    display = False
    episode = 0
    while 'epoch' not in common_dict:
        time.sleep(0.001)
    while True:
        if common_dict['epoch'] % 25 == 0 and not display:
            display = True
            episode = common_dict['epoch']
        observation = env.reset()
        frames = []
        done = False
        while not done:
            process_queue.put((pid, process_frame(observation)))
            while pid not in common_dict:
                time.sleep(0.0001)
            action_prob = common_dict[pid]
            del common_dict[pid]
            action = sample(action_prob)[0]
            observation, _, done, _ = env.step(action)
            if display:
                frames.append(env.render(mode='rgb_array'))
            else:
                env.render()
        if display:
            display_frames_as_gif(frames, 'Episode {}.gif'.format(episode))
            display = False
def display_frames_as_gif(frames, name):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis('off')
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
    def animate(i):
        patch.set_data(frames[i])
    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=33)
    try:
        anim.save('gifs/' + name)
    except:
        anim.save('gifs/' + name, writer=animation.PillowWriter(fps=40))
def cpu_thread(render, memory_queue, process_queue, common_dict, workers):
    import psutil
    p = psutil.Process()
    p.cpu_affinity(workers)
    import signal
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        env = gym.make('Breakout-v0')
        pid = os.getpid()
        print('process started with pid: {} on core(s) {}'.format(os.getpid(), workers), flush=True)
        if render == 1:
            play(env, pid, process_queue, common_dict)
        elif render == 2:
            play_to_gif(env, pid, process_queue, common_dict)
        else:
            while True:
                observation_list, reward_list, action_list, prob_list = generate_game(env, pid, process_queue, common_dict)
                memory_queue.put((observation_list, reward_list, action_list, prob_list))
    except Exception as e:
        print(e, flush=True)
‎gpu_thread.py
+102
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,102 @@
import os
import torch
from model import PPO
import torch.optim as optim
from parameters import parameters
def process_observations(observation, model):
    with torch.no_grad():
        action = model.forward(observation)
        return action.cpu().squeeze().numpy()
def destack_process(model, process_queue, common_dict):
    if process_queue.qsize() > 0:
        model.eval()
        for _ in range(process_queue.qsize()):  # for instead of while to not get stuck
            pid, obs = process_queue.get(True)
            action = process_observations(torch.Tensor(obs).unsqueeze(0).to(parameters.DEVICE), model)
            common_dict[pid] = action
def destack_memory(memory_queue, observations, rewards, actions, probs):
    while memory_queue.qsize() > 0 and len(observations) <= parameters.MAXLEN:
        try:
            _, __, ___, ____ = memory_queue.get(True)
            observations = torch.cat((observations, torch.Tensor(_).to(parameters.DEVICE)))
            rewards = torch.cat((rewards, torch.Tensor(__).to(parameters.DEVICE)))
            actions = torch.cat((actions, torch.LongTensor(___).to(parameters.DEVICE)))
            probs = torch.cat((probs, torch.Tensor(____).to(parameters.DEVICE)))
        except RuntimeError as e:
            print(e)
            return True, observations, rewards, actions, probs
    return False, observations, rewards, actions, probs
def run_epoch(epochs, model, optimizer, observations, rewards, actions, probs):
    model.train()
    for _ in range(parameters.EPOCH_STEPS):
        perm = torch.randperm(len(probs))
        for i in range(0, len(probs), parameters.BATCH_SIZE):
            if i + parameters.BATCH_SIZE > len(probs):
                break
            optimizer.zero_grad()
            lossactor, losscritic = model.loss(observations[perm[i:i+parameters.BATCH_SIZE]], rewards[perm[i:i+parameters.BATCH_SIZE]], actions[perm[i:i+parameters.BATCH_SIZE]], probs[perm[i:i+parameters.BATCH_SIZE]])
            if epochs > 10:
                (lossactor + losscritic).backward()
            else:
                losscritic.backward()
            optimizer.step()
        print('Loss actor: {0:7.3f}  Loss critic: {1:7.3f}'.format(1000 * lossactor, 1000 * losscritic))
_CHECKPOINT_PATH = './model/mariobot.pt'


def _new_rollout_buffers():
    """Return empty (observations, rewards, actions, probs) tensors on the device."""
    device = parameters.DEVICE
    return (
        torch.Tensor([]).to(device),
        torch.Tensor([]).to(device),
        torch.LongTensor([]).to(device),
        torch.Tensor([]).to(device),
    )


def _save_checkpoint(model, optimizer, epochs, path=_CHECKPOINT_PATH):
    """Write model/optimizer state and the epoch counter to ``path``."""
    # Create the target directory on demand so saving cannot fail just
    # because the folder is missing.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epochs': epochs
    }, path)


def gpu_thread(load, memory_queue, process_queue, common_dict, workers):
    """Own the network: serve action requests and train on collected rollouts.

    This is the only worker that accesses the GPU; it performs all neural
    network computation. It loops forever, alternating between draining
    ``memory_queue`` into the training buffers and answering the requests in
    ``process_queue``. Once the buffers exceed ``parameters.MAXLEN`` samples
    (or draining reported a failure) it runs one training epoch, clears the
    buffers, writes a checkpoint and publishes the epoch count in
    ``common_dict['epoch']``.

    Args:
        load: when true, restore model, optimizer and epoch counter from the
            checkpoint file before starting.
        memory_queue: queue of rollout tuples consumed by ``destack_memory``.
        process_queue: queue of action requests consumed by ``destack_process``.
        common_dict: shared mapping used for replies and for the epoch count.
        workers: CPU core(s) this process is pinned to via ``psutil``.

    Any ``Exception`` is printed; if the model and optimizer already exist a
    final checkpoint is written before the function returns.
    """
    import psutil
    p = psutil.Process()
    p.cpu_affinity(workers)
    import signal
    # Ctrl-C is ignored in this process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Bound before the try block so the handler below can tell whether there
    # is anything to save.
    model = None
    optimizer = None
    epochs = 0
    try:
        print('process started with pid: {} on core(s) {}'.format(os.getpid(), workers), flush=True)
        model = PPO()
        model.to(parameters.DEVICE)
        optimizer = optim.RMSprop(model.parameters(), lr=parameters.LEARNING_RATE)
        if load:
            # map_location lets a checkpoint written on another device be
            # restored onto the device used by this run.
            checkpoint = torch.load(_CHECKPOINT_PATH, map_location=parameters.DEVICE)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            epochs = checkpoint['epochs']
        observations, rewards, actions, probs = _new_rollout_buffers()
        common_dict['epoch'] = epochs
        while True:
            memory_full, observations, rewards, actions, probs = \
                destack_memory(memory_queue, observations, rewards, actions, probs)
            destack_process(model, process_queue, common_dict)
            if len(observations) > parameters.MAXLEN or memory_full:
                epochs += 1
                print('-' * 60 + '\n        epoch ' + str(epochs) + '\n' + '-' * 60)
                run_epoch(epochs, model, optimizer, observations, rewards, actions, probs)
                observations, rewards, actions, probs = _new_rollout_buffers()
                _save_checkpoint(model, optimizer, epochs)
                common_dict['epoch'] = epochs
    except Exception as e:
        print(e)
        if model is None or optimizer is None:
            # The failure happened before the network or optimizer existed;
            # there is no state to checkpoint.
            return
        print('saving before interruption', flush=True)
        _save_checkpoint(model, optimizer, epochs)
# --- model.py (new file, 59 lines) ---
import torch
import torch.nn as nn
import torch.nn.functional as F
from parameters import parameters
def to_categorical(y, num_classes):
    """One-hot encode class indices.

    Args:
        y: index or indices used to select rows of a ``num_classes`` x
            ``num_classes`` identity matrix.
        num_classes: length of each one-hot vector.

    Returns:
        Float tensor on ``parameters.DEVICE`` holding one identity row per
        index in ``y``.
    """
    # Build the identity table directly on the target device: this skips a
    # host-to-device copy and keeps the lookup valid when ``y`` already lives
    # on that device (a CPU table cannot be indexed with GPU indices).
    return torch.eye(num_classes, device=parameters.DEVICE)[y]
class PPO(nn.Module):
    """Convolutional actor-critic network with a clipped-surrogate loss.

    A shared convolutional trunk feeds two fully connected heads: the actor,
    which outputs a probability distribution over ``N_ACTIONS`` actions, and
    the critic, which outputs one value per sample.

    The heads expect the trunk to produce 16 feature maps of 49 x 37, which
    the layer arithmetic gives for 3-channel inputs of e.g. 208 x 160.
    """

    # Width of the actor head (number of discrete actions).
    N_ACTIONS = 4
    # Flattened size of the trunk output fed to both heads.
    N_FEATURES = 16 * 49 * 37

    def __init__(self):
        super(PPO, self).__init__()
        self.vision = nn.Sequential(
            nn.Conv2d(3, 6, 5),
            nn.MaxPool2d(2, 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(6, 16, 5),
            nn.MaxPool2d(2),
            nn.LeakyReLU(0.1),
        )
        self.actor = nn.Sequential(
            nn.Linear(self.N_FEATURES, 120),
            nn.LeakyReLU(0.1),
            nn.Linear(120, 84),
            nn.LeakyReLU(0.1),
            nn.Linear(84, self.N_ACTIONS)
        )
        self.critic = nn.Sequential(
            nn.Linear(self.N_FEATURES, 120),
            nn.LeakyReLU(0.1),
            nn.Linear(120, 84),
            nn.LeakyReLU(0.1),
            nn.Linear(84, 1)
        )

    def forward(self, x):
        """Return action probabilities, plus value estimates in training mode.

        Args:
            x: batch of images accepted by the convolutional trunk.

        Returns:
            In evaluation mode, the ``(batch, N_ACTIONS)`` probabilities.
            In training mode, a tuple ``(probabilities, values)`` where
            ``values`` has shape ``(batch,)``.
        """
        vision = self.vision(x).view(-1, self.N_FEATURES)
        actor = self.actor(vision).view(-1, self.N_ACTIONS).softmax(-1)
        if self.training:
            # Drop only the trailing unit dimension so that a batch of one
            # still yields a 1-D tensor that lines up with the rewards.
            return actor, self.critic(vision).squeeze(-1)
        return actor

    def loss(self, observations, rewards, actions, old_prob):
        """Compute the clipped actor loss and the critic MSE loss.

        The module must be in training mode, because only then does
        ``forward`` return the value estimates needed here.

        Args:
            observations: batch of inputs for ``forward``.
            rewards: target value per sample.
            actions: index of the action taken per sample.
            old_prob: probability of that action when it was taken.

        Returns:
            Tuple ``(lossactor, losscritic)`` of scalar tensors.
        """
        prob_distribution, reward_predicted = self.forward(observations)
        # Probability the current policy assigns to each taken action; gather
        # picks it directly instead of multiplying by a one-hot matrix.
        taken = actions.to(prob_distribution.device).long().reshape(-1, 1)
        new_prob = prob_distribution.gather(-1, taken).squeeze(-1)
        # The small epsilon keeps the ratio finite when a probability is 0.
        r = (new_prob + 1e-10) / (old_prob + 1e-10)
        # Detached so the actor loss does not back-propagate into the critic.
        advantage = (rewards - reward_predicted).detach()
        clipped = torch.clamp(r,
                              min=(1. - parameters.LOSS_CLIPPING),
                              max=(1. + parameters.LOSS_CLIPPING))
        lossactor = - parameters.ACTOR_COEFF \
                    * torch.mean(torch.min(r * advantage, clipped * advantage))
        losscritic = F.mse_loss(reward_predicted, rewards)
        return lossactor, losscritic
# --- parameters.py (new file, 16 lines) ---
import torch
class Parameters:
    """Container for the hyper-parameters shared by the training code."""

    def __init__(self):
        # Use the first CUDA device when one is available, else the CPU.
        if torch.cuda.is_available():
            device = "cuda:0"
        else:
            device = "cpu"

        # Multiplier applied to the actor loss.
        self.ACTOR_COEFF = 1.0
        # Half-width of the clamp interval around 1 for the probability ratio.
        self.LOSS_CLIPPING = 0.15
        # Not referenced in this part of the file; presumably the reward
        # discount factor -- confirm against the rollout code.
        self.GAMMA = 0.98
        self.DEVICE = device
        # Samples per optimizer step.
        self.BATCH_SIZE = 32
        # Shuffled passes over the buffers per training epoch.
        self.EPOCH_STEPS = 10
        # Buffer length that triggers a training epoch once exceeded.
        self.MAXLEN = 1000
        # Learning rate handed to the optimizer.
        self.LEARNING_RATE = 1e-4


# Shared instance read by the rest of the code.
parameters = Parameters()
