# Toy REINFORCE playground

In [1]:
import random
from itertools import accumulate

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Sanity check: ask PyTorch whether CUDA is available in this session.
torch.cuda.is_available()

True

## Utilities

In [3]:
# Discount factor; used as the default `gamma` argument of calculate_qvals.
GAMMA = 0.99

In [7]:
def calculate_qvals(rewards: list[float], gamma: float = GAMMA) -> list[float]:
    """Return the discounted return-to-go for every step of an episode.

    Element ``t`` of the result equals
    ``rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...``.

    Args:
        rewards: Per-step rewards in chronological order.
        gamma: Discount applied to each subsequent step.

    Returns:
        A list of the same length as ``rewards``; empty if ``rewards`` is empty.
    """
    qvals = []
    running = None
    # Walk the episode backwards so each return reuses the one after it.
    for reward in reversed(rewards):
        # The final step has no future, so its return is the reward itself.
        running = reward if running is None else gamma * running + reward
        qvals.append(running)
    # Values were collected last-to-first; restore chronological order.
    qvals.reverse()
    return qvals

## Environment

In [8]:
class BlackJack:
    """Toy counting game: repeatedly add 1, 5 or 10 to a score to land on ``win_score``.

    An episode starts from a random score below ``win_score``. Every action
    adds a fixed amount to the score. Landing exactly on ``win_score`` yields
    ``win_reward``, overshooting yields ``loose_reward``, and any other step
    costs ``step_penalty``.
    """

    def __init__(self) -> None:
        # Maps an action index to the amount that action adds to the score.
        self.actions_dict = {0: 1, 1: 5, 2: 10}
        self.win_score = 101

        self.win_reward = 1000
        self.step_penalty = -5
        self.loose_reward = -1000

        # Defined up front so the other methods do not fail with
        # AttributeError when called before the first reset().
        self.score = 0
        self.steps = 0

    def _get_reward(self) -> float:
        """Return the reward that corresponds to the current score."""
        if self.score < self.win_score:
            return self.step_penalty
        if self.score == self.win_score:
            return self.win_reward
        return self.loose_reward

    def reset(self) -> None:
        """Start a new episode from a random score in ``[0, win_score - 1]``."""
        self.score = random.randint(0, self.win_score - 1)
        self.steps = 0

    def interact(self, action: int) -> tuple[list[int], float]:
        """Apply ``action`` and return the new observation and its reward.

        Args:
            action: Key into ``actions_dict`` selecting the score increment.

        Returns:
            ``([score], reward)`` - the observation is a one-element list
            holding the updated score.

        Raises:
            KeyError: If ``action`` is not a key of ``actions_dict``.
        """
        self.score += self.actions_dict[action]
        self.steps += 1

        return [self.score], self._get_reward()

    def is_terminal(self) -> bool:
        """Return True once the episode is over (score reached or passed ``win_score``).

        The previous implementation read ``self.reward``, an attribute that is
        never assigned, so it always raised AttributeError. The score is used
        instead; it cannot decrease, so both a win and an overshoot are final.
        """
        return self.score >= self.win_score

    def get_observation_shape(self) -> int:
        """Return the length of the observation list produced by ``interact``."""
        return 1

    def get_actions_shape(self) -> int:
        """Return the number of available actions."""
        return len(self.actions_dict)

## Policy Gradient Network

In [None]:
class PGN(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
        hidden_dim: Width of the hidden layer. Both linear layers share this
            value so their shapes always agree (the second layer previously
            expected 127 inputs while the first produced 128, which made
            every forward pass fail with a shape mismatch).
    """

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = 128) -> None:
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied)."""
        return self.net(x)

## Agent

In [44]:
class Agent:
    """Samples actions at random, weighted by the softmax of the given logits."""

    def choose_action(self, action_logits):
        """Draw one action index according to ``softmax(action_logits)``.

        Args:
            action_logits: Tensor of per-action scores; softmax is taken
                over dimension 0.

        Returns:
            A one-element list holding the sampled index, exactly as
            ``random.choices`` returns it - callers must unpack it
            themselves.
        """
        probabilities = F.softmax(action_logits, dim=0)
        candidates = range(len(action_logits))
        return random.choices(candidates, weights=probabilities)

## Training

In [None]:
# Training hyperparameters.
# NOTE(review): the code that consumes these is not visible in this cell -
# presumably the optimizer step size and the number of episodes collected
# per update; confirm against the training loop.
LEARNING_RATE = 0.01
EPISODES_TO_TRAIN = 4