# Simple Reinforcement Learning in PyTorch
## The Multi-armed bandit, basic policy optimization

In [1]:
import torch
import torch.nn.functional as F
import numpy as np

### The Bandit
We are using a four-armed bandit. The `pullBanditArm` function draws a random number between 0 and 1 and returns a positive reward when that number is below the chosen arm's payoff probability, so the higher an arm's payoff value, the more likely a positive reward will be returned. We want our agent to learn to always choose the arm that most often gives that positive reward.

In [2]:
# Payoff probability of each bandit arm: the chance that pulling it pays out.
# As configured, the third arm (index 2) pays out far more often than the rest.
bandit_payoffs = [0.1, 0.0, 0.99, 0.15]
num_arms = len(bandit_payoffs)


def pullBanditArm(arm):
    """Pull one arm of the bandit and return the reward it produced.

    A uniform sample from [0, 1) is compared against the arm's entry in
    ``bandit_payoffs``.

    Args:
        arm: Index into ``bandit_payoffs`` of the arm to pull.

    Returns:
        1 when the sample falls below the arm's payoff value, otherwise -1.
    """
    draw = np.random.rand()
    return 1 if draw < bandit_payoffs[arm] else -1

In [3]:
class Agent(torch.nn.Module):
    """Softmax policy over the bandit arms, with one learnable weight per arm.

    The agent takes no input: its policy is the softmax of ``self.weights``.
    """

    def __init__(self, n_arms=None):
        """Create the policy weights, all initialised to one.

        Args:
            n_arms: Number of arms. When omitted, the notebook-level
                ``num_arms`` is used.
        """
        super().__init__()
        if n_arms is None:
            n_arms = num_arms
        # Registered as a Parameter so that parameters(), state_dict() and
        # similar Module machinery see the weights; ``agent.weights`` can
        # still be handed to an optimizer directly.
        self.weights = torch.nn.Parameter(torch.ones(n_arms))

    def forward(self):
        """Return the current policy: softmax of the weights along dim 0."""
        return F.softmax(self.weights, dim=0)

    def action(self, output):
        """Sample an arm index according to the given policy.

        Args:
            output: 1-D tensor of arm probabilities, e.g. from ``forward()``.

        Returns:
            The sampled arm index as a Python int.
        """
        policy = output.detach().numpy()
        # Sample the index itself. Sampling a probability value and looking
        # its position up afterwards always resolves ties to the lowest
        # index, which skews the draw whenever two arms share a probability
        # (for instance under the initial uniform policy).
        return int(np.random.choice(len(policy), p=policy))

    def loss(self, output, action, reward):
        """Return the loss for the arm that was pulled.

        Computed as ``-output[action] * reward``, so minimising it raises the
        probability of arms that were rewarded and lowers it for arms that
        were penalised.

        NOTE(review): this uses the raw probability; the usual REINFORCE
        formulation uses ``-log(output[action]) * reward``. Left as is so
        the training behaviour does not change.

        Args:
            output: Policy tensor from ``forward()``.
            action: Index of the arm that was pulled.
            reward: Reward received for that pull.
        """
        return -output[action] * reward

### Training the Agent

In [4]:

# Training configuration.
SEED = 0             # fixed so that Restart & Run All gives the same run
LEARNING_RATE = 0.1  # Adam step size
trials = 100         # number of arm pulls used for training

# Both the arm choice and the bandit payout draw from NumPy's global RNG.
np.random.seed(SEED)

agent = Agent()
optimizer = torch.optim.Adam([agent.weights], lr=LEARNING_RATE)

for trial in range(trials):
    optimizer.zero_grad()
    # Call the module rather than .forward() so Module.__call__ is used.
    output = agent()
    action = agent.action(output)
    reward = pullBanditArm(action)
    loss = agent.loss(output, action, reward)
    loss.backward()
    optimizer.step()

print("The learned likelihood of getting the best reward from each of the arms after trial", trials)
# No gradients are needed just to read out the final policy.
with torch.no_grad():
    print(agent().numpy().round(3))

The learned likelihood of getting the best reward from each of the arms after trial 100
[0.001 0.002 0.995 0.002]
