# Simple Reinforcement Learning in Pytorch: 
## The Multi-armed bandit, basic policy optimization

In [1]:
import torch
import torch.nn.functional as F
import numpy as np

### The Bandit
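We are using a four-armed bandit. Each arm has a payoff value in `bandit_payoffs`. The `pullBanditArm` function generates a random number between 0 and 1 and returns a positive reward when that number is below the chosen arm's payoff value, so the higher an arm's payoff value, the more likely a positive reward will be returned. We want our agent to learn to always choose the arm that most often gives that positive reward.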

In [2]:
# Payoff probability for each bandit arm: the chance that pulling the arm
# yields a positive reward. Arm 3 (index 2) has the highest chance.
bandit_payoffs = [0.1, 0.0, 0.3, 0.15]
num_arms = len(bandit_payoffs)


def pullBanditArm(arm):
    """Pull one arm of the bandit and return its reward.

    A uniform random draw from [0, 1) is compared against
    ``bandit_payoffs[arm]``; a draw strictly below that value is a payoff.

    Args:
        arm: Index into ``bandit_payoffs`` of the arm to pull.

    Returns:
        1 for a payoff, -1 otherwise.
    """
    draw = np.random.rand(1)
    paid_out = draw < bandit_payoffs[arm]
    return 1 if paid_out else -1

### The Agent

#### Basic agent using a vanilla stochastic policy: each action is sampled at random in proportion to the current policy probabilities

In [3]:
class Agent(torch.nn.Module):
    """Softmax policy over the bandit arms with one learnable weight per arm.

    All weights start at 1, so the initial policy is uniform.
    """

    def __init__(self, n_arms=None):
        """Create the per-arm weights.

        Args:
            n_arms: Number of arms to learn over. When omitted, the
                notebook-level ``num_arms`` is used.
        """
        super(Agent, self).__init__()
        if n_arms is None:
            n_arms = num_arms
        # Register the weights as an nn.Parameter (which requires grad by
        # default) so that parameters(), state_dict() and .to() see them;
        # a plain tensor attribute is invisible to all of those.
        self.weights = torch.nn.Parameter(torch.ones(n_arms))

    def forward(self):
        """Return the policy: a softmax over the arm weights (sums to 1)."""
        return F.softmax(self.weights, dim=0)

    def action(self, output):
        """Sample one arm index at random according to the policy ``output``.

        Args:
            output: Policy probabilities as returned by ``forward``.

        Returns:
            A 0-dim integer tensor holding the sampled arm index.
        """
        # The sampled index is an integer tensor with no autograd history,
        # so nothing needs to be detached here.
        return torch.multinomial(output, 1)[0]

    def loss(self, output, action, reward):
        """Policy-gradient (REINFORCE) surrogate loss for one pull.

        Computes ``-log(output[action]) * reward``. Because ``output`` comes
        from a softmax, the gradient flows back to every arm weight, not only
        the weight of the arm that was picked.

        Args:
            output: Policy probabilities as returned by ``forward``.
            action: Index of the arm that was pulled.
            reward: Reward received for that pull.

        Returns:
            A scalar tensor to call ``backward()`` on.
        """
        return -output[action].log() * reward

### Training the Agent

In [4]:

def train_agent(agent,trials=1000,lr=0.01):
    """Train ``agent`` on the bandit for ``trials`` pulls, then print its policy.

    Each trial computes the current policy, lets the agent choose an arm,
    pulls that arm with ``pullBanditArm``, and takes one Adam step on the
    agent's loss for the observed reward.

    Args:
        agent: An ``Agent`` (or subclass) exposing ``weights``, ``action`` and
            ``loss``, and callable with no arguments to produce the policy.
        trials: Number of arm pulls / optimizer steps to run.
        lr: Learning rate passed to Adam.

    Returns:
        None. The final softmax of ``agent.weights`` is printed, rounded to
        three decimals.
    """
    optimizer = torch.optim.Adam([agent.weights], lr=lr)
    for _ in range(trials):
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so that any
        # registered nn.Module hooks run as PyTorch intends.
        output = agent()
        action = agent.action(output)
        reward = pullBanditArm(action)
        loss = agent.loss(output, action, reward)
        loss.backward()
        optimizer.step()
    print("The learned likelihood of getting the best reward from each of the arms after trial", trials)
    # Reporting only: no autograd graph is needed for the final policy.
    with torch.no_grad():
        final_policy = F.softmax(agent.weights, dim=0)
    print(final_policy.numpy().round(3))

In [5]:
# Train the basic policy-sampling agent for 1000 pulls and report its policy.
agent = Agent()
train_agent(agent, trials=1000)

The learned likelihood of getting the best reward from each of the arms after trial 1000
[0.094 0.034 0.744 0.128]


### Policy Variations

#### Always pick the action the current policy rates highest, even at the very beginning of training

In [6]:
class Agent_greedy(Agent):
    """Agent that never explores: it always pulls the top-rated arm."""

    def action(self, output):
        """Return the index of the largest entry of the policy ``output``."""
        return output.argmax()


train_agent(Agent_greedy(), trials=1000)

The learned likelihood of getting the best reward from each of the arms after trial 1000
[0.251 0.251 0.252 0.246]



#### Sample an action from the current policy half the time; pick a uniformly random arm the other half.

In [7]:
class Agent_mixed(Agent):
    """Agent that mixes policy sampling with uniform random exploration."""

    def action(self, output):
        """Choose an arm: uniform at random on a draw <= 0.5, else from the policy.

        A single uniform draw decides the branch. The policy branch defers to
        ``Agent.action``, which samples from ``output``.
        """
        explore = np.random.rand() <= 0.5
        if explore:
            return np.random.randint(0, num_arms)
        return super().action(output)


train_agent(Agent_mixed(), trials=1000)

The learned likelihood of getting the best reward from each of the arms after trial 1000
[0.    0.    0.998 0.001]



#### Use an epsilon-greedy policy.

In [8]:
class Agent_epsilon_greedy(Agent):
    """Epsilon-greedy agent with a fixed exploration rate.

    On each pull a uniform draw is compared to ``epsilon``: a draw above it
    pulls the arm the policy rates highest, otherwise a uniformly random arm
    is pulled.
    """

    def __init__(self, epsilon=0.1):
        """Set up the base agent and the exploration rate.

        Args:
            epsilon: Exploration threshold compared against a uniform draw
                from [0, 1) on every pull. Defaults to 0.1.
        """
        super(Agent_epsilon_greedy, self).__init__()
        self.epsilon = epsilon

    def action(self, output):
        """Return the greedy arm, or a uniformly random arm when exploring.

        Args:
            output: Policy probabilities as returned by ``forward``.
        """
        if np.random.rand() > self.epsilon:
            return torch.argmax(output)
        return np.random.randint(0, num_arms)


train_agent(Agent_epsilon_greedy(),1000)

The learned likelihood of getting the best reward from each of the arms after trial 1000
[0.242 0.246 0.263 0.248]


#### Use an epsilon-greedy with decreasing epsilon (starts randomly and gradually shifts to greedy.)

In [9]:
class Agent_epsilon_decreasing(Agent):
    """Epsilon-greedy agent whose exploration rate shrinks on every pull.

    ``epsilon`` is multiplied by ``decay`` at the start of each call to
    ``action``, so the agent starts out almost fully random and becomes
    increasingly greedy.
    """

    def __init__(self, epsilon=0.999, decay=0.99):
        """Set up the base agent, the starting epsilon and its decay factor.

        Args:
            epsilon: Initial exploration threshold. Defaults to 0.999.
            decay: Factor applied to ``epsilon`` on every pull. Defaults to
                0.99.
        """
        super(Agent_epsilon_decreasing, self).__init__()
        self.epsilon = epsilon
        self.decay = decay

    def action(self, output):
        """Decay epsilon, then return the greedy arm or a random one.

        Args:
            output: Policy probabilities as returned by ``forward``.
        """
        # Decay first, so the very first pull already uses epsilon * decay.
        self.epsilon *= self.decay
        if np.random.rand() > self.epsilon:
            return torch.argmax(output)
        return np.random.randint(0, num_arms)


agent = Agent_epsilon_decreasing()
train_agent(agent,1000)
print("final epsilon:",agent.epsilon)

The learned likelihood of getting the best reward from each of the arms after trial 1000
[0.251 0.247 0.249 0.253]
final epsilon: 4.312807616324709e-05
