In [1]:
import gym
import numpy as np 
import torch
from torch import nn
from torch.autograd import Variable
from torch import optim
from torch.nn import functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [2]:
def one_hot(ids, nb_digits):
    """
    ids: (list, ndarray) shape:[batch_size]
    """
    if not isinstance(ids, (list, np.ndarray)):
        raise ValueError("ids must be 1-D list or array")
    batch_size = len(ids)
    ids = torch.cuda.LongTensor(ids).view(batch_size, 1)
    out_tensor = Variable(torch.cuda.FloatTensor(batch_size, nb_digits))
    out_tensor.data.zero_()
    out_tensor.data.scatter_(dim=1, index=ids, value=1.)
    return out_tensor

def uniform_linear_layer(linear_layer):
    linear_layer.weight.data.uniform_()
    linear_layer.bias.data.fill_(-0.02)

In [3]:
lake = gym.make('FrozenLake-v0')

[2018-01-23 13:48:56,377] Making new env: FrozenLake-v0


In [4]:
lake = gym.make('FrozenLake-v0')
lake.reset()
lake.render()
# lake.step(1)
# lake.render()

[2018-01-23 13:49:27,948] Making new env: FrozenLake-v0



[41mS[0mFFF
FHFH
FFFH
HFFG


In [5]:
class Agent(nn.Module):
    def __init__(self, observation_space_size, action_space_size):
        super(Agent, self).__init__()
        self.observation_space_size = observation_space_size
        self.hidden_size = observation_space_size
        self.l1 = nn.Linear(in_features=observation_space_size, out_features=self.hidden_size).cuda()
        self.l2 = nn.Linear(in_features=self.hidden_size, out_features=action_space_size).cuda()
        uniform_linear_layer(self.l1)
        uniform_linear_layer(self.l2)
    
    def forward(self, state):
        obs_emb = one_hot([int(state)], self.observation_space_size)
        out1 = F.sigmoid(self.l1(obs_emb))
        return self.l2(out1).view((-1)) # 1 x ACTION_SPACE_SIZE == 1 x 4  =>  4

In [7]:
class Trainer:
    def __init__(self):
        self.agent = Agent(lake.observation_space.n, lake.action_space.n)
        self.optimizer = optim.Adam(params=self.agent.parameters())
        self.success = []
        self.jList = []
    
    def train(self, epoch):
        for i in tqdm(range(epoch)):
            s = lake.reset()
            j = 0
            while j < 200:
                
                # perform chosen action
                a = self.choose_action(s)
                s1, r, d, _ = lake.step(a)
                if d == True and r == 0: r = -1
                
                # calculate target and loss
                target_q = r + 0.99 * torch.max(self.agent(s1).detach()) # detach from the computing flow
                loss = F.smooth_l1_loss(self.agent(s)[a], target_q)
                
                # update model to optimize Q
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
                # update state
                s = s1
                j += 1
                if d == True: break
            
            # append results onto report lists
            if d == True and r > 0:
                self.success.append(1)
            else:
                self.success.append(0)
            self.jList.append(j)
        print("last 100 epoches success rate: " + str(sum(self.success[-100:])) + "%")

    def choose_action(self, s):
        if (np.random.rand(1) < 0.1): 
            return lake.action_space.sample()
        else:
            agent_out = self.agent(s).detach()
#             print("s: " + str(s))
#             print(self.agent(s))
#             print(self.agent.forward(s))
            _, max_index = torch.max(agent_out, 0)
            return max_index.data.cpu().numpy()[0]

In [8]:
t = Trainer()
t.train(2000)

100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]

last 100 epoches success rate: 30%





In [8]:
sum(t.success[-100:])

25

In [9]:
t.agent.l1.weight

Parameter containing:

Columns 0 to 9 
 0.7423 -0.3557  0.0710 -0.1783  0.5162  0.6724  0.0488  0.6868  0.2366  0.8622
 0.3130  0.2385  0.3504  0.0446 -0.0737  0.7004 -0.3870  0.5396  0.6313  0.8582
 0.1951  0.4983  0.1643  0.4734  1.0127  0.7690  0.6937  0.0357 -0.0444  0.6728
 0.1644  0.4600  0.3233  0.0913  0.7077  0.0815  1.0353  0.3363  0.3254  0.1934
 0.0837  0.0125  0.4990  0.3654  0.3342  0.1007  0.1168  0.9566 -0.0481  1.1108
-0.3572  0.1324  0.0055  0.3420 -0.4731  0.4924  0.8744  0.3643  0.5987  0.7327
-0.6579  0.9036  0.2980  1.0796  0.3269  0.7730  1.6167  0.3642  0.4722  0.2677
-0.1340 -0.4073  0.3173  0.1615  0.0344  0.3560  0.8031  0.0623 -0.3421 -0.1835
 0.1729  0.1236  0.6371  0.4363 -0.1595  0.9928 -0.1705  0.8178  0.2752 -0.3956
 0.4530  0.5539  0.0290 -0.0185  0.0071  0.2686 -0.2819  0.3291  0.4311  0.1830
 0.3114  0.4098  0.2487  0.5311  0.6061  0.8223 -0.0979  0.8103  0.3167 -0.3426
-0.2285  0.2757 -0.0500  0.3170  0.1526  0.3480  0.7339  0.1567  0.4384 -0.0597
 

In [20]:
lake2 = gym.make('FrozenLake-v0')
lake2.seed(4)
# lake2.reset()
lake2.render()



[41mS[0mFFF
FHFH
FFFH
HFFG
