In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque

from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class memory():
    """Bounded replay buffer of (state, action, reward, next_state, done) tuples.

    The buffer is backed by a ``deque`` with ``maxlen=size``, so once it is
    full every new transition pushes the oldest one out.
    """

    def __init__(self, size):
        """Create an empty buffer that keeps at most ``size`` transitions."""
        self.mem = deque(maxlen=size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.mem)

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition as a 5-tuple at the end of the buffer."""
        transition = (state, action, reward, next_state, done)
        self.mem.append(transition)

    def get_train_data(self, batch_size):
        """Return ``batch_size`` distinct transitions drawn uniformly at random.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        return random.sample(self.mem, k=batch_size)



In [2]:
class NN(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Architecture: ``state_size -> 32 -> 32 -> action_size`` with ReLU after
    each of the two hidden layers and a linear output layer.
    """

    def __init__(self, state_size, action_size):
        """Build the three linear layers and Xavier-initialise their weights.

        Args:
            state_size: number of input features.
            action_size: number of output units.
        """
        super().__init__()
        hidden_units = 32
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.out = nn.Linear(hidden_units, action_size)
        # Only the weight matrices are re-initialised; biases keep the
        # values nn.Linear assigned at construction.
        for layer in (self.fc1, self.fc2, self.out):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Return the raw (un-activated) output-layer values for ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.out(hidden)
        

In [3]:
# Placeholder value; the training loop assigns `action` before every env.step call.
action = 0
# Create the CartPole-v1 environment and read the problem dimensions from it.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]  # length of the observation vector
action_size = env.action_space.n  # number of discrete actions


WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.




In [4]:
class Agent():
    """DQN agent with an online ("worker") network and a target network.

    Actions are chosen epsilon-greedily from the worker network. Training
    samples mini-batches from a replay buffer and regresses the Q-value of
    the action actually taken towards ``r + GAMMA * max_a' Q_target(s', a')``,
    where the bootstrap term is dropped for terminal transitions.
    """

    def __init__(self, mem_size, state_size, action_size):
        """Create the networks, optimizer, loss and replay buffer.

        Args:
            mem_size: capacity of the replay buffer.
            state_size: length of a state vector.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size

        self.epsilon = 1            # current exploration probability
        self.epsilon_min = 0.01     # floor for epsilon
        self.epsilon_decay = 0.9    # multiplicative decay per training step
        self.batch_size = 64
        self.GAMMA = 0.9            # discount factor

        self.worker_NN = NN(state_size, action_size).to(device)
        self.target_NN = NN(state_size, action_size).to(device)
        # Start with both networks holding identical weights.
        self.target_NN.load_state_dict(self.worker_NN.state_dict())

        self.optimizer = optim.Adam(self.worker_NN.parameters(), lr=0.001)
        self.loss_func = nn.MSELoss()
        self.mem = memory(mem_size)

    def _model_device(self):
        """Return the device the worker network's parameters live on."""
        return next(self.worker_NN.parameters()).device

    def get_action(self, state):
        """Pick an action epsilon-greedily and return it as a Python int.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest worker-network output.
        Epsilon is NOT modified here: it is decayed (with a floor) only in
        ``train_model`` so the exploration schedule has a single owner.

        Args:
            state: a single state, shaped ``(state_size,)`` or
                ``(1, state_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        state = torch.as_tensor(
            np.asarray(state, dtype=np.float32), device=self._model_device()
        )
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            q_values = self.worker_NN(state)
        return int(torch.argmax(q_values).item())

    def train_model(self):
        """Run one gradient step on a mini-batch drawn from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions. Otherwise decays epsilon (never below ``epsilon_min``)
        and updates the worker network.
        """
        if len(self.mem) < self.batch_size:
            return

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        mini_batch = self.mem.get_train_data(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        dev = self._model_device()

        def _stack(samples):
            # Stored states may be (state_size,) or (1, state_size); flatten
            # each one so they stack into a (batch, state_size) array.
            rows = [np.reshape(s, self.state_size) for s in samples]
            return torch.as_tensor(np.stack(rows).astype(np.float32), device=dev)

        states = _stack(states)
        next_states = _stack(next_states)
        # Actions may be Python ints or 0-d numpy arrays; normalise to int64.
        actions = torch.as_tensor(
            [int(a) for a in actions], dtype=torch.int64, device=dev
        )
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=dev)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=dev)

        # Q(s, a) for the action that was actually taken, not max_a Q(s, a).
        q_eval = self.worker_NN(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a constant for this step: no gradients through target_NN.
        with torch.no_grad():
            q_next = self.target_NN(next_states).max(1)[0]
            # Terminal transitions have no future return to bootstrap from.
            q_target = rewards + self.GAMMA * q_next * (1.0 - dones)

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def weight_copy(self):
        """Copy the worker network's weights into the target network."""
        self.target_NN.load_state_dict(self.worker_NN.state_dict())
            
    

In [5]:
# Build the agent with a replay buffer that keeps the 2000 most recent transitions.
agent = Agent(mem_size=2000, state_size=state_size, action_size =action_size)

In [7]:
N_EPISODES = 500          # number of episodes to run
MAX_STEPS = 500           # per-episode step cap used by this loop
WARMUP_TRANSITIONS = 1000 # start training once the buffer holds more than this
TARGET_SYNC_EVERY = 10    # copy worker -> target weights every N episodes
FAILURE_PENALTY = -100    # reward substituted when the episode ends early

try:
    for i_episode in range(N_EPISODES):
        # Keep the state in the same (1, state_size) layout as next_state below.
        state = np.reshape(env.reset(), [1, state_size])
        score = 0
        for t in range(MAX_STEPS):
            env.render()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Penalise an episode that ends early. `score` has not yet been
            # incremented for this step, so on the final allowed step it equals
            # MAX_STEPS - 1; `>=` keeps a full-length episode from being
            # penalised as a failure.
            if done and score < MAX_STEPS - 1:
                reward = FAILURE_PENALTY

            agent.mem.append_sample(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if len(agent.mem) > WARMUP_TRANSITIONS:
                agent.train_model()

            if done:
                if t + 1 > 100:
                    print("{} : Episode finished after {} timesteps and ".format(i_episode, t+1))
                if (i_episode % TARGET_SYNC_EVERY) == 0:
                    agent.weight_copy()
                break
finally:
    # Release the render window even if the run is interrupted.
    env.close()


229 : Episode finished after 115 timesteps and 




424 : Episode finished after 102 timesteps and 




430 : Episode finished after 104 timesteps and 




433 : Episode finished after 111 timesteps and 




441 : Episode finished after 128 timesteps and 




446 : Episode finished after 106 timesteps and 




447 : Episode finished after 135 timesteps and 




450 : Episode finished after 119 timesteps and 




451 : Episode finished after 117 timesteps and 




452 : Episode finished after 221 timesteps and 




453 : Episode finished after 184 timesteps and 




455 : Episode finished after 184 timesteps and 




456 : Episode finished after 115 timesteps and 




457 : Episode finished after 130 timesteps and 




459 : Episode finished after 107 timesteps and 




460 : Episode finished after 141 timesteps and 




461 : Episode finished after 177 timesteps and 




462 : Episode finished after 255 timesteps and 




463 : Episode finished after 207 timesteps and 




464 : Episode finished after 186 timesteps and 




465 : Episode finished after 500 timesteps and 




466 : Episode finished after 500 timesteps and 




467 : Episode finished after 500 timesteps and 




468 : Episode finished after 378 timesteps and 




469 : Episode finished after 500 timesteps and 




470 : Episode finished after 372 timesteps and 




471 : Episode finished after 206 timesteps and 




472 : Episode finished after 160 timesteps and 




473 : Episode finished after 140 timesteps and 




474 : Episode finished after 197 timesteps and 




475 : Episode finished after 164 timesteps and 




476 : Episode finished after 137 timesteps and 




477 : Episode finished after 127 timesteps and 




478 : Episode finished after 117 timesteps and 




479 : Episode finished after 124 timesteps and 




480 : Episode finished after 119 timesteps and 




481 : Episode finished after 115 timesteps and 




482 : Episode finished after 133 timesteps and 




483 : Episode finished after 126 timesteps and 




484 : Episode finished after 113 timesteps and 




577 : Episode finished after 118 timesteps and 




578 : Episode finished after 124 timesteps and 




579 : Episode finished after 170 timesteps and 




580 : Episode finished after 296 timesteps and 




581 : Episode finished after 500 timesteps and 




582 : Episode finished after 500 timesteps and 




583 : Episode finished after 500 timesteps and 




584 : Episode finished after 500 timesteps and 




585 : Episode finished after 500 timesteps and 




586 : Episode finished after 500 timesteps and 




587 : Episode finished after 500 timesteps and 




588 : Episode finished after 500 timesteps and 




589 : Episode finished after 500 timesteps and 




590 : Episode finished after 500 timesteps and 




591 : Episode finished after 466 timesteps and 




592 : Episode finished after 344 timesteps and 




593 : Episode finished after 251 timesteps and 




594 : Episode finished after 210 timesteps and 




595 : Episode finished after 198 timesteps and 




596 : Episode finished after 183 timesteps and 




597 : Episode finished after 194 timesteps and 




598 : Episode finished after 185 timesteps and 




599 : Episode finished after 196 timesteps and 




600 : Episode finished after 207 timesteps and 




601 : Episode finished after 500 timesteps and 




602 : Episode finished after 500 timesteps and 




603 : Episode finished after 500 timesteps and 




604 : Episode finished after 500 timesteps and 




605 : Episode finished after 500 timesteps and 




606 : Episode finished after 500 timesteps and 




607 : Episode finished after 500 timesteps and 




608 : Episode finished after 500 timesteps and 




609 : Episode finished after 500 timesteps and 




610 : Episode finished after 500 timesteps and 




611 : Episode finished after 500 timesteps and 




612 : Episode finished after 386 timesteps and 




613 : Episode finished after 445 timesteps and 




614 : Episode finished after 343 timesteps and 




615 : Episode finished after 329 timesteps and 




616 : Episode finished after 305 timesteps and 




617 : Episode finished after 231 timesteps and 




618 : Episode finished after 133 timesteps and 




621 : Episode finished after 140 timesteps and 




622 : Episode finished after 140 timesteps and 




623 : Episode finished after 129 timesteps and 




624 : Episode finished after 142 timesteps and 




625 : Episode finished after 176 timesteps and 




626 : Episode finished after 187 timesteps and 




627 : Episode finished after 198 timesteps and 




628 : Episode finished after 207 timesteps and 




629 : Episode finished after 209 timesteps and 




630 : Episode finished after 246 timesteps and 




631 : Episode finished after 147 timesteps and 




632 : Episode finished after 104 timesteps and 




633 : Episode finished after 102 timesteps and 




634 : Episode finished after 120 timesteps and 




635 : Episode finished after 112 timesteps and 




636 : Episode finished after 105 timesteps and 




637 : Episode finished after 110 timesteps and 




638 : Episode finished after 101 timesteps and 




640 : Episode finished after 117 timesteps and 




641 : Episode finished after 178 timesteps and 




642 : Episode finished after 220 timesteps and 




643 : Episode finished after 228 timesteps and 




644 : Episode finished after 230 timesteps and 




645 : Episode finished after 254 timesteps and 




646 : Episode finished after 274 timesteps and 




647 : Episode finished after 270 timesteps and 




648 : Episode finished after 316 timesteps and 




649 : Episode finished after 313 timesteps and 




650 : Episode finished after 347 timesteps and 




651 : Episode finished after 500 timesteps and 




652 : Episode finished after 441 timesteps and 




653 : Episode finished after 364 timesteps and 




654 : Episode finished after 477 timesteps and 




655 : Episode finished after 350 timesteps and 




656 : Episode finished after 500 timesteps and 




657 : Episode finished after 351 timesteps and 




658 : Episode finished after 445 timesteps and 




659 : Episode finished after 493 timesteps and 




660 : Episode finished after 482 timesteps and 




661 : Episode finished after 499 timesteps and 




662 : Episode finished after 500 timesteps and 




663 : Episode finished after 500 timesteps and 




664 : Episode finished after 500 timesteps and 




665 : Episode finished after 500 timesteps and 




KeyboardInterrupt: 