In [None]:
rm -rf ML_main_project

In [None]:
# Clone the project repository into ./ML_main_project. It provides the task
# files, including task3.py, which a later cell copies over gym's cartpole.py.
!git clone https://github.com/ganeshalamuru/ML_main_project.git ML_main_project

In [None]:
import os
import subprocess

import gym

In [None]:
# Directory of the installed gym package (the folder containing its
# __init__ module). os.path.dirname works regardless of the path separator or
# the exact file name/extension reported by gym.__file__, whereas stripping
# the literal '/__init__.py' suffix silently leaves the path unchanged when it
# does not match.
gym_path = os.path.dirname(gym.__file__)
print(gym_path)

In [None]:
# Overwrite gym's bundled CartPole implementation with the project's task3.py.
# Passing an argument list (no shell) keeps paths that contain spaces intact,
# and check=True raises CalledProcessError if the copy fails instead of
# returning a non-zero exit code that is easy to overlook.
subprocess.run(
    [
        'cp',
        './ML_main_project/task3.py',
        '{}/envs/classic_control/cartpole.py'.format(gym_path),
    ],
    check=True,
)

In [None]:
import gym
from gym import spaces, logger
from gym.utils import seeding
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
import keras
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.utils.tensorboard import SummaryWriter
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from collections import Counter
from collections import defaultdict
from collections import deque
from statistics import median, mean

In [None]:
# Create a CartPole environment, reset it, and print its observation space,
# action space and pole mass before closing it again (nothing is rendered).
env = gym.make("CartPole-v1")
env.reset()
print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")
print(f"Mass of the pole: {env.masspole}")
env.close()

In [None]:
# Deep Q Learning
# Hyperparameters

# Environment used by the agent during training; its spaces determine the
# input and output sizes of the Q-network.
env = gym.make("CartPole-v1")
STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.n

# Optimisation
LR = 0.01                       # learning rate passed to the Adam optimizer
BATCH_SIZE = 64                 # transitions sampled from the buffer per update
GAMMA = 0.95                    # discount applied to the next-state value

# Epsilon-greedy exploration
EPSILON = 0.9                   # epsilon at the start of every training episode
DECAY = 0.995                   # epsilon is multiplied by this after each step

# Replay buffer and target network
REPLAY_BUFFER_CAPACITY = 2_000  # max stored transitions; updates begin once more than this many were pushed
TARGET_UPDATE_INTERVAL = 100    # updates between copies of the online weights into the target network

# Number of episodes to run
N_TRAIN_EPISODES = 1_000
N_TEST_EPISODES = 100

# Initialise weights from normal distribution
def init_weights(m):
    """Re-draw the weights of a linear layer from a normal distribution.

    Meant to be passed to ``nn.Module.apply``. Only ``nn.Linear`` modules are
    affected: their ``weight`` is filled in place with samples of mean 0.0 and
    standard deviation 0.1. Biases and all other module types are untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.1)

# Store the experiences in the replay buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions.

    Once ``size`` items are held, pushing a new item discards the oldest one
    (``deque`` with ``maxlen``).
    """

    def __init__(self, size):
        self.size = size
        self.memory = deque(maxlen=size)

    def push(self, x):
        """Append one transition tuple to the buffer."""
        self.memory.append(x)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Each stored item is expected to be a 5-tuple
        ``(state, action, reward, next_state, done)``. The fields are stacked
        per position, so five numpy arrays are returned in that order.
        """
        transitions = random.sample(self.memory, batch_size)
        fields = zip(*transitions)
        state, action, reward, next_state, done = (
            np.stack(field) for field in fields
        )
        return state, action, reward, next_state, done

    def get_len(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)


# Neural Network Definition
class DQN(nn.Module):
    """Small fully connected Q-network.

    Takes an input of size ``STATE_DIM``, passes it through a 50-unit hidden
    layer with ReLU activation, and produces ``ACTION_DIM`` outputs.
    """

    def __init__(self):
        super().__init__()

        self.fc1 = nn.Linear(STATE_DIM, 50)
        self.fc2 = nn.Linear(50, ACTION_DIM)

        # Re-initialise the weights of both linear layers via init_weights.
        self.apply(init_weights)

    # Forward Propagation of an Input
    def forward(self, x):
        """Return the output of the second linear layer for input ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Agent who plays the game
class Agent(object):
    """Deep Q-learning agent for CartPole.

    The agent owns an online network (``dqn``), which is optimised, and a
    target network (``target_dqn``), which is periodically overwritten with
    the online weights and used to compute the bootstrap targets. Transitions
    are kept in a ``ReplayBuffer`` and sampled in mini-batches.

    The methods read the notebook-level hyperparameters (``LR``, ``GAMMA``,
    ``BATCH_SIZE``, ...) and, for exploration and reward shaping, the
    notebook-level ``env``.

    NOTE(review): a later cell unpickles an object and calls ``evaluate`` and
    ``choose_action`` on it -- presumably an instance of this class. Unpickling
    does not run ``__init__``, so methods should only rely on the attributes
    assigned below (plus ``epsilon``, which ``train`` sets).
    """

    def __init__(self):
        self.dqn, self.target_dqn = DQN(), DQN()

        self.learn_step_counter = 0  # number of update_params() calls so far
        self.memory_counter = 0      # number of transitions pushed so far
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_CAPACITY)
        self.optimizer = opt.Adam(self.dqn.parameters(), lr=LR)
        self.loss_fn = nn.MSELoss()

    # Choosing action based on epsilon-greedy policy: Choose a random action if random number generated <= epsilon
    def choose_action(self, s, epsilon):
        """Select an action for state ``s`` with an epsilon-greedy policy.

        Args:
            s: Observation to act on (array-like of numbers).
            epsilon: Exploration threshold. A uniform random number is drawn;
                if it is <= ``epsilon`` a random action is sampled from the
                notebook-level ``env``'s action space, otherwise the action
                with the largest predicted Q-value is chosen.

        Returns:
            The chosen action: a Python ``int`` on the greedy branch, or
            whatever ``env.action_space.sample()`` returns when exploring.
        """
        if np.random.uniform() > epsilon:
            state = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
            # Action selection needs no gradients.
            with torch.no_grad():
                q_values = self.dqn(state)
            return int(q_values.max(1)[1].item())
        return env.action_space.sample()

    # Update parameters of the NN
    def update_params(self):
        """Perform one gradient step on a mini-batch from the replay buffer.

        Every ``TARGET_UPDATE_INTERVAL`` calls (including the first) the
        target network is synchronised with the online network beforehand.
        The regression target is ``reward + GAMMA * max_a Q_target(s', a)``
        for non-terminal transitions and just ``reward`` for transitions whose
        ``done`` flag is set, so no value is bootstrapped from a state after
        which the episode ended.
        """
        # update target network
        if self.learn_step_counter % TARGET_UPDATE_INTERVAL == 0:
            self.target_dqn.load_state_dict(self.dqn.state_dict())
        self.learn_step_counter += 1

        # sample batch of transitions
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            BATCH_SIZE
        )

        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.int64).reshape(-1, 1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1)
        next_states = torch.as_tensor(next_states, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(1)

        # Q-value of the action actually taken, one column per sample.
        q_current = self.dqn(states).gather(1, actions)
        # The target network only provides constants for the loss.
        with torch.no_grad():
            q_next_max = self.target_dqn(next_states).max(1, keepdim=True)[0]
        # (1 - done) zeroes the bootstrap term for terminal transitions.
        q_target = rewards + GAMMA * (1.0 - dones) * q_next_max
        q_loss = self.loss_fn(q_current, q_target)

        # backpropagate
        self.optimizer.zero_grad()
        q_loss.backward()
        self.optimizer.step()

    @staticmethod
    def _shaped_reward(next_state):
        """Return the training reward computed from cart position and pole angle.

        Both terms grow as the cart gets closer to the centre and the pole
        closer to upright, relative to the notebook-level ``env``'s
        ``x_threshold`` and ``theta_threshold_radians``; the offsets 0.8 and
        0.5 shift each term so it can become negative.
        """
        x, _, theta, _ = next_state
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (
            env.theta_threshold_radians - abs(theta)
        ) / env.theta_threshold_radians - 0.5
        return r1 + r2

    @staticmethod
    def _report_scores(scores, xlabel):
        """Print the mean of ``scores`` and plot them against ``xlabel``."""
        if not scores:
            # Nothing was run (episode count of 0): avoid dividing by zero.
            print('No episodes were run.')
            return
        print('Average Score:', sum(scores) / len(scores))
        plt.plot(scores)
        plt.ylabel('Scores')
        plt.xlabel(xlabel)
        plt.show()

    def train(self):
        """Train the agent for ``N_TRAIN_EPISODES`` episodes on the notebook-level ``env``.

        Each step stores ``(state, action, shaped_reward, next_state, done)``
        in the replay buffer. Once more than ``REPLAY_BUFFER_CAPACITY``
        transitions have been pushed, ``update_params`` runs after every step.
        At the end of each episode its summed original environment reward and
        step count are printed; afterwards the per-episode step counts are
        averaged and plotted.
        """
        scores = []
        for i in range(N_TRAIN_EPISODES):
            state = env.reset()
            episode_reward = 0
            step = 0
            # NOTE(review): epsilon restarts at EPSILON for every episode and
            # only decays within it, so exploration never decreases across
            # episodes -- confirm this is intended.
            self.epsilon = EPSILON
            while True:
                # env.render()
                action = self.choose_action(state, self.epsilon)
                # Decay after every step, but never below 0.1.
                self.epsilon = max(0.1, self.epsilon * DECAY)

                # take action
                next_state, reward_orig, done, _ = env.step(action)
                step += 1

                # Learn from the shaped reward, report the original one.
                reward = self._shaped_reward(next_state)
                self.replay_buffer.push((state, action, reward, next_state, done))
                self.memory_counter += 1
                episode_reward += reward_orig

                if self.memory_counter > REPLAY_BUFFER_CAPACITY:
                    self.update_params()

                if done:
                    print(
                        "Episode: {}, Reward: {}, step: {}".format(
                            i, round(episode_reward, 2), step
                        )
                    )
                    break

                state = next_state
            scores.append(step)
        # Print some stats
        self._report_scores(scores, 'Train Episodes')

    def evaluate(self):
        """Run ``N_TEST_EPISODES`` greedy episodes on a fresh CartPole environment.

        Actions are chosen with ``epsilon=0``. The number of steps survived in
        each episode is recorded, then the average is printed and the
        per-episode values are plotted. The environment created here is
        closed even if an episode raises.
        """
        env = gym.make('CartPole-v1')
        scores = []
        try:
            for _episode in range(N_TEST_EPISODES):
                state = env.reset()
                step = 0
                done = False
                while not done:
                    # env.render()
                    action = self.choose_action(state, 0)
                    state, _, done, _ = env.step(action)
                    step += 1
                scores.append(step)
        finally:
            env.close()
        # Print some stats
        self._report_scores(scores, 'Test Episodes')




In [None]:
# Load the pre-trained model from disk and evaluate it.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files from a trusted source. Unpickling also needs the classes of
# the stored object (presumably Agent, DQN and ReplayBuffer from the cell
# above) to be defined in this kernel first.
with open('../Trained Models/Deep-Q-Learning-Model-Best-Extra.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Evaluate after the file handle has been released.
model.evaluate()

In [None]:
# Evaluate model over 100 episodes with rendering
scores = []
env = gym.make("CartPole-v1")
# Seed once, before the first reset: the run is reproducible, but each episode
# still starts from a different state. Re-seeding with the same value before
# every reset would replay one identical episode 100 times under the greedy
# policy used here.
env.seed(42)
try:
    for episode in range(100):
        current_state = env.reset()
        score = 0
        # Hard cap of 500 steps per episode (presumably mirrors the episode
        # limit of CartPole-v1 -- confirm against the environment in use).
        for _ in range(500):
            env.render()
            action = model.choose_action(current_state, 0)
            current_state, reward, done, info = env.step(action)
            score += 1
            if done:
                break
        scores.append(score)
finally:
    # Release the render window even if an episode fails.
    env.close()
# Print some stats
print('Average Score:', sum(scores) / len(scores))
print('Scores:', scores)