# **Solving cartpole game with DQN**

The goal of the CartPole game is to balance a pole that is attached by a single joint to the top of a moving cart. The state consists of four values, including the angle of the pole and the position of the cart. The agent moves the cart by choosing an action of 0 or 1 at each step, which pushes the cart left or right.

In [0]:
 !pip -q install gym #installing all the required packages
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.image import imread
%matplotlib inline

# Number of games (episodes) the agent plays during training.
EPISODES = 500

class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy action selection and replay.

    The agent stores transitions in a bounded replay buffer, picks actions
    either at random (with probability ``epsilon``) or greedily from the
    network's predicted values, and trains the network on random samples of
    the buffer.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory = deque(maxlen=2000)
        # Discount rate applied to the predicted value of the next state.
        self.gamma = 0.95
        # Exploration rate: probability of choosing a random action in act().
        self.epsilon = 1.0
        # Epsilon is no longer decayed once it is at or below this value.
        self.epsilon_min = 0.01
        # Factor epsilon is multiplied by at the end of each replay() call.
        self.epsilon_decay = 0.995
        # Learning rate handed to the Adam optimizer.
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the value network.

        Two hidden ReLU layers of 24 units feed a linear output layer with one
        unit per action. The model is compiled with MSE loss and Adam.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        try:
            # Current Keras releases name the argument `learning_rate`.
            optimizer = Adam(learning_rate=self.learning_rate)
        except TypeError:
            # Older Keras releases only accept the argument as `lr`.
            optimizer = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        Args:
            state: State before the action was taken.
            action: Index of the action that was taken.
            reward: Reward received for the action.
            next_state: State reached after the action.
            done: True when ``next_state`` ended the episode.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action index is returned;
        otherwise the action with the highest predicted value is returned.

        Args:
            state: Model input for the current state (passed to ``predict``).

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the network on a random sample of remembered transitions.

        For each sampled transition the target for the taken action is the
        reward, plus (when the episode did not end) ``gamma`` times the best
        predicted value of the next state. The other action outputs keep the
        network's current prediction, so only the taken action is corrected.
        Afterwards epsilon is decayed while it is above ``epsilon_min``.

        Args:
            batch_size: Maximum number of transitions to sample. If the buffer
                holds fewer, all of them are used; if it is empty the call
                does nothing.
        """
        if not self.memory:
            # Nothing to learn from yet: skip training and keep epsilon as is.
            return
        # Clamp so a partially filled buffer does not make random.sample raise.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_values = self.model.predict(next_state)[0]
                target = reward + self.gamma * np.amax(next_values)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)


if __name__ == "__main__":
    # Train a DQNAgent on CartPole-v1, checkpoint its weights every 10
    # episodes, then plot the exploration rate recorded after each episode.
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    epsilons = []
    batch_size = 32

    for e in range(EPISODES):
        # Start a fresh game. Newer gym releases return (observation, info)
        # from reset(); older ones return the observation alone.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])

        # Each iteration is one frame; the frame index doubles as the score.
        for t in range(500):
            action = agent.act(state)

            # Newer gym releases return five values from step(), splitting
            # the end-of-episode flag into terminated / truncated.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            # Penalise the transition that ends the game.
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, t, agent.epsilon))
                break
            # Train only once the buffer holds more than one batch.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

        epsilons.append(agent.epsilon)
        if e % 10 == 0:
            agent.save("cartpole-dqn.h5")

    # NOTE: this restores the most recent checkpoint, which is written only
    # when the episode index is a multiple of 10, so any training done after
    # that checkpoint is replaced by the saved weights.
    agent.load("cartpole-dqn.h5")

    plt.figure(figsize=(8, 6), dpi=80)
    plt.title("Epsilon")
    plt.xlabel("Episode")
    plt.ylabel("Epsilon value")
    plt.plot(epsilons)

  result = entry_point.load(False)


episode: 0/500, score: 15, e: 1.0
episode: 1/500, score: 14, e: 1.0
episode: 2/500, score: 25, e: 0.89
episode: 3/500, score: 20, e: 0.8
episode: 4/500, score: 28, e: 0.7
episode: 5/500, score: 14, e: 0.65
episode: 6/500, score: 13, e: 0.61
episode: 7/500, score: 10, e: 0.58
episode: 8/500, score: 8, e: 0.56
episode: 9/500, score: 11, e: 0.53
episode: 10/500, score: 10, e: 0.5
episode: 11/500, score: 12, e: 0.47
episode: 12/500, score: 23, e: 0.42
episode: 13/500, score: 12, e: 0.4
