In [0]:
# Dependencies
import gym
import random
import numpy as np
from collections import deque
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [0]:
# Environment: create the CartPole-v1 task and read the problem dimensions from it.
env = gym.make('CartPole-v1')
# Length of the observation vector; passed to build_model as the network input size.
state_size = env.observation_space.shape[0]
# Number of available actions; passed to build_model as the network output size.
action_size = env.action_space.n

In [0]:
# Agent hyperparameters and state shared with the training loop.
# Replay buffer of (state, action, reward, next_state, done) tuples; once 2000
# entries are stored, appending a new one discards the oldest.
memory = deque(maxlen=2000)
gamma = 0.95    # discount rate applied to the best predicted next-state Q-value
epsilon = 1.0  # exploration rate: probability of taking a random action
epsilon_min = 0.01  # epsilon stops decaying once it is no longer above this value
epsilon_decay = 0.995  # multiplicative factor applied to epsilon after each replay step
learning_rate = 0.001  # optimizer learning rate passed to build_model

EPISODES = 1000  # number of training episodes
done = False  # initial value only; overwritten by env.step() in the training loop
batch_size = 32  # number of transitions sampled from memory per replay step

In [0]:
def build_model(state_size,action_size,learning_rate):
    """Build and compile the Q-network used for Deep-Q learning.

    The network is a fully connected stack: two hidden layers of 24 ReLU
    units followed by a linear output layer with one unit per action, so
    the output row holds one estimated Q-value per action.

    Args:
        state_size: Number of input features (length of the state vector).
        action_size: Number of output units (one per action).
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras ``Sequential`` model using mean-squared-error loss.
    """
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    # Linear output: Q-values are unbounded regression targets.
    model.add(Dense(action_size, activation='linear'))
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated in TF 2.x and rejected by newer Keras optimizers.
    model.compile(loss='mse',
                  optimizer=Adam(learning_rate=learning_rate))
    return model


In [244]:
# Build the Q-network from the environment dimensions and the configured
# learning rate, then print its layer-by-layer summary.
model=build_model(state_size,action_size,learning_rate)
model.summary()

Model: "sequential_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_90 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_91 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_92 (Dense)             (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train the DQN agent: play EPISODES episodes with an epsilon-greedy policy,
# store every transition in `memory`, and after each non-terminal step fit the
# network on a random minibatch of stored transitions (experience replay).
#
# Optional warm start from previously saved weights:
#model.load_weights(link)
result=[]  # one (episode, score, epsilon, final_reward) tuple per finished episode
seed = 42
# Seed the environment and both RNGs used below for action selection/sampling.
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
for e in range(EPISODES):
    state = env.reset()
    # Keras expects a batch dimension: reshape to (1, state_size).
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        # Epsilon-greedy action selection: explore with probability epsilon,
        # otherwise take the action with the highest predicted Q-value.
        if np.random.rand() <= epsilon:
            action = random.randrange(action_size)
        else:
            act_values = model.predict(state)
            action = np.argmax(act_values[0])
        next_state, reward, done, _ = env.step(action)
        # Replace the reward of the episode-ending step with a penalty.
        # NOTE(review): `done` may also be set when the environment stops the
        # episode at a step limit rather than on failure; that step is
        # penalised as well -- confirm this is intended.
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])

        memory.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            # `time` (index of the last step) is reported as the score.
            result.append((e, time, epsilon, reward))
            print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, time, epsilon))
            break
        # Experience replay: only once the buffer holds more than one batch.
        if len(memory) > batch_size:
            minibatch = random.sample(memory, batch_size)
            for state1, action1, reward1, next_state1, done1 in minibatch:
                # Terminal transitions use the raw reward; otherwise bootstrap
                # from the best predicted Q-value of the next state.
                target1 = reward1
                if not done1:
                    target1 = (reward1 + gamma * np.amax(model.predict(next_state1)[0]))
                # Overwrite only the taken action's Q-value so the loss is
                # zero for the other actions.
                target_f = model.predict(state1)
                target_f[0][action1] = target1
                model.fit(state1, target_f, epochs=1, verbose=0)
            # Decay exploration after each replay step until the floor is reached.
            if epsilon > epsilon_min:
                epsilon *= epsilon_decay
    # Checkpoint once per episode (every 10th episode). Previously this sat
    # inside the step loop and rewrote the file on every timestep.
    if e % 10 == 0:
        model.save_weights('DQN.h5')