# In [None]:
import os
import random
from collections import deque

import gym # for environment
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam # adaptive momentum 

EPISODES = 1000


class DQLAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    The agent keeps a bounded replay memory of transitions, a small dense
    network that maps a state to one Q-value per action, and an exploration
    rate (``epsilon``) that decays every time the network is trained.
    """

    def __init__(self, env):
        """Read the problem dimensions from *env* and build the Q-network.

        Args:
            env: environment exposing ``observation_space.shape`` and a
                discrete ``action_space`` with an ``n`` attribute.
        """
        # network input / output sizes
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        # replay memory: once 1000 transitions are stored, the oldest are dropped
        self.memory = deque(maxlen=1000)

        # learning hyperparameters
        self.gamma = 0.95           # discount applied to future rewards
        self.learning_rate = 0.001  # Adam step size

        # epsilon-greedy exploration schedule
        self.epsilon = 1.0          # start fully exploratory
        self.epsilon_decay = 0.995  # multiplicative decay per training step
        self.epsilon_min = 0.01     # floor so some exploration always remains

        self.model = self.build_model()

    def build_model(self):
        """Create and compile the network that approximates Q(state, action)."""
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_size,), activation="relu"))
        model.add(Dense(24, activation="relu"))
        # linear output: one unbounded Q-value estimate per action
        model.add(Dense(self.action_size, activation="linear"))
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for *state* with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action index is
        returned (explore); otherwise the index of the largest predicted
        Q-value is returned (exploit).

        Returns:
            int: an action index in ``[0, action_size)``.
        """
        if random.uniform(0, 1) <= self.epsilon:
            # Sample from the agent's own action count rather than a
            # module-level ``env`` so the agent works outside this script.
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Fit the network on a random minibatch drawn from the replay memory.

        Does nothing until at least *batch_size* transitions are stored.
        Targets follow the Q-learning update: the stored reward for terminal
        transitions, otherwise ``reward + gamma * max_a Q(next_state, a)``.
        Epsilon is decayed after each fit.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # each stored state has shape (1, state_size); stack into one batch
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch])
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        targets = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)
        bootstrapped = rewards + self.gamma * np.max(next_q, axis=1)
        # only the Q-value of the action actually taken gets a new target
        targets[np.arange(batch_size), actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(states, targets, epochs=1, verbose=0)
        self._decay_epsilon()

    def _decay_epsilon(self):
        """Shrink epsilon by ``epsilon_decay`` without going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file *name*."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to *name*, creating its directory if needed."""
        directory = os.path.dirname(name)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.model.save_weights(name)


if __name__ == "__main__":

    # initialize gym environment and agent
    # NOTE: no render() call here. With the 5-tuple step API used below,
    # rendering is configured through gym.make(..., render_mode="human").
    env = gym.make('CartPole-v1')
    agent = DQLAgent(env)
    state_size = env.observation_space.shape[0]
    batch_size = 32

    rolling_rewards = deque(maxlen=100)  # raw returns of the latest episodes
    avg_reward = 0
    for e in range(EPISODES):

        # initialize environment
        state, _ = env.reset()
        state = np.reshape(state, [1, state_size])
        ep_reward = 0  # accumulated once per episode, not once per step
        for time in range(500):
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Score with the environment's own reward; the -10 penalty is
            # used only as a training signal when the episode terminates.
            ep_reward += reward
            shaped_reward = -10 if terminated else reward

            # Store `terminated` (not `truncated`) so a time-limit cut-off
            # still bootstraps from the next state during replay.
            agent.memorize(state, action, shaped_reward, next_state, terminated)
            state = next_state
            agent.replay(batch_size)

            if terminated or truncated:
                print("episode: {}/{}, score: {}, e: {}".format(e, EPISODES, time, agent.epsilon))
                break

        rolling_rewards.append(ep_reward)
        avg_reward = sum(rolling_rewards) / len(rolling_rewards)
        print("Average Reward:", avg_reward)
        # require a full 100-episode window before declaring success
        if len(rolling_rewards) == rolling_rewards.maxlen and avg_reward >= 195.0:
            print("Solved!")
            print(avg_reward)
            agent.save("./save/cartpole-dqn.h5")
            raise SystemExit(0)

    print("Not solved :(")
    print(avg_reward)