In [1]:
import numpy as np
import pandas as pd 
import datetime as datetime
import gym
from gym import envs

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam    

import os
import time

import random 
from collections import deque

# Query the gym environment registry. The return value is neither assigned nor
# the last expression of the cell, so nothing is displayed or kept from it.
envs.registry.all()
# Create a CartPole environment bound to the notebook-global `env`.
# NOTE(review): the training cell calls gym.make('CartPole-v0') again and
# rebinds `env`, so this instance appears to be unused -- confirm before removing.
env = gym.make('CartPole-v0')

import wandb


from wandb.keras import WandbCallback



In [2]:
class Agent:
    """Deep Q-learning agent using an online network and a target network.

    The online network (``self.model``) selects actions and is trained from a
    replay buffer; the target network (``self.target``) supplies the bootstrap
    values and is synchronised explicitly via ``alighn_target_model``.

    Args:
        state_size: number of features in one observation (network input width).
        action_size: number of discrete actions (network output width).
    """

    # Hidden-layer widths, shared by the network builder and the logged config
    # so the two cannot drift apart.
    HIDDEN_UNITS = (30, 24)

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # experience replay buffer; oldest transitions are evicted past 2000
        self.memory = deque(maxlen=2000)

        # discount rate
        self.gamma = 0.95

        # epsilon-greedy params
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.0001

        self.learning_rate = 0.001

        # Log hyper-parameters once per agent (not once per network build).
        self._log_hyperparameters()

        self.model = self._build_model()
        self.target = self._build_model()
        self.alighn_target_model()

    def _log_hyperparameters(self):
        """Record the agent's hyper-parameters in the active wandb run.

        If a run is already active (e.g. the caller ran ``wandb.init(project=...)``)
        the values are merged into its config instead of starting another run;
        otherwise a run is started so ``WandbCallback`` has something to log to.
        """
        config = {
            "gamma": self.gamma,
            "epsilon_decay": self.epsilon_decay,
            # taken from the attribute actually passed to the optimizer
            "learning_rate": self.learning_rate,
            "epochs": 1,
            "batch_size": 30,
            "neuroneLayer1": self.HIDDEN_UNITS[0],
            "neuroneLayer2": self.HIDDEN_UNITS[1],
            "activation1": "relu",
            "activation2": "relu",
            "activation_out": "linear",
        }
        if wandb.run is None:
            wandb.init(config=config)
        else:
            wandb.config.update(config, allow_val_change=True)

    def _build_model(self):
        """Build and compile the Q-network: state -> one Q-value per action."""
        first, second = self.HIDDEN_UNITS

        model = Sequential()
        model.add(Dense(first, input_dim=self.state_size, activation='relu'))
        model.add(Dense(second, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def align_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target.set_weights(self.model.get_weights())

    # Original (misspelled) public name, kept so existing callers keep working.
    alighn_target_model = align_target_model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: observation shaped ``(1, state_size)`` as fed to the network.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size, fit_batch_size=1):
        """Train the online network on a random sample of stored transitions.

        Q-values for the whole sample are predicted in two batched calls and
        the network is fitted with a single ``fit`` call, instead of two
        predictions and one fit per transition.

        Args:
            batch_size: number of transitions to sample from memory.
            fit_batch_size: mini-batch size used by ``fit``. The default of 1
                keeps one gradient step per sampled transition, as before;
                pass ``batch_size`` for a single mini-batch update.

        Does nothing (and does not decay epsilon) when ``batch_size`` is not
        positive or the memory holds fewer than ``batch_size`` transitions.
        """
        if batch_size < 1 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch])
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.vstack([transition[3] for transition in minibatch])
        terminated = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q = np.asarray(self.target.predict(next_states, verbose=0))

        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future return to bootstrap from.
        targets[np.arange(len(minibatch)), actions] = np.where(
            terminated, rewards, bootstrapped
        )

        self.model.fit(states, targets, batch_size=fit_batch_size, epochs=1,
                       verbose=0, callbacks=[WandbCallback()])

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, path):
        """Load online-network weights from ``path``."""
        self.model.load_weights(path)

    def save(self, path):
        """Save online-network weights to ``path``."""
        self.model.save_weights(path)

In [None]:
#sort of main, to define as a class "controller"

# Authenticate with Weights & Biases without embedding the API key in the
# notebook: wandb.login() uses the WANDB_API_KEY environment variable or the
# stored credentials, and prompts interactively otherwise.
# NOTE(review): the key that used to be hard-coded here is exposed in the
# notebook history and should be revoked/rotated.
wandb.login()
wandb.init(project="RL_PLAY", entity="devantheryl")

env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 30
n_episode = 1000
output_dir = 'model_output/cartepole'
render = True  # set to False to train without opening the render window

os.makedirs(output_dir, exist_ok=True)

agent = Agent(state_size, action_size)

try:
    for e in range(n_episode):

        state = env.reset()
        state = np.reshape(state, [1, state_size])
        done = False
        score = 0  # number of steps survived in this episode

        while not done:
            if render:
                env.render()
            score += 1
            action = agent.act(state)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.remember(state, action, reward, next_state, done)

            state = next_state

        # Episode finished: report it and sync the target network.
        print("episode : {}/{}, score : {}, e : {:.2}".format(e, n_episode, score, agent.epsilon))
        agent.alighn_target_model()

        # One training pass per episode, once enough transitions are stored.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        # Checkpointing is disabled. To enable it, save every 50 episodes with:
        #   agent.save(os.path.join(output_dir, "weights_{:04d}.hdf5".format(e)))
finally:
    # Release the environment (and its render window) even if training is interrupted.
    env.close()
    
        

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\LDE/.netrc
wandb: Currently logged in as: devantheryl (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.7 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.0, max=1.0)…

wandb: wandb version 0.12.7 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

wandb: wandb version 0.12.7 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


episode : 0/1000, score : 19, e : 1.0
episode : 1/1000, score : 28, e : 1.0
episode : 2/1000, score : 14, e : 0.99
episode : 3/1000, score : 16, e : 0.99
episode : 4/1000, score : 27, e : 0.99
episode : 5/1000, score : 33, e : 0.98
episode : 6/1000, score : 17, e : 0.98
episode : 7/1000, score : 14, e : 0.97
episode : 8/1000, score : 23, e : 0.97
episode : 9/1000, score : 24, e : 0.96
episode : 10/1000, score : 11, e : 0.96
episode : 11/1000, score : 33, e : 0.95
episode : 12/1000, score : 20, e : 0.95
episode : 13/1000, score : 24, e : 0.94
episode : 14/1000, score : 21, e : 0.94
episode : 15/1000, score : 15, e : 0.93
episode : 16/1000, score : 30, e : 0.93
episode : 17/1000, score : 15, e : 0.92
episode : 18/1000, score : 29, e : 0.92
episode : 19/1000, score : 13, e : 0.91
episode : 20/1000, score : 24, e : 0.91
episode : 21/1000, score : 11, e : 0.9
episode : 22/1000, score : 11, e : 0.9
episode : 23/1000, score : 11, e : 0.9
episode : 24/1000, score : 24, e : 0.89
episode : 25/10

episode : 203/1000, score : 61, e : 0.36
episode : 204/1000, score : 47, e : 0.36
episode : 205/1000, score : 35, e : 0.36
episode : 206/1000, score : 41, e : 0.36
episode : 207/1000, score : 148, e : 0.36
episode : 208/1000, score : 60, e : 0.35
episode : 209/1000, score : 68, e : 0.35
episode : 210/1000, score : 55, e : 0.35
episode : 211/1000, score : 53, e : 0.35
episode : 212/1000, score : 74, e : 0.35
episode : 213/1000, score : 132, e : 0.35
episode : 214/1000, score : 63, e : 0.34
episode : 215/1000, score : 78, e : 0.34
episode : 216/1000, score : 82, e : 0.34
episode : 217/1000, score : 46, e : 0.34
episode : 218/1000, score : 50, e : 0.34
episode : 219/1000, score : 109, e : 0.34
episode : 220/1000, score : 44, e : 0.33
episode : 221/1000, score : 81, e : 0.33
episode : 222/1000, score : 91, e : 0.33
episode : 223/1000, score : 79, e : 0.33
episode : 224/1000, score : 153, e : 0.33
episode : 225/1000, score : 103, e : 0.33
episode : 226/1000, score : 200, e : 0.32
episode : 

In [None]:
# Scratch check: stacking a (1, 4) row onto an empty (0, 4) array four times
# yields a (4, 4) array, one row per stacked state.
state = np.array([[1, 1, 1, 1]])

# Zero-row float array used as the starting point for the stack.
x = np.empty((0, 4))

# Stack all four copies of the row in a single call.
x = np.vstack([x] + [state] * 4)

print(x)