In [10]:
# The goal here is to build a reinforcement learning skeleton project
# that can be adapted both to my own sims and openai gym/other stuff.
# Also, there are annotations by nifty bits of code so that I can learn it
# better and faster.
from collections import deque, namedtuple

import gym
import numpy as np
import tensorflow as tf
import random
import math
import imageio

from PIL import Image
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers.legacy import Adam

# import environment here, or build it in-notebook idc

ALPHA = 0.01
GAMMA = 0.99
MEMORY_SIZE = 1000
NUM_STEPS_FOR_UPDATE = 5
env = gym.make('CarRacing-v2',domain_randomize=True,continuous=False)
observation = env.reset()
# controller = env.controller()
# env.n_foods = 3
# env.grid_size = [25,25]
# env.unit_size = 10
# env.unit_gap = 1
# env.snake_size = 1
# env.random_init = True
#...

# When I customize this enough that it's my own, maybe name my setup after Otaku things, because DQN. haha. So funny.
# With this, I am basically performing transfer learning on myself: I'm starting with someone else's model (Andrew Ng) and then gradually descending towards
# a solution that is more tuned towards me (while improving my knowledge)

In [11]:
## environment stuff
# Note to self: I should make whatever environment I build's API resemble 
# that of the openai thing, so that I can freely switch between my own
# sims and theirs depending on the project.
# ADDITIONAL NOTE: minesweeper is probably not a good initial game. Way too luck-based.

state_size =  env.observation_space.shape# change based on what is being considered
num_actions = env.action_space.n # Figure out how to represent "actions" efficiently
env.reset()

print(env.render())
# Image.fromarray(env.render())
print("State shape: ",env.observation_space.shape)
print("Number of actions (Should be 5):",env.action_space.n)
print("State and action space full objects:",env.observation_space,env.action_space)

None
State shape:  (96, 96, 3)
Number of actions (Should be 5): 5
State and action space full objects: Box(0, 255, (96, 96, 3), uint8) Discrete(5)


  gym.logger.warn(


In [12]:

# constants
# print(state_size)
# dimensions_after_1_conv = convolve_dimensions(state_size[0],3,0,1)
# print("1conv dimensions",dimensions_after_1_conv)
# dimensions_after_1_pool = pool_dimensions(dimensions_after_1_conv,3)
# print("1pool dimensions",dimensions_after_1_pool)

# Set up networks
q_network = Sequential([
    Conv2D(32,3,input_shape=(state_size)),
    MaxPooling2D((3,3)),
    Conv2D(64,3),
    MaxPooling2D((3,3)),
    Flatten(),
    Dense(units = 50, activation = "relu"),
    Dense(units = 25, activation = "relu"),
    Dense(units = num_actions, activation = "linear")
])

# # Set up networks
# q_network = Sequential([
#     Conv2D(32,3,input_shape=(state_size)),
#     MaxPooling2D((3,3)),
#     Conv2D(64,dimensions_after_1_pool,dimensions_after_1_pool),
#     MaxPooling2D((3,3)),
#     Flatten(),
#     Dense(units = 64, activation = "relu"),
#     Dense(units = 32, activation = "relu"),
#     Dense(units = num_actions)
# ])

# target_q_network = Sequential([
#     Input(state_size),
#     Dense(units = 64, activation = "relu"),
#     Dense(units = 64, activation = "relu"),
#     Dense(units = num_actions)
# ])
optimizer = Adam(ALPHA)

# define environment tuple
experience = namedtuple("Experience", field_names=["state","action","reward","next_state","done"])

Metal device set to: Apple M1 Pro


2023-01-15 21:22:29.693456: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-15 21:22:29.694147: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
def compute_loss(experiences, gamma, q_network,):
    
    # unpack values of tuples as lists (basically take the columns out)
    states, actions, rewards, next_states, done_vals = experiences
    
    # Compute Q(s,a) for all experience tuples (this will be used to get the y-target value)
    max_q = tf.reduce_max(q_network(next_states), axis=-1)
    finished_constant = 1 - done_vals
    
    y_targets = rewards + gamma*max_q*finished_constant
    
    q_values = q_network(states)
    # Get someone clever to tell me what the line below does
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
    
    loss = MSE(y_targets,q_values)
    return loss
    
    

In [14]:
# define helpers
EPSILON_DECAY = 0.99
EPSILON_FLOOR = 0.0

MINIBATCH_SIZE = 64

def update_if_conditions(t,steps_for_update,buffer): # update only every some # of steps
                                                     # and also only when buffer is at least as large as the batch size.
    if (((steps_for_update % t) == 0) and (len(buffer) > MINIBATCH_SIZE)):
        return True
    return False
    
def sample_experiences(buffer): # Taken from Andrew Ng's coursera ML course
    experiences = random.sample(buffer,k=MINIBATCH_SIZE)
    states = tf.convert_to_tensor( # conversion to tensor so that methods like tf.reduce_max work later
    np.array([e.state for e in experiences if e is not None]), dtype=tf.float32
    )
    # Note the neat inline for loop and if statement going on. Be sure to ask about that.
    actions = tf.convert_to_tensor(
        np.array([e.action for e in experiences if e is not None]), dtype=tf.float32
    )
    
    rewards = tf.convert_to_tensor(
        np.array([e.reward for e in experiences if e is not None]), dtype=tf.float32
    )
    next_states = tf.convert_to_tensor(
        np.array([e.next_state for e in experiences if e is not None]), dtype=tf.float32
    )
    done_vals = tf.convert_to_tensor(
        np.array([e.done for e in experiences if e is not None]).astype(np.uint8),
        dtype=tf.float32,
    )
    return (states, actions, rewards, next_states, done_vals) # all of the above is done to take the tuple columns and make them tensorflow tensors
                                                              # so that they're easier to work with. This returns the tensors as a group of separate items.
    
def update_epsilon(epsilon):
    # Reduce epsilon by a fraction during each iteration, to a minimum of
    # the epsilon floor
    return max(epsilon*EPSILON_DECAY,EPSILON_FLOOR)

def get_action_epsilon(qvals,epsilon): # Get lowest-cost action, or pick randomly with epsilon% chance
    if (random.random() < epsilon):
        # pick random action
        choice = random.randint(0,qvals.numpy().shape[0])
        return choice # assumes action takes form of a number. # possible alternative from Coursera: return random.choice(np.arange(4))
    else:
        # pick best action (according to DQN)
        choice = np.argmax(qvals.numpy()[0]) # Take the first of the "max rewards in list" array generated ny np.argmax. The tensorflow tensor that is qvals needs to be converted first.
        

    

In [15]:
# Compute and apply gradients function
# mostly taken from coursera ml spec course
@tf.function
def agent_learn(experiences,gamma):
    
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences,gamma,q_network)
    
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    optimizer.apply_gradients(zip(gradients,q_network.trainable_variables))

In [16]:
# state = tf.convert_to_tensor(state)
state = env.reset()
# type(state)
# print(state)
# type(state[0])
# print(state[0])

In [None]:
# without any of the history stuff to start

memory_buffer = deque(maxlen=MEMORY_SIZE) # Deques with a max_len remove their first elements
                                          # (they're "pushed out") if at their max length when new
                                          # elements are added to the end. This makes it a natural structure
                                          # for a fixed buffer we're drawing from.
num_episodes = 2000
max_turns = 1000 # Will vary heavily based on what the sim is
                 # max-turns here lines up nicely with memory_size
stop_at_score = 2

total_point_history = []
    
num_p_av = 200
epsilon = 1.0



for i in range(num_episodes):
    state = env.reset(
#         options={"randomize":True}
    )
    state = state[0]
    total_points = 0 # history stuff
    for t in range(max_turns):
        # Choose action A (with epsilon-greedy policy) based on current state
        state_qn = np.expand_dims(state,axis=0) # State needs to be of expected shape
#         print("expanded state", state_qn)
#         state_qn = state
        q_values = q_network(state_qn)
#         print("q_values",q_values)
        
        action = get_action_epsilon(q_values, epsilon) #!EA make function
#         print(env.step(action))
        next_state, reward, done, _, _ = env.step(action)
        
        memory_buffer.append(experience(state,action,reward,next_state,done))
#         print("t and numsteps",t,NUM_STEPS_FOR_UPDATE)
        t1 = t+1
        update = update_if_conditions(t1,NUM_STEPS_FOR_UPDATE, memory_buffer) #!EA
        
        if update:
            # Sample random mini-batch of experience tuples from memory buffer
            experiences = sample_experiences(memory_buffer) #!EA
            
            # ...And learn from 'em. Set y-targets based on estimates and update
            # DQN weights
            agent_learn(experiences,GAMMA)
            
        state = next_state.copy() # Since the state is a list, this should be resistant to api changes
        total_points += reward
#         print("made it through a loop!")
        # Keeps iterating continuously in the same situation, unless sim is resolved
        if done:
            break
    total_point_history.append(total_points) # note no assignment here, because .append changes the object which is stored in memory, not some temporary value. No assignment is needed.
    av_latest_points = np.mean(total_point_history[-num_p_av:]) # colon here means "everything after num_p_av from the left"
    
            
    epsilon = update_epsilon(epsilon) #!EA make function
    
    print(f"\rEpisode {i+1}: total point average over last {num_p_av} episodes: {av_latest_points}",end="")
    
    if ((i % 10) == 0):        
        # Print permenantly; things after this will be on a new line. This gives some sort of history/context as the algorithm runs.
        print(f"\rEpisode {i+1}: total point average over last {num_p_av} episodes: {av_latest_points:.2f}")
        
    if av_latest_points >= stop_at_score:
        q_network.save('reinforcement_learning.h5')
        break
    

        
    
        
        
            
            
        


Episode 250: total point average over last 200 episodes: -19.436710420579687

In [None]:
# To get network to make decisions, call network on state and pick best action
def decide(state):
    expanded = np.expand_dims(state,axis=0) # change state into something tf can work with
    decision = q_network(expanded)         # lay out Q values for each choice
    return np.argmax(decision.numpy()[0]) # picks choice with highest Q.



In [23]:
# Make imageio shut up
import logging
logging.getLogger().setLevel(logging.ERROR)

filename = "./car_racing.mp4"
def create_video(): # Taken from coursera ML specialization by Andrew Ng
    with imageio.get_writer(filename, fps=60) as video:
            done = False
            state = env.reset()
            state = state[0]
            frame = env.render()
            video.append_data(frame)
            while not done:
                state = np.expand_dims(state, axis=0)
                q_values = q_network(state)
                action = np.argmax(q_values.numpy()[0])
                state, _, done, _ = env.step(action)
                frame = env.render()
                video.append_data(frame)

In [19]:
q_network.save('reinforcement_learning.h5')



In [25]:
create_video()

ValueError: append_data requires ndarray as first arg