In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import random
import numpy as np
import seaborn as sns
from sklearn.utils import shuffle
from tqdm import tqdm
import pandas as pd
import math

import warnings
warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

# DATA

In [None]:
states = pd.read_csv('data_for_sarsa_lambda.csv')
rewards = states['rewards'].tolist()
actions = states['actions'].tolist()
states.drop(['rewards','actions'], axis=1, inplace=True)

# TRAJECTORIES

In [12]:
T = []
for s, a, r in zip(states.iterrows(), actions, rewards):
    t = []
    idx, row = s
    row_np = np.array(row.values)
    if idx % 18 == 0:
        is_end_of_traj = True
        if idx == 0:
            is_end_of_traj = False
    t.append(row_np) # state
    t.append(a) # action
    if is_end_of_traj and (r == 1): # is they are at state 19 and reward is 1, they made it to the end successfully
        t.append(10) # goal state reward
    else:
        t.append(r) # otherwise append 0
    if is_end_of_traj:
        t.append(row_np) # append current state
        t.append(True)
    else:
        t.append(states.loc[idx+1,:].values) # next state
        t.append(False)
    is_end_of_traj = False
    # print(t)
    T.append(t)

KeyError: 114000

<IPython.core.display.Javascript object>

# indicate DONE for 0 or -10 reward

In [None]:
for idx, t in enumerate(T):
    if (t[2] == 0) or (t[2] == -10):
        T[idx][4] = True

# TILES

In [62]:
class StateActionFeatureVectorWithTile:
    def __init__(
        self,
        state_low: np.array,
        state_high: np.array,
        num_actions: int,
        num_tilings: int,
        tile_width: np.array,
    ):
        """
        state_low: possible minimum value for each dimension in state
        state_high: possible maximum value for each dimension in state
        num_actions: the number of possible actions
        num_tilings: # tilings
        tile_width: tile width for each dimension
        """
        # TODO: implement here
        self.state_low = state_low
        self.state_high = state_high
        self.num_actions = num_actions
        self.num_tilings = num_tilings
        self.tile_width = tile_width
        
        self.tiles = []
        self.offset = []
        self.tiles = [math.ceil((state_high[i] - state_low[i]) / tile_width[i]) + 1 for i in range(len(self.tile_width))]
        self.tiles.append(self.num_actions)
        self.offset = [(state_low - (i / num_tilings) * tile_width) for i in range(self.num_tilings)]
        self.weight = np.zeros(np.append(self.num_tilings, self.tiles))

    def feature_vector_len(self) -> int:
        """
        return dimension of feature_vector: d = num_actions * num_tilings * num_tiles
        """
        return np.append(self.num_tilings, self.tiles)

    def __call__(self, s, a) -> np.array:
        """
        implement function x: S+ x A -> [0,1]^d
        if done is True, then return 0^d
        """
        self.features = np.zeros(np.append(self.num_tilings, self.tiles))
        for i in range(self.num_tilings):
            index = np.floor((s - self.offset[i]) / self.tile_width)
            # print('index', index)
            u = int(index[0])
            # print('u', u)
            v = int(index[1])
            # print('v', v)
            w = int(index[2])
            x = int(index[3])
            y = int(index[4])
            z = int(index[5])
            self.features[i][u][v][w][x][y][z][a] = 1
        return self.features.flatten()

<IPython.core.display.Javascript object>

In [63]:
state_low = [15, 1, 0, 0, 0, 0, 0]
state_high = [100, 50, 1, 9, 5, 1, 160000]
tile_width = [17, 10, 0.2, 2, 1, 0.2, 35000]

X = StateActionFeatureVectorWithTile(
        [15, 1, 0, 0, 0, 0, 0],
        [100, 50, 1, 9, 5, 1, 160000],
        num_actions=6,
        num_tilings=10,
        tile_width=np.array([17, 10, 0.2, 2, 1, 0.2, 35000])
    )

<IPython.core.display.Javascript object>

In [60]:
w = np.zeros((X.feature_vector_len())).flatten()

<IPython.core.display.Javascript object>

In [64]:
gamma = 1.    # discount factor
lam = 0.5     # decay rate
alpha = 0.1   # step size

def epsilon_greedy_policy(s, w, epsilon=0.15):
        nA = 6
        Q = [np.matmul(w, X(s, a)) for a in range(nA)]

        if np.random.rand() < epsilon:
            return np.random.randint(nA)
        else:
            return np.argmax(Q)

done = T[0][4]
s = T[0][0]
# read s, done, w to e-greedy policy
a = epsilon_greedy_policy(s, w)

x = X(s,a)
z = np.zeros(X.feature_vector_len()).flatten()

# constitutes one episode
for i, t in tqdm(enumerate(T)):
    if done == False:
        s = t[0]
        # take action a, observe r, s -> s_prime, reward, done, _ = env.step(a)
        s_prime = t[3]
        reward = T[i+1][2] # done should be true on the terminal state
        done = T[i+1][4]
        # given s_prime, choose a_prime using w
        a_prime = epsilon_greedy_policy(s_prime, W)
        # calculate x_prime using tile coding function
        x_prime = X(s_prime, a_prime)
        # calculate current Q
        Q = np.dot(W.T, x)
        # calculate next Q
        Q_prime = np.dot(W.T, x_prime)
        # calculate delta
        delta = reward + gamma * Q_prime - Q
        # calculate z
        z = (gamma * lam * z) + x
        # calculate w
        W = W + alpha * delta * z

        x = x_prime
        a = a_prime
    if done == True:
        done = T[i+1][4]

113998it [21:18:33,  1.49it/s] 


IndexError: list index out of range

<IPython.core.display.Javascript object>

In [28]:
# w = pd.read_csv('weight_for_sarsa_lambda.csv').transpose().to_numpy()
# w = w.reshape(-1)

<IPython.core.display.Javascript object>