#### Setting up Functions for all Modules

In [1]:
import numpy as np


# Randomly initialize the environment. Litter is generated first and obstacles second, so an obstacle may
# overwrite a cell that already holds litter; this is acceptable because the grid and the numbers of
# litter and obstacle cells are large enough.
def init_env(r=8, c=16, rand=-1, n_litter=20, n_obstacle=20):
    """Build a random ``r`` x ``c`` grid world with the agent in column 0.

    Cells hold one-character markers: ``'0'`` empty, ``'$'`` litter,
    ``'x'`` obstacle and ``'a'`` agent.  Litter is written first, then
    obstacles, then the agent, so a later marker overwrites an earlier one
    that falls on the same cell.  Coordinates are drawn with replacement, so
    the grid can hold fewer litter/obstacle cells than requested.

    Args:
        r: number of rows.
        c: number of columns.
        rand: start row of the agent; ``-1`` (the default) draws it
            uniformly from ``[0, r)``.
        n_litter: number of litter coordinates to draw.
        n_obstacle: number of obstacle coordinates to draw.

    Returns:
        ``(environ, ar)`` -- the grid and the agent's start row.
    """
    environ = np.full((r, c), '0', dtype=str)
    rng = np.random.default_rng()

    # $ for litter
    lit_x = rng.integers(0, r, n_litter)
    lit_y = rng.integers(0, c, n_litter)
    environ[lit_x, lit_y] = '$'

    # x for obstacle (may overwrite litter)
    obs_x = rng.integers(0, r, n_obstacle)
    obs_y = rng.integers(0, c, n_obstacle)
    environ[obs_x, obs_y] = 'x'

    # a for agent; draw the row from [0, r) rather than a fixed [0, 8) so
    # grids with fewer than 8 rows never get an out-of-range start row.
    ac = 0
    ar = rng.integers(0, r) if rand == -1 else rand

    environ[ar, ac] = 'a'
    return environ, ar


# extract environment with only agent location, used for sidewalk and goal module learning
def env_extract(a_r, r=8, c=16):
    """Return an ``r`` x ``c`` grid of ``'0'`` cells with only the agent marked.

    The agent marker ``'a'`` is written at row ``a_r`` of column 0.
    """
    agent_only = np.full((r, c), '0', dtype=str)
    agent_only[a_r, 0] = 'a'
    return agent_only


# extract environment with only agent and litter locations, used for litter module learning
def env_extract_litter(env, a_r, r=8, c=16):
    """Return a fresh ``r`` x ``c`` grid holding only litter and the agent.

    Every ``'$'`` cell in the top-left ``r`` x ``c`` region of ``env`` is
    copied; all other cells are ``'0'``.  The agent marker ``'a'`` is then
    written at row ``a_r`` of column 0 (replacing litter there, if any).
    """
    litter_only = np.full((r, c), '0', dtype=str)
    litter_only[env[:r, :c] == '$'] = '$'
    litter_only[a_r, 0] = 'a'
    return litter_only


# extract environment with only agent and obstacle locations, used for obstacle module learning
def env_extract_obs(env, a_r, r=8, c=16):
    """Return a fresh ``r`` x ``c`` grid holding only obstacles and the agent.

    Every ``'x'`` cell in the top-left ``r`` x ``c`` region of ``env`` is
    copied; all other cells are ``'0'``.  The agent marker ``'a'`` is then
    written at row ``a_r`` of column 0 (replacing an obstacle there, if any).
    """
    obstacles_only = np.full((r, c), '0', dtype=str)
    obstacles_only[env[:r, :c] == 'x'] = 'x'
    obstacles_only[a_r, 0] = 'a'
    return obstacles_only


# Zero-initialised Q-tables, one per module: rows are states, the 4 columns are actions.
qtable_sidewalk = np.zeros(shape=(8, 4))      # state = agent row (8 rows)
qtable_goal = np.zeros(shape=(8 * 16, 4))     # state = one per cell of the 8 x 16 grid
qtable_obs = np.zeros(shape=(2 ** 6, 4))      # state = 6 binary cells (3 x 2 obstacle window)
qtable_lit = np.zeros(shape=(2 ** 15, 4))     # state = 15 binary cells (5 x 3 litter window)


# check if the row is within sidewalk
def is_sidewalk(r):
    """Return True when row ``r`` lies in the sidewalk band (rows 2 to 5 inclusive)."""
    first_row, end_row = 2, 6
    return first_row <= r and r < end_row


# Goal location as [row, column]; fixed at [5, 15].
# Read by is_goal() and get_distance().
goal = [5, 15]


# check if a grid is goal
def is_goal(r, c):
    """Return True when the cell ``(r, c)`` equals the module-level ``goal``."""
    position = [r, c]
    return position == goal


# check if a grid is obstacle
def is_obstacle(r, c, environ):
    """Return True when cell ``(r, c)`` of ``environ`` holds the obstacle marker ``'x'``."""
    cell = environ[r, c]
    return cell == 'x'


# check if a grid is litter
def is_litter(r, c, environ):
    """Return True when cell ``(r, c)`` of ``environ`` holds the litter marker ``'$'``."""
    cell = environ[r, c]
    return cell == '$'


# get the sum of row distance and column distance between a certain grid and the goal
def get_distance(r, c):
    """Return the Manhattan distance (row gap plus column gap) from ``(r, c)`` to ``goal``."""
    row_gap = abs(r - goal[0])
    col_gap = abs(c - goal[1])
    return row_gap + col_gap


# get corresponding state number of sidewalk module
def get_state_sd(r):
    """Return the sidewalk-module state number for row ``r`` (the row itself)."""
    state = r
    return state


# get corresponding state number of goal module
def get_state_goal(r, c, n_cols=16):
    """Return the goal-module state number of cell ``(r, c)``.

    Cells are numbered row by row, so the state is ``r * n_cols + c``.

    Args:
        r: row of the cell.
        c: column of the cell.
        n_cols: number of columns in the grid (defaults to the 16-column
            grid used throughout this file).

    Returns:
        The row-major index of the cell.
    """
    return r * n_cols + c


# get state number from the encoded binary list for obstacle module
def get_state_obs_num(ls, len=6):
    """Decode the first ``len`` entries of ``ls`` as a big-endian binary number.

    ``ls[0]`` is the most significant bit.  Raises ``IndexError`` when ``ls``
    has fewer than ``len`` entries.
    """
    # NOTE: the parameter name ``len`` hides the builtin inside this function.
    state = 0
    for position in range(len):
        # Shift the bits gathered so far and append the next one.
        state = state * 2 + ls[position]
    return state


# get corresponding state number of obstacle module
def get_state_obs(environ, ar, ac):
    """Return the obstacle-module state number for an agent at ``(ar, ac)``.

    The state encodes a 3-row by 2-column window: rows ``ar - 1`` to
    ``ar + 1`` and columns ``ac`` to ``ac + 1``, scanned row by row.  A cell
    contributes 1 when it is on the grid and holds ``'x'``, otherwise 0.
    """
    window = [(i, j) for i in range(ar - 1, ar + 2) for j in range(ac, ac + 2)]
    bits = [0 if out_bound(i, j) else int(environ[i, j] == 'x') for i, j in window]
    return get_state_obs_num(bits)


# get state number from the encoded binary list for litter module
def get_state_lit_num(ls, len=15):
    """Decode the first ``len`` entries of ``ls`` as a big-endian binary number.

    ``ls[0]`` is the most significant bit.  Raises ``IndexError`` when ``ls``
    has fewer than ``len`` entries.
    """
    # NOTE: the parameter name ``len`` hides the builtin inside this function.
    state = 0
    for position in range(len):
        # Shift the bits gathered so far and append the next one.
        state = state * 2 + ls[position]
    return state


# get corresponding state number of litter module
def get_state_lit(environ, ar, ac):
    """Return the litter-module state number for an agent at ``(ar, ac)``.

    The state encodes a 5-row by 3-column window: rows ``ar - 2`` to
    ``ar + 2`` and columns ``ac`` to ``ac + 2``, scanned row by row.  A cell
    contributes 1 when it is on the grid and holds ``'$'``, otherwise 0.
    """
    window = [(i, j) for i in range(ar - 2, ar + 3) for j in range(ac, ac + 3)]
    bits = [0 if out_bound(i, j) else int(environ[i, j] == '$') for i, j in window]
    return get_state_lit_num(bits)


# check if a location is out of boundary
def out_bound(r, c, n_rows=8, n_cols=16):
    """Return True when cell ``(r, c)`` lies outside the grid.

    Args:
        r: row index to test.
        c: column index to test.
        n_rows: number of grid rows (defaults to the 8-row grid used in
            this file).
        n_cols: number of grid columns (defaults to 16).

    Returns:
        True if ``r`` is not in ``[0, n_rows - 1]`` or ``c`` is not in
        ``[0, n_cols - 1]``.
    """
    return r < 0 or r > n_rows - 1 or c < 0 or c > n_cols - 1


# Setup reward function for each module

def get_reward_sd(r):
    """Return the sidewalk-module reward for row ``r``: 30 on the sidewalk, -500 off it."""
    if is_sidewalk(r):
        return 30
    return -500


def get_reward_goal(r, c, new_r, new_c):
    """Return the goal-module reward for stepping from ``(r, c)`` towards ``(new_r, new_c)``.

    * 100000 when the current cell ``(r, c)`` is already at distance 0 from the goal;
    * 200 when the new cell is strictly closer to the goal than the current one;
    * -400 otherwise.
    """
    # NOTE(review): the 100000 bonus keys on the current cell, not the new
    # one -- confirm callers ever invoke this while standing on the goal.
    current = get_distance(r, c)
    if current == 0:
        return 100000

    progress = current - get_distance(new_r, new_c)
    return -400 if progress <= 0 else 200


def get_reward_obs(environ, r, c):
    """Return the obstacle-module reward for targeting cell ``(r, c)``.

    -5000 when the cell is on the grid and holds ``'x'``; 5 otherwise
    (including when the cell is off the grid).
    """
    hits_obstacle = not out_bound(r, c) and environ[r, c] == 'x'
    return -5000 if hits_obstacle else 5


def get_reward_lit(environ, r, c):
    """Return the litter-module reward for targeting cell ``(r, c)``.

    500 when the cell is on the grid and holds ``'$'``; -1 otherwise
    (including when the cell is off the grid).
    """
    finds_litter = not out_bound(r, c) and environ[r, c] == '$'
    return 500 if finds_litter else -1


# Action number -> [row delta, column delta] for the 4 moves:
# 0 = up, 1 = right, 2 = down, 3 = left.
direct = {0: [-1, 0], 1: [0, 1], 2: [1, 0], 3: [0, -1]}


#### Sidewalk Module Learning

In [2]:
# Q-learning hyper-parameters for the sidewalk module
num_episodes = 4000  # total number of episodes in learning
max_step = 200  # the max number of steps within each episode
mu = 0.9  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.5  # probability of taking the random (exploratory) branch

for num in range(num_episodes):
    # fresh random start row; the sidewalk module only needs the agent position
    env, a_r = init_env()
    environ_sd = env_extract(a_r)
    a_c = 0

    loc = [[a_r, a_c]]
    s = new_s = get_state_sd(a_r)

    for j in range(max_step):
        # With probability epsilon pick uniformly among the actions whose
        # Q-value is >= 0; otherwise (or if there is none) take the argmax.
        act = None
        if np.random.rand() < epsilon:
            candidates = np.where(qtable_sidewalk[s, :] >= 0)[0]
            if candidates.size != 0:
                act = candidates[np.random.randint(candidates.size)]
        if act is None:
            act = np.argmax(qtable_sidewalk[s, :])

        # target cell of the chosen move; the reward is based on the target row
        d_row, d_col = direct[act]
        row, col = a_r + d_row, a_c + d_col
        r = get_reward_sd(row)

        # perform action: a move that would leave the grid keeps the agent
        # (and new_s) where it is
        if not out_bound(row, col):
            environ_sd[a_r, a_c] = '0'
            environ_sd[row, col] = 'a'
            new_s = get_state_sd(row)
            a_r, a_c = row, col

        # update with Q-learning
        loc.append([a_r, a_c])
        td_target = r + gamma * max(qtable_sidewalk[new_s, :])
        qtable_sidewalk[s, act] += mu * (td_target - qtable_sidewalk[s, act])
        s = new_s

    # print environment and agent's trajectory in certain episode numbers
    if num in (0, num_episodes // 2, num_episodes - 1):
        print(environ_sd)
        print(loc)
        

[['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' 'a' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']]
[[2, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [2, 3], [3, 3], [4, 3], [4, 2], [5, 2], [5, 1], [5, 0], [5, 0], [4, 0], [5, 0], [6, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 1], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [4, 0], [5, 0], [5, 0], [5, 1], [5, 0], [5, 0], [5, 0], [4, 0], [5, 0], [5, 0], [5, 0], [4, 0], [3, 0], [4, 0], [5, 0], [5, 1], [5, 2], [5, 3], [5, 4], [5, 3], [5, 2], [5, 3], [4, 3], [4, 2], [5, 2]

###### Print Sidewalk Module Q-table 

In [3]:
# Show the sidewalk Q-table: one row per state (agent row), one column per action.
print('q_sd:\n', qtable_sidewalk)

q_sd:
 [[-859.5        -859.5        -230.         -450.        ]
 [-450.         -450.          300.         -207.22114766]
 [-450.          300.          300.          300.        ]
 [ 300.          300.          300.          300.        ]
 [ 300.          300.          300.          300.        ]
 [ 300.          300.         -450.          300.        ]
 [ 300.         -207.00017263 -634.43979653 -207.17262538]
 [-230.         -450.         -450.         -450.        ]]


#### Goal Module Learning

In [9]:
# Q-learning hyper-parameters for the goal module
num_episodes = 4000  # total number of episodes in learning
max_step = 50  # the max number of steps within each episode
mu = 0.8  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.01  # probability of taking the random (exploratory) branch

for num in range(num_episodes):
    # fresh random start row; the goal module only needs the agent position
    env, a_r = init_env()
    env_goal = env_extract(a_r)
    a_c = 0

    loc = [[a_r, a_c]]
    s = new_s = get_state_goal(a_r, a_c)

    for j in range(max_step):
        # the episode ends as soon as the agent stands on the goal
        if is_goal(a_r, a_c):
            break

        # With probability epsilon pick uniformly among the actions whose
        # Q-value is > 0; otherwise (or if there is none) take the argmax.
        act = None
        if np.random.rand() < epsilon:
            candidates = np.where(qtable_goal[s, :] > 0)[0]
            if candidates.size != 0:
                act = candidates[np.random.randint(candidates.size)]
        if act is None:
            act = np.argmax(qtable_goal[s, :])

        # target cell of the chosen move; the reward compares distances to
        # the goal before and after the (attempted) move
        d_row, d_col = direct[act]
        row, col = a_r + d_row, a_c + d_col
        r = get_reward_goal(a_r, a_c, row, col)

        # perform action: a move that would leave the grid keeps the agent
        # (and new_s) where it is
        if not out_bound(row, col):
            env_goal[a_r, a_c] = '0'
            env_goal[row, col] = 'a'
            new_s = get_state_goal(row, col)
            a_r, a_c = row, col

        # update with Q-learning
        loc.append([a_r, a_c])
        td_target = r + gamma * max(qtable_goal[new_s, :])
        qtable_goal[s, act] += mu * (td_target - qtable_goal[s, act])
        s = new_s

    # print environment and agent's trajectory in certain episode numbers
    if num in (0, num_episodes // 2, num_episodes - 1):
        print(env_goal)
        print(loc)


[['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'a']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']]
[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [0, 9], [0, 10], [0, 11], [0, 12], [0, 13], [0, 14], [0, 15], [1, 15], [2, 15], [3, 15], [4, 15], [5, 15]]
[['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '

###### Examine Rows of Goal Module Q-table

In [10]:
# Print the goal-module Q-values (one entry per action) at a few chosen cells.
for label, cell_r, cell_c in (
    ('Q-values at [2,0]: ', 2, 0),
    ('Q-values at [5,0]: ', 5, 0),
    ('Q-values at [7,0]: ', 7, 0),
    ('Q-values at [3,10]:', 3, 10),
    ('Q-values at [5,14]:', 5, 14),
    ('Q-values at [4,15]:', 4, 15),
):
    print(label, qtable_goal[get_state_goal(cell_r, cell_c), :])

Q-values at [2,0]:  [-320.         1699.81072941    0.            0.        ]
Q-values at [5,0]:  [-320.         1180.45636601    0.            0.        ]
Q-values at [7,0]:  [1336.16965647    0.            0.            0.        ]
Q-values at [3,10]: [-320.        831.499022    0.          0.      ]
Q-values at [5,14]: [0. 0. 0. 0.]
Q-values at [4,15]: [-204.8 -320.   200.     0. ]


#### Obstacle Module Learning

In [11]:
# Q-learning hyper-parameters for the obstacle module
num_episodes = 4000  # total number of episodes in learning
max_step = 200  # the max number of steps within each episode
mu = 0.8  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.33  # probability of taking the random (exploratory) branch

for num in range(num_episodes):
    # fresh random world; keep one grid for the agent and one for obstacles
    env, a_r = init_env()
    environ_agent = env_extract(a_r)
    environ_obs = env_extract_obs(env, a_r)
    a_c = 0

    loc = [[a_r, a_c]]
    s = new_s = get_state_obs(environ_obs, a_r, a_c)

    for j in range(max_step):
        # With probability epsilon pick uniformly among the actions whose
        # Q-value is > -50; otherwise (or if there is none) take the argmax.
        act = None
        if np.random.rand() < epsilon:
            candidates = np.where(qtable_obs[s, :] > -50)[0]
            if candidates.size != 0:
                act = candidates[np.random.randint(candidates.size)]
        if act is None:
            act = np.argmax(qtable_obs[s, :])

        # target cell of the chosen move; the reward depends on whether that
        # cell holds an obstacle
        d_row, d_col = direct[act]
        row, col = a_r + d_row, a_c + d_col
        r = get_reward_obs(environ_obs, row, col)

        # perform action: a move that would leave the grid keeps the agent
        # (and new_s) where it is
        if not out_bound(row, col):
            environ_agent[a_r, a_c] = '0'
            environ_agent[row, col] = 'a'
            new_s = get_state_obs(environ_obs, row, col)
            a_r, a_c = row, col

        # update with Q-learning
        loc.append([a_r, a_c])
        td_target = r + gamma * max(qtable_obs[new_s, :])
        qtable_obs[s, act] += mu * (td_target - qtable_obs[s, act])
        s = new_s

    # print environment and agent's trajectory in certain episode numbers
    if num in (0, num_episodes // 2, num_episodes - 1):
        # rebuild a display grid: obstacles plus the agent's final position
        e_r, e_c = 8, 16
        e = np.full((e_r, e_c), '0', dtype=str)
        e[environ_obs[:e_r, :e_c] == 'x'] = 'x'

        e[a_r, a_c] = 'a'
        print(e)
        print(loc)


[['0' 'x' 'x' '0' '0' '0' '0' '0' '0' 'x' '0' '0' '0' '0' 'a' 'x']
 ['0' '0' '0' '0' '0' 'x' '0' 'x' '0' '0' '0' 'x' '0' '0' 'x' 'x']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'x' '0' '0' '0' 'x' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' 'x' '0' '0' '0' '0' '0' 'x' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'x' '0']
 ['x' '0' 'x' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'x' 'x' 'x' '0' '0']]
[[5, 0], [4, 0], [3, 0], [2, 0], [1, 0], [1, 1], [1, 2], [1, 3], [0, 3], [0, 3], [0, 4], [0, 4], [0, 4], [0, 4], [1, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [0, 4], [1, 4], [0, 4], [0, 4], [1, 4], [0, 4], [0, 4], [0, 4], [0, 5], [0, 5], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 6], [1, 6], [0, 6], [0, 6], [0, 6], [0, 6], [0, 7], [0, 8], [1, 8], [0, 8], [0, 8]

In [12]:
# Print a few example 3 x 2 local windows of the obstacle module ('x' = obstacle),
# each followed by a blank line.
example_windows = (
    [['0', 'x'], ['0', '0'], ['0', 'x']],
    [['x', 'x'], ['0', '0'], ['0', 'x']],
    [['0', '0'], ['0', 'x'], ['0', '0']],
    [['x', '0'], ['0', 'x'], ['x', 'x']],
    [['x', '0'], ['x', '0'], ['x', '0']],
    [['x', 'x'], ['x', 'x'], ['x', 'x']],
)
for window in example_windows:
    print(np.array(window))
    print()

[['0' 'x']
 ['0' '0']
 ['0' 'x']]

[['x' 'x']
 ['0' '0']
 ['0' 'x']]

[['0' '0']
 ['0' 'x']
 ['0' '0']]

[['x' '0']
 ['0' 'x']
 ['x' 'x']]

[['x' '0']
 ['x' '0']
 ['x' '0']]

[['x' 'x']
 ['x' 'x']
 ['x' 'x']]



In [13]:
# Print the obstacle-module state number and Q-table row for a few local windows.
# Each tuple lists the six window cells row by row (1 = obstacle); read as a
# binary number with the first cell as the most significant bit, it gives the
# state number used to index qtable_obs.
print()
a = (0,1,0,0,0,1)
print(a,"--> 17:\n", qtable_obs[17,:])
print('\n')
b = (1,1,0,0,0,1)
print(b,"--> 49:\n", qtable_obs[49,:])
print('\n')
c = (0,0,0,1,0,0)
print(c,"--> 4:\n", qtable_obs[4,:])
print('\n')
d = (1,0,0,1,1,1)
print(d,"--> 39:\n", qtable_obs[39,:])
print('\n')
e = (1,0,1,0,1,0)
print(e,"--> 42:\n", qtable_obs[42,:])
print('\n')
f = (1,1,1,1,1,1)
# All six bits set is state 63 (the row actually indexed here), so the label
# says 63 rather than 64.
print(f,"--> 63:\n", qtable_obs[63,:])
print()


(0, 1, 0, 0, 0, 1) --> 17:
 [   50.     50.     50.  -3961.2]


(1, 1, 0, 0, 0, 1) --> 49:
 [-4000.            50.            50.         -3981.57908404]


(0, 0, 0, 1, 0, 0) --> 4:
 [   50.         -4000.            50.         -3961.34693238]


(1, 0, 0, 1, 1, 1) --> 39:
 [-4000.         -3967.18947019 -3971.2           50.        ]


(1, 0, 1, 0, 1, 0) --> 42:
 [-3964.41192249    42.21312    -3971.2            0.        ]


(1, 1, 1, 1, 1, 1) --> 64:
 [0. 0. 0. 0.]



#### Litter Module Learning

In [32]:
# Q-learning hyper-parameters for the litter module
num_episodes = 6000  # total number of episodes in learning
max_step = 200  # the max number of steps within each episode
mu = 0.9  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.3  # probability of taking the random (exploratory) branch

for num in range(num_episodes):
    # fresh random world reduced to litter plus the agent
    env, a_r = init_env()
    environ_lit = env_extract_litter(env, a_r)
    a_c = 0

    # examine a certain environment in the last episode
    if num == num_episodes - 1:
        a_r = 2
        # one string per grid row, one character per cell
        environ_lit = np.array([list(cells) for cells in (
            '$000000$00000000',
            '0$000000000$$000',
            'a00000$$$00000$0',
            '000$0000$0000$00',
            '0000000$00$$0000',
            '00000000$00000$0',
            '0000000000000000',
            '000000000000000$',
        )])
        print(environ_lit)

    loc = [[a_r, a_c]]
    s = new_s = get_state_lit(environ_lit, a_r, a_c)
    lit_num = 0

    for j in range(max_step):
        # With probability epsilon pick uniformly among the actions whose
        # Q-value is >= -10; otherwise (or if there is none) take the argmax.
        act = None
        if np.random.rand() < epsilon:
            candidates = np.where(qtable_lit[s, :] >= -10)[0]
            if candidates.size != 0:
                act = candidates[np.random.randint(candidates.size)]
        if act is None:
            act = np.argmax(qtable_lit[s, :])

        # target cell of the chosen move; the reward is read before the move
        # so litter on the target cell is still present
        d_row, d_col = direct[act]
        row, col = a_r + d_row, a_c + d_col
        r = get_reward_lit(environ_lit, row, col)

        # perform action: a move that would leave the grid keeps the agent
        # (and new_s) where it is
        if not out_bound(row, col):
            environ_lit[a_r, a_c] = '0'
            # count the litter before the agent marker overwrites it
            if is_litter(row, col, environ_lit):
                lit_num += 1
            environ_lit[row, col] = 'a'
            new_s = get_state_lit(environ_lit, row, col)
            a_r, a_c = row, col

        # update with Q-learning
        loc.append([a_r, a_c])
        td_target = r + gamma * max(qtable_lit[new_s, :])
        qtable_lit[s, act] += mu * (td_target - qtable_lit[s, act])
        s = new_s

    # print environment, number of litters picked up, and agent's trajectory in certain episode numbers
    if num in (0, num_episodes // 2, num_episodes - 1):
        print(environ_lit)
        print('litter picked up:', lit_num)
        print(loc)



[['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '$' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '$' '$' '0' '$' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '$' '0' '0' '$' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' 'a' '0' '0' '0']
 ['0' '0' '$' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '$' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']]
litter picked up: 7
[[2, 0], [2, 0], [1, 0], [0, 0], [0, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 2], [0, 2], [0, 3], [1, 3], [2, 3], [2, 4], [2, 5], [3, 5], [2, 5], [2, 4], [2, 5], [2, 6], [3, 6], [3, 7], [4, 7], [4, 8], [5, 8], [5, 7], [4, 7], [4, 8], [5, 8], [5, 7], [5, 8], [4, 8], [5, 8], [5, 9], [5, 10], [5, 9], [5, 10], [5, 9], [5, 10], [5, 9], [5, 10], [5, 11], [5, 10], [6, 10], [7, 10], [6, 10], [6, 11], [6, 12], [5, 12], [6, 12], [6, 13], [6, 14], [7,

In [33]:
# Print the litter-module Q-table row for a few state numbers, each followed by a blank line.
for state in (17408, 128, 1, 0):
    print(f"{state}:\n", qtable_lit[state, :])
    print()

17408:
 [1781.1611148   978.77894189 1217.54158827 1305.17441914]

128:
 [1033.58061145 1251.89254962  920.15501735  969.60627209]

1:
 [ 786.8096416   808.45656036 1202.57500846  792.48011755]

0:
 [ 532.45024762  563.13651379   18.76553255 1020.72930765]



#### Final Combined Module

In [62]:
# Randomly generate an environment with the agent at [2,0], clear the goal
# cell [5,15], and print it.
env_test, a_r = init_env(rand=2)
env_test[5, 15] = '0'
a_c = 0
print('env_test:\n', env_test)
env_test_agent = env_extract(a_r)

# walking
num_steps = 60
loc = [[a_r, a_c]]
for i in range(num_steps):
    # stop once the agent stands on the goal
    if [a_r, a_c] == goal:
        break

    # state number of the current position in each module
    s_sd = get_state_sd(a_r)
    s_goal = get_state_goal(a_r, a_c)
    s_obs = get_state_obs(env_test, a_r, a_c)
    s_lit = get_state_lit(env_test, a_r, a_c)

    # matching Q-table row (one value per action) from each module
    q_sd = qtable_sidewalk[s_sd, :]
    q_goal = qtable_goal[s_goal, :]
    q_obs = qtable_obs[s_obs, :]
    q_lit = qtable_lit[s_lit, :]

    # Combine the rows with a weighted sum.  The weights favour litter for
    # the first 41 steps (i <= 40) and the goal afterwards; there are many
    # other possible ways to combine or set a threshold here.
    if i <= 40:
        w_sd, w_obs, w_lit, w_goal = 3, 1.5, 8, 1
    else:
        w_sd, w_obs, w_lit, w_goal = 1, 0.5, 1, 5
    q_combined = w_sd * q_sd + w_obs * q_obs + w_lit * q_lit + w_goal * q_goal

    # act: take the best combined action; a move off the grid is ignored
    act = np.argmax(q_combined)
    d_row, d_col = direct[act]
    new_r, new_c = a_r + d_row, a_c + d_col

    if not out_bound(new_r, new_c):
        a_r, a_c = new_r, new_c
    # pick up litter on the cell the agent now occupies
    if env_test[a_r, a_c] == '$':
        env_test[a_r, a_c] = '0'

    loc.append([a_r, a_c])


# print trajectory
print(loc)

# print the resulting environment: remaining obstacles and litter plus the
# agent's final position
env_final = np.full((8, 16), '0', dtype=str)
for marker in ('x', '$'):
    env_final[env_test == marker] = marker

env_final[a_r, a_c] = 'a'
print(env_final)

env_test:
 [['0' '0' 'x' '0' 'x' '0' 'x' '0' 'x' '0' 'x' '0' '0' '$' '0' '0']
 ['0' '0' 'x' 'x' '0' 'x' '0' '0' '0' '0' '$' '0' '0' '0' 'x' '$']
 ['a' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['0' '0' '0' '$' '0' '0' '0' '$' 'x' '0' 'x' '0' 'x' '0' '$' '0']
 ['0' '0' '0' '0' '$' '0' '0' '0' '0' '0' '$' '0' '0' '0' '0' '0']
 ['x' '0' '$' '0' '0' '0' 'x' '0' '0' '0' '0' '$' 'x' '$' '0' '0']
 ['x' '0' 'x' '$' 'x' 'x' '0' '0' '0' '0' '$' '0' '0' '0' '0' '0']
 ['0' '0' '0' '0' '0' 'x' '0' '0' '0' '0' '$' '$' '0' '0' '0' '0']]
[[2, 0], [2, 1], [2, 2], [2, 3], [3, 3], [3, 4], [4, 4], [4, 5], [4, 6], [4, 7], [3, 7], [2, 7], [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14], [3, 14], [4, 14], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [4, 15], [5, 15]]
[['0' '0' 'x' '0' 'x' '0' 'x' '0' 'x' '0' 'x' '0' '0' '$' '0' '0']
 ['0' '0' '