In [1]:
import numpy as np

In [2]:
# Define the states
location_to_state = {
    'L1': 0,
    'L2': 1,
    'L3': 2,
    'L4': 3,
    'L5': 4,
    'L6': 5,
    'L7': 6,
    'L8': 7,
    'L9': 8
}

In [3]:
# Define the actions
actions = [0,1,2,3,4,5,6,7,8]

In [4]:
rewards = np.array([[0,1,0,0,0,0,0,0,0],
                   [1,0,1,0,1,0,0,0,0],
                   [0,1,0,0,0,1,0,0,0],
                   [0,0,0,0,0,0,1,0,0],
                   [0,1,0,0,0,0,0,1,0],
                   [0,0,1,0,0,0,0,0,0],
                   [0,0,0,1,0,0,0,1,0],
                   [0,0,0,0,1,0,1,0,1],
                   [0,0,0,0,0,0,0,1,0]])

In [6]:
state_to_location = dict((state,location) for location,state in location_to_state.items())

In [7]:
state_to_location

{0: 'L1',
 1: 'L2',
 2: 'L3',
 3: 'L4',
 4: 'L5',
 5: 'L6',
 6: 'L7',
 7: 'L8',
 8: 'L9'}

In [8]:
# Initialize parameters
gamma = 0.75 # Discount factor
alpha = 0.9 # Learning rate

In [9]:
class QAgent():
    # Initialize alpha, gamma, states, actions, rewards, and Q-values
    def __init__(self, alpha, gamma, location_to_state, actions, rewards, state_to_location):
        self.gamma = gamma
        self.alpha = alpha
        
        self.location_to_state = location_to_state
        self.actions = actions
        self.rewards = rewards
        self.state_to_location = state_to_location
        
        M = len(location_to_state)
        self.Q = np.zeros((M,M), dtype = None, order = 'C')
        
    def training(self, start_location, end_location, iterations):
        rewards_new = np.copy(self.rewards)
        ending_state = self.location_to_state[end_location]
        rewards_new[ending_state, ending_state] = 999
        
        # picking a random current state
        for i in range(iterations):
            current_state = np.random.randint(0,9)
            playable_actions = []
            
            # iterate through the rewards matrix to get the states
            # directly reachable from the randomly chosen current state
            # assign those state in a list named playable_actions.
            for j in range(9):
                if rewards_new[current_state,j] > 0:
                    playable_actions.append(j)
                    
            # choosing a random next state
            next_state = np.random.choice(playable_actions)
            
            # finding temporal difference
            TD = rewards_new[current_state, next_state] + \
                self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - self.Q[current_state]
            
            self.Q[current_state, next_state] += self.alpha * TD
            
        route = [start_location]
        next_location = start_location
        
        # Get the route
        self.get_optimal_route(start_location, end_location, next_location, route, self.Q)
        
    # get the optimal route
    def get_optimal_route(self, start_location, end_location, next_location, route, Q):
        while(next_location != end_location):
            starting_state = self.location_to_state[start_location]
            next_state = np.argmax(Q[starting_state,])
            next_location = self.state_to_location[next_state]
            route.append(next_location)
            start_location = next_location
            
        print(route)
    

In [10]:
qagent = QAgent(alpha, gamma, location_to_state, actions, rewards, state_to_location)
qagent.training('L9', 'L1', 1000)

ValueError: setting an array element with a sequence.