In [1]:
from nomadic_trucker_new import nomadictrucker
import numpy as np
from random import randint
import scipy
from collections import defaultdict
import itertools
import pickle
import math
import time

In [2]:
def get_next_action(Q, state, epsilon):

    actions = model['all_actions']
    best_action = np.argmax(Q[state]) # best action with highest Q value
    random_action = np.random.choice(actions)
    next_action = np.random.choice([best_action, random_action], p = [(1-epsilon),epsilon])

    return next_action

def qLearning(num_episodes, discount_factor = 0.95, alpha = 0.2, epsilon = 0.1):

    Q = defaultdict(lambda:np.zeros(len(model['all_actions'])))
    all_episode_rewards = np.zeros(num_episodes)
    all_episode_lengths = np.zeros(num_episodes)

    np.set_printoptions(precision = 2)
    visited_states=[]
    for ith_episode in range(num_episodes):
#         print('episode:', ith_episode)

        # Reset the environment and pick the first action
        state_index = model['get_first_state']()

        previous_state = state_index

        visited_states.append(state_index)

        loaded_moves = 0

        for t in itertools.count():

            #action = # get action
            action = get_next_action(Q, state_index, epsilon)

            load = model['get_state_exp'](state_index)[3]
            if load[action] == 1:
                loaded_moves += 1


            # take action and get reward, transit to next state
            next_truck_location = model['get_actual_next_grid_state'](action, state_index)[0]
            next_day = model['get_actual_next_grid_state'](action, state_index)[1]
            next_trailer_type = model['get_actual_next_grid_state'](action,state_index)[2]
            new_load = model['get_actual_next_grid_state'](action, state_index)[3]
            next_state = model['get_numeric_state_nomadic_trucker'](next_truck_location, next_day, next_trailer_type, new_load)
            #print(new_load)
            reward = model['get_rewards'](state_index, action)

            # Update statistics
            all_episode_rewards[ith_episode] += reward
            all_episode_lengths[ith_episode] = t

            # TD Update

            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * (Q[next_state][best_next_action])
            td_delta = td_target - (Q[state_index][action])
            (Q[state_index][action]) +=(alpha * td_delta)

            if loaded_moves == 10: # end after 100 days
#                 print("{}th episode is finished with delta: {}!".format(ith_episode, td_delta))
                break
            previous_state = state_index
            state_index = next_state
            if state_index not in visited_states:
                visited_states.append(state_index)
    return Q, all_episode_rewards, all_episode_lengths, visited_states

In [4]:
# possible states
s = [2, 3]

# possible days
d = [1, 3, 7]

# possible trailer types
t = [1, 3, 3]

In [5]:
for size in s:
    for days, trailers in zip(d,t):
        model = nomadictrucker(size,days,trailers)
        if size == 3:
            n = 10000
        else:
            n = 1000
        
        start = time.time()
        Q_val, all_episode_rewards, all_episode_lengths, visited_states = qLearning(n)
        end = time.time()
        print(n)
        print(str(len(model["states"])),"time:", (end - start))
        with open("visited_states_nomadic_"+str(size)+"_"+str(days)+"d_"+str(trailers)+"t.pickle","wb") as fp:
            pickle.dump(visited_states, fp)
        with open("Q_nomadic_"+str(size)+"_"+str(days)+"d_"+str(trailers)+"t.pickle","wb") as fp:
            pickle.dump(dict(Q_val), fp)
        print("done state space size:", str(len(model["states"])))

1000
64 time: 35.59246611595154
done state space size: 64
1000
576 time: 50.452276945114136
done state space size: 576
1000
1344 time: 69.52126502990723
done state space size: 1344
10000
4608 time: 389.6080241203308
done state space size: 4608
10000
41472 time: 539.1700129508972
done state space size: 41472
10000
96768 time: 773.6231310367584
done state space size: 96768
