In [1]:
import model
import numpy as np
import random
import tank
import truck


import matplotlib.pyplot as plt
from matplotlib import animation, rc
from IPython.display import HTML

import utilsq as ut

import time

## System initializer

In [2]:
def initialize_test_system():
    """
    Build the components of the toy tank/truck system used in this notebook.

    Returns
    -------
    tuple
        (tanks, trucks, graph, weights_matrix), ready to be passed to
        model.System.

    Side effects
    ------------
    (Re)binds the module-level names n (number of tanks), k (number of
    trucks), n_discrete_load_levels and n_discrete_load_levels_trucks.
    """
    global n, k, n_discrete_load_levels, n_discrete_load_levels_trucks

    # ---- Tanks -----------------------------------------------------------
    n = 5
    ids_of_tanks = list(range(1, n + 1))
    capacity_of_tanks = np.array([100., 100., 200., 300., 400.])
    # Every tank starts full
    # (alternative partially filled start: [50., 60., 120., 150., 300.]).
    load_of_tanks = capacity_of_tanks.copy()
    consumption_of_tanks = np.full(n, 5.)
    n_discrete_load_levels = np.array([4, 4, 4, 4, 4])

    # ---- Trucks ----------------------------------------------------------
    k = 2
    ids_of_trucks = list(range(k))
    capacity_of_trucks = np.array([20., 50.])
    load_of_trucks = capacity_of_trucks.copy()
    # All trucks start at node 5 -- presumably the depot; confirm against
    # ut.simple_graph.
    position_of_trucks = np.full(k, 5)
    # One row per truck: the fractions of its load that it may deliver
    # (half or all of it). Use [1.] per truck to force full deliveries only.
    deliverable_fractions = np.array([[0.5, 1.],
                                      [0.5, 1.]])
    n_discrete_load_levels_trucks = np.array([2, 2])

    # ---- System ----------------------------------------------------------
    graph = ut.simple_graph(n + 1)

    tanks = [tank.Tank(*tank_spec)
             for tank_spec in zip(ids_of_tanks, load_of_tanks, capacity_of_tanks,
                                  consumption_of_tanks, n_discrete_load_levels)]

    trucks = [truck.Truck(*truck_spec)
              for truck_spec in zip(ids_of_trucks, load_of_trucks, capacity_of_trucks,
                                    position_of_trucks, deliverable_fractions,
                                    n_discrete_load_levels_trucks)]

    # Weights handed to ut.simple_weights
    # (alternative non-uniform choice: [0, 20., 10., 30., 50.5, 45.]).
    w = np.array([0., 20., 20., 20., 20., 20.])
    weights_matrix = ut.simple_weights(n + 1, w)

    return tanks, trucks, graph, weights_matrix

# Build the toy system once so that the size of the problem can be inspected.
tanks, trucks, graph, weights_matrix = initialize_test_system()
toy_system = model.System(
    tanks = tanks,
    trucks = trucks,
    adjacency_matrix = graph,
    weights_matrix = weights_matrix,
)

# Product of the system's state and action dimensions, i.e. the number of
# (state, action) pairs; displayed as the cell output.
a_s_dim = toy_system.states_dim * toy_system.actions_dim
a_s_dim

1474560

## Q-learning algorithm (off-policy)


### Training

In [3]:
# Train parameters:

# Length of the run and how often progress is logged / checkpointed.
train_iterations = 200
train_freq = 50

# Q-learning hyper-parameters. The learning rate used at iteration i is
# learning_rate0 / (1 + (i - 1) * learning_rate_decay).
learning_rate0 = 0.05
learning_rate_decay = 0.1
discount_rate = 0.8

# Reproducibility and logging.
seed = 42
verbose = False
verbose_info = False

# Tag embedded in the names of the files saved during training.
simulation_id = 2

# Fresh system, empty Q-table and empty logs for this training run.
tanks, trucks, graph, weights_matrix = initialize_test_system()
toy_system = model.System(
    tanks = tanks,
    trucks = trucks,
    adjacency_matrix = graph,
    weights_matrix = weights_matrix,
)

Q = {}
train_visualization_steps = []
train_rewards_list = []

In [4]:
# "Train zone": in theory, the Q-values converge towards the optimal ones as training proceeds.

def train_Q(n_iterations = train_iterations, 
            learning_rate0 = learning_rate0,
            learning_rate_decay = learning_rate_decay,
            discount_rate = discount_rate,
            system = toy_system,
            Q = Q, verbose = verbose, verbose_info = verbose_info,
            visualization_steps = train_visualization_steps, rewards_list = train_rewards_list,
            seed = seed, 
            freq = train_freq,
            simulation_id = simulation_id,
            round_time = 3
           ):
    """
    Fill the Q-table by running random actions on `system`.

    Parameters
    ----------
    n_iterations : int
        Number of training steps.
    learning_rate0, learning_rate_decay : float
        The learning rate at iteration i is
        learning_rate0 / (1 + (i - 1) * learning_rate_decay).
    discount_rate : float
        Weight of the bootstrap term in the update.
    system : model.System
        System on which the actions are performed.
    Q : dict
        Maps state-action strings to Q-values. Updated IN PLACE.
    verbose, verbose_info : bool
        Enable extra debugging prints.
    visualization_steps, rewards_list : list
        Logs appended to IN PLACE every `freq` iterations.
    seed : int
        Base seed; iteration i calls random_action with seed + i.
    freq : int
        Logging / checkpoint period, in iterations.
    simulation_id : int
        Tag embedded in the names of the saved files.
    round_time : int
        Number of decimals used when printing elapsed times.

    Notes
    -----
    The defaults for Q, visualization_steps and rewards_list are the
    module-level objects that existed when this function was defined, so
    calling train_Q() with no arguments mutates those objects.

    Every `freq` iterations the Q-table and both logs are saved through
    ut.save_obj under "Q-dict-sim...", "vis/vis-train-sim..." and
    "rewards/rew-train-sim..." (presumably the "vis" and "rewards"
    directories must already exist -- confirm against ut.save_obj).
    """
    time_start = time.time()

    for iteration in range(1, n_iterations + 1):
        is_log_step = (iteration % freq == 0)

        system.update_state()

        if verbose:
            print("System state before, ", system.ds)

        reward = system.random_action(seed = (seed + iteration), verbose = verbose)
        sa_current = system.state_action_to_string()

        # Unseen state-action pairs start with a Q-value of 0.
        if not ut.is_key(Q, sa_current):
            Q[sa_current] = 0

        learning_rate = learning_rate0 / (1 + (iteration - 1) * learning_rate_decay)

        # Best known value among the entries sharing the state part of sa_current.
        # NOTE(review): textbook Q-learning bootstraps from the best value of the
        # *successor* state; confirm that the prefix of state_action_to_string()
        # is the state intended here.
        state_prefix = sa_current[0:system.state_length]
        Q_max = max(value for key, value in Q.items() if key.startswith(state_prefix))

        # An entry that reached -inf is left untouched from then on.
        if Q[sa_current] != -np.inf:
            Q[sa_current] = ( (1 - learning_rate) * Q[sa_current]
                             + learning_rate * (reward + discount_rate * Q_max)
                            )

        if verbose and is_log_step:
            print("System state after, ", system.ds)
            print("System action after, ", system.da)
            print("sa current, ", sa_current)

        system.reset_trucks_positions()
        system.reset_trucks_loads()

        if is_log_step:
            time_end = time.time()
            print("Iteration ", iteration, ", Elapsed time ", round(time_end-time_start, round_time), " seconds." )
            if verbose_info:
                print("s, a", system.s, system.a)
                print("ds, da", system.ds, system.da)

            # Save visualization and rewards
            rewards_list.append(reward)
            visualization_steps.append(system.visualize())

            ut.save_obj(Q, "Q-dict-sim" + f"{simulation_id}" + "-" + f"{iteration}")
            ut.save_obj(visualization_steps, "vis/vis-train-sim" + f"{simulation_id}" + "-" + f"{iteration}")
            ut.save_obj(rewards_list, "rewards/rew-train-sim" + f"{simulation_id}" + "-" + f"{iteration}")

    end_time = round(time.time() - time_start, round_time)
    print(f"Training finished. Total iterations: {n_iterations}. Elapsed time: {end_time} seconds.")

In [5]:
# Run the training loop with the default parameters defined above; a progress
# line is printed (and the Q-table and logs are saved) every `train_freq` iterations.
train_Q()

Iteration  50 , Elapsed time  0.017  seconds.
Iteration  100 , Elapsed time  0.082  seconds.
Iteration  150 , Elapsed time  0.11  seconds.
Iteration  200 , Elapsed time  0.137  seconds.
Training finished. Total iterations: 200. Elapsed time: 0.145 seconds.


### Testing

In [6]:
# Given a state, returns the action that has the highest Q-value.

def optimal_policy(state, Q, system = toy_system):
    """
    Return the action with the highest Q-value for `state`.

    `state` must be in the string-integers code. Every key of Q that starts
    with `state` is a candidate; the action is whatever follows the first
    system.state_length characters of the best candidate key. If several
    candidates share the highest value, the first one found in Q wins.

    Returns None when Q holds no entry for `state`.
    """
    candidate_keys = [key for key in Q if key.startswith(state)]
    if not candidate_keys:
        return None

    candidate_values = [Q[key] for key in candidate_keys]
    best_position = candidate_values.index(max(candidate_values))
    best_key = candidate_keys[best_position]

    return best_key[system.state_length:]
    

In [7]:
# TEST PARAMETERS AND INITIALIZATION

# Initialize a fresh system for the evaluation run
tanks, trucks, graph, weights_matrix = initialize_test_system()
test_toy_system = model.System(tanks = tanks, trucks = trucks, adjacency_matrix = graph, weights_matrix = weights_matrix)

# Load the trained Q-values: the checkpoint that train_Q() wrote at its last
# iteration. The name must match the one used by train_Q()
# ("Q-dict-sim<simulation_id>-<iteration>"); that file only exists when
# train_iterations is a multiple of train_freq.
Q = ut.load_obj("Q-dict-sim" + f"{simulation_id}" + "-" + f"{train_iterations}")

# Length of the evaluation run and how often it is logged
test_iterations = 1000
test_freq = 100

# Logs filled in by test_Q()
test_visualization_steps = []
test_rewards_list = []

In [8]:
def test_Q(test_iterations = test_iterations,
           system = test_toy_system,
           visualization_steps = test_visualization_steps, 
           rewards_list = test_rewards_list,
           freq = test_freq,
           q_values = None
          ):
    """
    Run `system` following the greedy policy derived from a Q-table.

    At every step the action with the highest Q-value for the current state
    is performed; when the state has no entry in the Q-table a random action
    is performed instead.

    Parameters
    ----------
    test_iterations : int
        Number of steps to simulate.
    system : model.System
        System on which the actions are performed.
    visualization_steps, rewards_list : list
        Logs appended to IN PLACE every `freq` iterations. Their defaults are
        the module-level lists that existed when this function was defined.
    freq : int
        Logging period, in iterations.
    q_values : dict, optional
        Q-table to follow. When omitted, the module-level `Q` is used.
    """
    q_table = Q if q_values is None else q_values

    for i in range(1, test_iterations + 1):
        is_log_step = (i % freq == 0)

        system.update_state()

        # Save visualization steps
        if is_log_step:
            visualization_steps.append(system.visualize())

        s0 = system.state_to_string()
        # Use the state length of the system being evaluated, not the global one.
        best_action = optimal_policy(s0, q_table, system = system)

        if best_action is None:
            reward = system.random_action()
            if is_log_step:
                print(i,reward, " Random action is performed. Current state unknown for Q.")
        else:
            reward = system.deterministic_action(best_action)
            if is_log_step:
                print(i,reward, best_action)

        system.reset_trucks_positions()
        system.reset_trucks_loads()

        # Save rewards
        if is_log_step:
            rewards_list.append(reward)


In [9]:
# Evaluate the greedy policy with the default parameters defined above; one
# summary line (iteration, reward, action taken) is printed every `test_freq` iterations.
test_Q()

100 -inf  Random action is performed. Current state unknown for Q.
200 -50.0  Random action is performed. Current state unknown for Q.
300 -60.0  Random action is performed. Current state unknown for Q.
400 -inf  Random action is performed. Current state unknown for Q.
500 -75.0  Random action is performed. Current state unknown for Q.
600 -inf  Random action is performed. Current state unknown for Q.
700 -inf  Random action is performed. Current state unknown for Q.
800 -75.0  Random action is performed. Current state unknown for Q.
900 -75.0  Random action is performed. Current state unknown for Q.
1000 -65.0  Random action is performed. Current state unknown for Q.


In [10]:
# Visualizing test simulation:
# the frames are the snapshots that test_Q() stored in test_visualization_steps
# (one every `test_freq` iterations). The last expression embeds the animation
# in the cell output as an HTML5 video.
test_anim = ut.create_system_animation(test_visualization_steps, test_iterations,test_freq)
HTML(test_anim.to_html5_video())

In [11]:
# Visualizing train simulation:
# the frames are the snapshots that train_Q() stored in train_visualization_steps
# (one every `train_freq` iterations). The last expression embeds the animation
# in the cell output as an HTML5 video.
train_anim = ut.create_system_animation(train_visualization_steps, train_iterations, train_freq)
HTML(train_anim.to_html5_video())