In [1]:
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import pickle
from matplotlib import style
import time
style.use('ggplot')

In [2]:
# ---------------------------------------------------------------------------
# Grid and training horizon
# ---------------------------------------------------------------------------
SIZE = 10            # side length of the square grid (10 by 10)
HM_EPISODES = 55000  # number of training episodes
STEPS = 50           # maximum number of steps per episode

# ---------------------------------------------------------------------------
# Drone and charging station
# ---------------------------------------------------------------------------
v = 2           # velocity of the drone
b = 30          # initial battery level, in minutes
r = 15          # initial medical resource available
c_x, c_y = 4, 0 # (x, y) coordinates of the charging station

# ---------------------------------------------------------------------------
# Survivors
# ---------------------------------------------------------------------------
N = 1                        # number of people
N_objectives = 2             # number of objectives (survivor groups)
survivor_x = survivor_y = 9  # coordinates of the survivor group (subject to change between survivors)
T = 10                       # time in minutes
P_survival = 1               # survival probability
P_unfairness = 1             # probability that quantifies the vulnerability of each group

# ---------------------------------------------------------------------------
# Rewards
# ---------------------------------------------------------------------------
q = 15                   # move penalty
NO_RETURN_PENALTY = 100  # penalty for not having enough battery to return to the charging station

# ---------------------------------------------------------------------------
# Q-learning hyper-parameters
# ---------------------------------------------------------------------------
LEARNING_RATE = 0.1
DISCOUNT = 0.95
epsilon = 0.9         # initial exploration rate
EPS_DECAY = 0.9998    # factor applied to epsilon after every episode
SHOW_EVERY = 6000     # render / report once every this many episodes

start_q_table = None  # or filename of a pickled Q-table to resume from

# ---------------------------------------------------------------------------
# Rendering: entity id -> colour triple written into the image array
# ---------------------------------------------------------------------------
DRONE_N = 1
SURVIVOR_N = 2

d = {
    DRONE_N: (255, 175, 0),
    SURVIVOR_N: (0, 255, 0),
}

In [3]:
# Known issue: error with negative b and r
# Known issue: error because c is a decimal (non-integer) value

class Drone:
    """Agent that moves on the SIZE x SIZE grid and can drop resources.

    Attributes:
        x, y: current grid cell; starts at the charging station (c_x, c_y).
        b: battery time, initialised from the module-level ``b``. No method of
           this class currently changes it (battery drain is disabled).
        v: velocity, copied from the module-level ``v``.
        c: truncated Euclidean distance to the charging station, refreshed by
           every call to ``action``.
        r: remaining resource units.
    """

    # Grid displacement (dx, dy) for each movement action code.
    # Code 8 is not a move: it drops resources (see ``action``).
    _STEP_BY_CHOICE = {
        0: (1, 1),
        1: (1, -1),
        2: (1, 0),
        3: (0, 1),
        4: (0, -1),
        5: (-1, 1),
        6: (-1, 0),
        7: (-1, -1),
    }

    def __init__(self):
        # Position: the drone starts on the charging station.
        self.x, self.y = c_x, c_y
        self.c = 0  # distance to the charging station is therefore zero
        # Consumables and kinematics come from the module-level configuration.
        self.b = b
        self.r = r
        self.v = v

    def __str__(self):
        """Return the position formatted as "x, y"."""
        return f"{self.x}, {self.y}"

    def __sub__(self, other):
        """Return the (dx, dy) offset from ``other`` to this drone."""
        dx = self.x - other.x
        dy = self.y - other.y
        return (dx, dy)

    def state(self):
        """Return the observation tuple (x, y, b, r, c)."""
        return (self.x, self.y, self.b, self.r, self.c)

    def action(self, choice, N, other, i, add = False):
        """Apply one action and refresh the distance to the charging station.

        Args:
            choice: action code; 0-7 move one cell (see ``_STEP_BY_CHOICE``),
                8 drops ``N`` resource units, anything else does nothing.
            N: number of resource units to drop for action 8.
            other: object with ``x``/``y``; only used when ``add`` is true.
            i: suffix of the attribute ``d<i>`` written when ``add`` is true.
            add: when true, store the distance to ``other`` in ``self.d<i>``.
        """
        step = self._STEP_BY_CHOICE.get(choice)
        if step is not None:
            dx, dy = step
            self.move(x=dx, y=dy)
        elif choice == 8:
            self.resource(n=N)

        # The distance is recomputed after every action, moving or not.
        self.c = self.charging_point_distance()

        if add:
            setattr(self, f"d{i}", self.objectives_distance(other))

    def move(self, x=False, y=False):
        """Shift the position by (x, y) and clamp it to the grid.

        A falsy delta (False, 0, None) leaves that coordinate unchanged.
        """
        dx = x if x else 0
        dy = y if y else 0
        last = SIZE - 1
        self.x = min(max(self.x + dx, 0), last)
        self.y = min(max(self.y + dy, 0), last)

    def resource(self, n):
        """Drop ``n`` resource units; do nothing if fewer than ``n`` remain."""
        if self.r >= n:
            self.r -= n

    def charging_point_distance(self):
        """Return the Euclidean distance to the charging station, truncated to int."""
        dx = self.x - c_x
        dy = self.y - c_y
        return int(np.sqrt(dx**2 + dy**2))

    def objectives_distance(self, other):
        """Return the Euclidean distance to ``other``, truncated to int."""
        dx, dy = self - other
        return int(np.sqrt(dx**2 + dy**2))

In [4]:
class Survivor:
    """A group of survivors located at one grid cell.

    Attributes:
        x, y: grid coordinates of the group.
        n: number of survivors in the group.
        r: quantity of resource needed; set equal to the group size.
        survival_time: copied from the module-level ``T``.
        unfairness_probability: copied from the module-level ``P_unfairness``.
    """

    def __init__(self, survivor_x, survivor_y, N):
        # Where the group is on the grid.
        self.x, self.y = survivor_x, survivor_y

        # Group size, which is also the amount of resource the group needs.
        self.n = self.r = N

        # Settings shared by every group, taken from the notebook configuration.
        self.survival_time = T
        self.unfairness_probability = P_unfairness

In [6]:
# Creation of the Q-table.
# Keys are exactly the tuples returned by Drone.state(): (x, y, b, r, c).
# Each value is a list with one Q-value per action (9 actions: codes 0-7 move,
# code 8 drops resources), initialised uniformly at random in [-5, 0).
#
# The previous version appended an extra `done` flag to every key, producing
# 6-tuples that Drone.state() never returns, so every q_table[obs] lookup in the
# training loop raised KeyError.

if start_q_table is None:
    q_table = {}
    for x1 in range(SIZE):
        for y1 in range(SIZE):
            for b1 in range(b + 1):
                for r1 in range(r + 1):
                    # Drone.charging_point_distance() only yields non-negative
                    # values; the negative part of the range is kept so existing
                    # lookups with a negative c still find a key.
                    for c1 in range(-SIZE + 1, SIZE + 1):
                        # One vectorised draw per state instead of nine scalar draws.
                        q_table[(x1, y1, b1, r1, c1)] = np.random.uniform(-5, 0, size=9).tolist()
else:
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only point start_q_table at files produced by this notebook or another
    # trusted source.
    with open(start_q_table, "rb") as f:
        q_table = pickle.load(f)

In [None]:
# Tabular Q-learning training loop.
#
# Every episode creates a fresh Drone and N_objectives Survivor groups, then runs
# up to STEPS epsilon-greedy steps, updating q_table in place. An episode ends
# early when the drone reaches a survivor group, when the battery cannot cover
# the distance back to the charging station, or when the battery is empty.
episode_rewards = []
n_survivors = [200, 10]  # size of each survivor group, indexed by objective
epsilon = 0.9            # exploration rate, reset so re-running this cell restarts the schedule

for episode in range(HM_EPISODES):
    drone = Drone()

    # Survivor groups are placed on a diagonal, 4 cells apart, starting from
    # (survivor_x, survivor_y).
    survivors = [
        Survivor(survivor_x - 4 * j, survivor_y - 4 * j, n_survivors[j])
        for j in range(N_objectives)
    ]

    show = episode % SHOW_EVERY == 0
    if show:
        # On the very first episode there are no finished episodes yet; report
        # nan explicitly instead of letting np.mean warn about an empty slice.
        recent = episode_rewards[-SHOW_EVERY:]
        recent_mean = np.mean(recent) if recent else float("nan")
        print(f"on # {episode}, epsilon: {epsilon}")
        print(f"{SHOW_EVERY} ep mean {recent_mean}")

    episode_reward = 0

    for step in range(STEPS):
        obs = drone.state()

        # Epsilon-greedy action selection over the 9 action codes.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[obs])
        else:
            action = np.random.randint(0, 9)

        drone.action(action, N = 3, other = drone, i = 1)

        ### REWARD

        # Base reward: a heavy penalty if the battery cannot cover the distance
        # to the charging station, otherwise the per-move penalty.
        out_of_range = drone.b - drone.c <= 0
        reward = -NO_RETURN_PENALTY if out_of_range else -q

        # Standing on a survivor group overrides the base reward.
        goal_reached = False
        for survivor in survivors:
            if drone.x == survivor.x and drone.y == survivor.y:
                reward = P_survival * survivor.n * 100
                goal_reached = True

        # As before, a goal reward takes precedence over the no-return penalty.
        failed = out_of_range and not goal_reached
        terminal = goal_reached or failed

        ### NEW q

        if terminal:
            # Terminal transition: the Q-value is the reward itself.
            new_q = reward
        else:
            new_obs = drone.state()
            max_future_q = np.max(q_table[new_obs])
            current_q = q_table[obs][action]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)

        q_table[obs][action] = new_q

        if show:
            env = np.zeros((SIZE, SIZE, 3), dtype=np.uint8)
            env[drone.y][drone.x] = d[DRONE_N]
            # Survivors are drawn last, so they cover the drone when it is on top of one.
            for survivor in survivors:
                env[survivor.y][survivor.x] = d[SURVIVOR_N]

            img = Image.fromarray(env, "RGB")
            img = img.resize((300,300))
            cv2.imshow("", np.array(img))

            # Linger longer on frames whose reward is not the plain move penalty.
            delay_ms = 100 if reward == -q else 300
            if cv2.waitKey(delay_ms) & 0xFF == ord("q"):
                break

        episode_reward += reward

        # End the episode on any terminal condition. The previous version used
        # `break` inside a nested `for j` loop for the goal case, which only left
        # that inner loop, so the episode kept running after reaching a survivor.
        if terminal or drone.b <= 0:
            break

    episode_rewards.append(episode_reward)
    epsilon *= EPS_DECAY

# Close the preview window opened by cv2.imshow once training is finished.
cv2.destroyAllWindows()

# Smooth the per-episode rewards with a SHOW_EVERY-wide moving average and plot it.
moving_avg = np.convolve(
    episode_rewards,
    np.full(SHOW_EVERY, 1 / SHOW_EVERY),
    mode="valid",
)

plt.plot(np.arange(len(moving_avg)), moving_avg)
plt.xlabel("episode #")
plt.ylabel(f"reward {SHOW_EVERY}ma")
plt.show()


# Persist the learned Q-table under a name stamped with the current Unix time.
with open(f"qtable-{int(time.time())}.pickle", "wb") as fh:
    pickle.dump(q_table, fh)

In [None]:
# Index of the highest-valued action for the state (x=0, y=0, b=0, r=0, c=-9).
# Assumes q_table is keyed by 5-tuples shaped like Drone.state().
# NOTE(review): Drone.charging_point_distance() never returns a negative value,
# so c=-9 looks unreachable during training - confirm this is the state you
# meant to inspect.
np.argmax(q_table[(0, 0, 0, 0, -9)])