# Optimisation de flux dans un entrepôt avec Q-Learning

## Import des librairies

In [14]:
import numpy as np

## Définition des paramètres gamma et alpha pour le Q-Learning

In [15]:
# Discount factor: weights the best Q-value of the next state in the
# temporal-difference computed by compute_Q.
gamma = 0.75
# Learning rate: fraction of the temporal-difference added to the Q-value
# on each update in compute_Q.
alpha = 0.9

## Partie 1 - Modélisation de l'environnement

### Définition des états

In [16]:
# Map each location letter to its integer state index (A -> 0, ..., L -> 11).
location_to_state = {
    letter: index for index, letter in enumerate('ABCDEFGHIJKL')
}

### Définition des actions

In [17]:
# One action per state index: actions 0 through 11.
actions = list(range(12))

### Définition des gains (Rewards)

In [18]:
# Reward matrix: R[i, j] is the reward for going from state i to state j.
# compute_Q treats every strictly positive entry as a playable move.
# The trailing comment on each row lists the states reachable from it.
R = np.array([
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # A -> B
    [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # B -> A, C, F
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # C -> B, G
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # D -> H
    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],  # E -> I
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # F -> B, J
    [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],  # G -> C, G (self-transition), H
    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],  # H -> D, G, L
    [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],  # I -> E, J
    [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],  # J -> F, I, K
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],  # K -> J, L
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],  # L -> H, K
])

## Partie 2 - Construction de la solution avec Q-Learning

### Implémentation du processus de Q-Learning

In [19]:
def compute_Q(R, n_iterations=1000, discount=None, learning_rate=None):
    """Learn a Q-table for the reward matrix ``R`` with random exploration.

    On every iteration a state is drawn at random, one of the actions whose
    reward is strictly positive in that state is drawn uniformly, and the
    matching Q-value is moved towards the temporal-difference target.
    Playing an action leads to the state that has the same index.

    Args:
        R: Square (n_states x n_states) array-like of rewards. An entry
            greater than 0 marks the action as playable from that state.
        n_iterations: Number of random draws/updates to perform.
        discount: Discount factor applied to the best Q-value of the next
            state. Defaults to the module-level ``gamma``.
        learning_rate: Fraction of the temporal-difference added to the
            Q-value. Defaults to the module-level ``alpha``.

    Returns:
        A float ``numpy.ndarray`` of the same shape as ``R`` holding the
        learned Q-values.

    Raises:
        ValueError: If ``R`` is not a square two-dimensional matrix.
    """
    R = np.asarray(R)
    if R.ndim != 2 or R.shape[0] != R.shape[1]:
        raise ValueError("R must be a square 2-D reward matrix")
    if discount is None:
        discount = gamma
    if learning_rate is None:
        learning_rate = alpha

    # Size everything from R instead of assuming exactly 12 states.
    n_states = R.shape[0]
    Q = np.zeros((n_states, n_states))
    if n_states == 0:
        return Q

    for _ in range(n_iterations):
        current_state = np.random.randint(0, n_states)
        playable_actions = np.flatnonzero(R[current_state] > 0)
        # A state without any positive reward has nothing to play;
        # np.random.choice would raise on an empty sequence, so skip it.
        if playable_actions.size == 0:
            continue
        # Pick one of the playable actions of the current state at random.
        action = np.random.choice(playable_actions)
        next_state = action
        # Temporal difference.
        TD = (R[current_state, action]
              + discount * Q[next_state].max()
              - Q[current_state, action])
        # Q-value update.
        Q[current_state, action] += learning_rate * TD
    return Q

## Partie 3 - Mise en production de l'optimisation de flux

### Implémentation d'un mapping états ==> emplacements

In [20]:
# Inverse of location_to_state: integer state index -> location letter.
state_to_location = dict(
    zip(location_to_state.values(), location_to_state.keys())
)

### Implémentation de la fonction qui calcule le chemin optimal

In [21]:
def route(starting_location, ending_location):
    """Return the learned path from ``starting_location`` to ``ending_location``.

    A copy of the reward matrix ``R`` is given a reward of 1000 on the
    diagonal cell of the destination, a Q-table is trained on it with
    ``compute_Q``, and the path is built by repeatedly moving to the state
    with the highest Q-value from the current one.

    Args:
        starting_location: Key of ``location_to_state`` where the path starts.
        ending_location: Key of ``location_to_state`` where the path ends.

    Returns:
        The list of locations visited, starting with ``starting_location``
        and ending with ``ending_location``. When both are equal the list
        holds that single location.

    Raises:
        KeyError: If a location is not a key of ``location_to_state``.
        RuntimeError: If following the highest Q-values comes back to a
            location already on the path, which would otherwise loop forever.
    """
    ending_state = location_to_state[ending_location]
    # Work on a copy so the shared reward matrix is left untouched.
    rewards = R.copy()
    rewards[ending_state, ending_state] = 1000
    Q = compute_Q(rewards)

    path = [starting_location]
    visited = {starting_location}
    current_location = starting_location
    while current_location != ending_location:
        current_state = location_to_state[current_location]
        next_state = int(np.argmax(Q[current_state]))
        next_location = state_to_location[next_state]
        # The greedy choice depends only on the current state, so reaching a
        # location twice means the walk can never get to the destination.
        if next_location in visited:
            raise RuntimeError(
                "No route found from {!r} to {!r}: the learned policy loops "
                "back to {!r}".format(path[0], ending_location, next_location)
            )
        visited.add(next_location)
        path.append(next_location)
        current_location = next_location
    return path

## Chemin optimal en passant par un emplacement intermédiaire

In [22]:
def best_route(starting_location, intermediary_location, ending_location):
    """Return a path from start to end that goes through an intermediary stop.

    The path is the route from ``starting_location`` to
    ``intermediary_location`` followed by the route from
    ``intermediary_location`` to ``ending_location``.
    """
    first_leg = route(starting_location, intermediary_location)
    second_leg = route(intermediary_location, ending_location)
    # The second leg starts with the intermediary stop, which already ends
    # the first leg, so its first element is dropped.
    return first_leg + second_leg[1:]

## Affichage du résultat

In [23]:
print('Route:')
# Path from E to G that goes through K. The call is the last expression of
# the cell, so its value is what the notebook shows as the cell result.
best_route('E', 'K', 'G')

Route:


['E', 'I', 'J', 'K', 'L', 'H', 'G']