## Imports

In [1]:
import random
import pandas as pd
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import plot_results, load_results
import random
from pandas import Timestamp
import warnings

from flight_scheduling import FlightSchedulingEnv
from utils import generate_random_flight_schedule, generate_lambdas

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

## Generate flight schedule and lambdas

In [36]:
random_schedule = generate_random_flight_schedule(2)
random_schedule

Unnamed: 0,departure,arrival,way,departure_minutes,arrival_minutes,way_transformed
0,1900-01-01 08:51:00,1900-01-01 10:06:00,1,531,606,1
1,1900-01-01 11:00:00,1900-01-01 12:53:00,-1,660,773,0


In [27]:
schedule_0 = pd.DataFrame({
    'departure' : [Timestamp('1900-01-01 08:00:00'), Timestamp('1900-01-01 13:30:00')],
    'arrival' : [Timestamp('1900-01-01 10:00:00'), Timestamp('1900-01-01 14:30:00')],
    'way' : [-1, 1],
    'departure_minutes' : [480, 910],
    'arrival_minutes' : [600, 970],
    'way_transformed' : [0, 1]
})
schedule_0 

Unnamed: 0,departure,arrival,way,departure_minutes,arrival_minutes,way_transformed
0,1900-01-01 08:00:00,1900-01-01 10:00:00,-1,480,600,0
1,1900-01-01 13:30:00,1900-01-01 14:30:00,1,910,970,1


In [28]:
lambdas = generate_lambdas(schedule_0)
lambdas = {(0, 1): 1000}
lambdas

{(0, 1): 1000}

## Custom environment

In [54]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0, 
    lambdas=lambdas, 
    max_steps=1000,
    revenue_estimation='basic'
)

## Random Policy

In [45]:
env = FlightSchedulingEnv(schedule_0, lambdas, max_steps=1000)

obs = env.reset()

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, truncated, _ = env.step(action)
    print("Observation:", obs, "Reward:", reward)

env.close()

Observation: [[480 600   0]
 [930 990   1]] Reward: 20.0
Observation: [[480 600   0]
 [930 990   1]] Reward: 0.0
Observation: [[ 480  600    0]
 [ 950 1010    1]] Reward: 20.0
Observation: [[ 500  620    0]
 [ 950 1010    1]] Reward: 20.0
Observation: [[ 480  600    0]
 [ 950 1010    1]] Reward: -20.0
Observation: [[ 480  600    0]
 [ 970 1030    1]] Reward: 20.0
Observation: [[ 460  580    0]
 [ 970 1030    1]] Reward: -20.0
Observation: [[ 480  600    0]
 [ 970 1030    1]] Reward: 20.0
Observation: [[ 460  580    0]
 [ 970 1030    1]] Reward: -20.0
Observation: [[ 460  580    0]
 [ 950 1010    1]] Reward: -20.0


## RL Policy (1)

In [65]:
env = FlightSchedulingEnv(random_schedule, lambdas, max_steps=100)
env = DummyVecEnv([lambda: env])

#model = A2C("MlpPolicy", env, verbose=1)
model = PPO("MlpPolicy", env, verbose=1)
#model = DQN("MlpPolicy", env, verbose=1)
# Entraîner le modèle sur un certain nombre d'itérations
model.learn(total_timesteps=1000)

# Tester le modèle entraîné
obs = env.reset()
for _ in range(3):
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        print(obs)
        print(reward)
env.close()

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1284 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
[[[866. 981.]
  [866. 981.]]]
[-20.]
[[[866. 981.]
  [866. 981.]]]
[0.]
[[[886. 981.]
  [886. 981.]]]
[20.]
[[[886. 961.]
  [886. 961.]]]
[-20.]
[[[886. 941.]
  [886. 941.]]]
[-20.]
[[[866. 941.]
  [866. 941.]]]
[-20.]
[[[866. 941.]
  [866. 941.]]]
[0.]
[[[886. 941.]
  [886. 941.]]]
[20.]
[[[866. 941.]
  [866. 941.]]]
[-20.]
[[[846. 941.]
  [846. 941.]]]
[-20.]
[[[866. 941.]
  [866. 941.]]]
[20.]
[[[866. 941.]
  [866. 941.]]]
[0.]
[[[886. 941.]
  [886. 941.]]]
[20.]
[[[886. 941.]
  [886. 941.]]]
[0.]
[[[886. 961.]
  [886. 961.]]]
[20.]
[[[886. 941.]
  [886. 941.]]]
[-20.]
[[[886. 961.]
  [886. 961.]]]
[20.]
[[[886. 941.]
  [886. 941.]]]
[-20.]
[[[886. 941.]
  [886. 941.]]]
[0.]
[[[906. 941.]
  [906. 941.]]]
[20.]
[[[926. 941.]
  [926. 941.]]]
[20.]
[[[946. 941.]


In [80]:
for _ in range(1):
    obs = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(obs)
        #action = [0]
        obs, reward, done, _ = env.step(action)
        #print('action : ', action[0])
        print('obs : ', obs)
        #print('reward : ', reward[0])
        total_reward += reward
    print(total_reward)
env.close()

ValueError: You have passed a tuple to the predict() function instead of a Numpy array or a Dict. You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) vs `obs = vec_env.reset()` (SB3 VecEnv). See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api

## RL Policy (2)

In [63]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0, 
    lambdas=lambdas, 
    max_steps=1000,
    revenue_estimation='basic'
)

check_env(env)

In [65]:
model = PPO("MultiInputPolicy", env).learn(total_timesteps=10000)

In [66]:
for _ in range(10):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(obs)
        #action = 0
        obs, reward, done, _, _ = env.step(action)
        #print('action : ', action)
        #print('obs : ', obs)
        #print('reward : ', reward)
        total_reward += reward
    print(total_reward)
env.close()

2600.0
2560.0
2600.0
2600.0
2600.0
2600.0
2600.0
2580.0
2580.0
2600.0


## Revenue estimation

In [34]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0,
    lambdas=lambdas,
    max_steps=100,
    revenue_estimation='classic'
)

In [35]:
model = PPO("MlpPolicy", env).learn(total_timesteps=10000)

IndexError: index 2 is out of bounds for axis 1 with size 2

In [21]:
for _ in range(19):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(obs)
        #action = 0
        obs, reward, done, _, _ = env.step(action)
        #print('action : ', action)
        #print('obs : ', obs)
        #print('reward : ', reward)
        total_reward += reward
    print(total_reward)
env.close()

-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-222.2222222222224
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
-736.1111111111112
