## Imports

In [1]:
import random
import pandas as pd
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import plot_results, load_results
from gymnasium.wrappers import NormalizeObservation, NormalizeReward
import random
from pandas import Timestamp
import warnings

from flight_scheduling import FlightSchedulingEnv
from utils import generate_random_flight_schedule, generate_lambdas

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

## Generate flight schedule and lambdas

In [2]:
random_schedule = generate_random_flight_schedule(2)
random_schedule

Unnamed: 0,departure,arrival,way,airport,departure_minutes,arrival_minutes,way_transformed
0,1900-01-01 12:58:00,1900-01-01 14:51:00,-1,MAD,778,891,0
1,1900-01-01 13:22:00,1900-01-01 14:37:00,1,FCO,802,877,1


In [3]:
schedule_0 = pd.DataFrame({
    'departure' : [Timestamp('1900-01-01 08:00:00'), Timestamp('1900-01-01 13:30:00')],
    'arrival' : [Timestamp('1900-01-01 10:00:00'), Timestamp('1900-01-01 14:30:00')],
    'way' : [-1, 1],
    'airport' : ['JFK', 'MAD'],
    'departure_minutes' : [480, 910],
    'arrival_minutes' : [600, 970],
    'way_transformed' : [0, 1]
})
schedule_0 

Unnamed: 0,departure,arrival,way,airport,departure_minutes,arrival_minutes,way_transformed
0,1900-01-01 08:00:00,1900-01-01 10:00:00,-1,JFK,480,600,0
1,1900-01-01 13:30:00,1900-01-01 14:30:00,1,MAD,910,970,1


In [4]:
lambdas = generate_lambdas(schedule_0)
lambdas = {'JFKMAD': 1000, 'MADJFK': 1000}
lambdas

{'JFKMAD': 1000, 'MADJFK': 1000}

## Custom environment

In [23]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0, 
    lambdas=lambdas, 
    max_steps=1000,
    revenue_estimation='basic'
)
env = NormalizeObservation(env)
env = NormalizeReward(env)

## Random Policy

In [28]:
obs = env.reset()

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, truncated, _ = env.step(action)
    print(action)
    print("Observation:", obs, "Reward:", reward)

env.close()

1
Observation: [-0.51001334] Reward: 0.2693628214671298
4
Observation: [-0.50316808] Reward: 0.0
3
Observation: [0.07288058] Reward: 0.26487638399618896
2
Observation: [-0.50344911] Reward: -0.2626103857435954
4
Observation: [-0.49713077] Reward: 0.0
3
Observation: [0.09377179] Reward: 0.26132193188027114
2
Observation: [-0.49782595] Reward: -0.26047240591652715
2
Observation: [-1.0772197] Reward: -0.25831721151046455
3
Observation: [-0.47081697] Reward: 0.2582274137455481
0
Observation: [0.13329051] Reward: -0.25696027196752197


## RL Policy

In [58]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0, 
    lambdas=lambdas, 
    max_steps=1000,
    revenue_estimation='basic'
)
env = NormalizeObservation(env)
#env = NormalizeReward(env)

check_env(env)

In [61]:
model = PPO("MlpPolicy", env).learn(total_timesteps=100000)

In [63]:
for _ in range(10):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(obs)
        #action = 0
        obs, reward, done, _, _ = env.step(action)
        #print('action : ', action)
        #print('obs : ', obs)
        #print('reward : ', reward)
        total_reward += reward
    print(total_reward)
env.close()

2600.0
2600.0
2600.0
2600.0
2600.0
2600.0
2600.0
2600.0
2600.0
2600.0


## Revenue estimation

In [64]:
env = FlightSchedulingEnv(
    flight_schedule=schedule_0,
    lambdas=lambdas,
    max_steps=100,
    revenue_estimation='classic'
)
env = NormalizeObservation(env)
#env = NormalizeReward(env)

In [67]:
model = PPO("MlpPolicy", env).learn(total_timesteps=100000)

In [68]:
for _ in range(10):
    obs, _ = env.reset()
    total_reward = 0
    done = False
    count = 0
    while not done:
        action, _ = model.predict(obs)
        '''if count < 9:
            action = 1
            count += 1
        else:
            action = 4
            count += 1'''
        obs, reward, done, _, _ = env.step(action)
        #print('action : ', action)
        #print('obs : ', obs)
        #print('reward : ', reward)
        total_reward += reward
    print(total_reward)
env.close()

250.0
250.0
250.0
250.0
250.0
250.0
250.0
222.22222222222217
250.0
222.22222222222217


In [23]:
from stable_baselines3.common.evaluation import evaluate_policy
import os
log_path = os.path.join('Training', 'Logs')
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100)
evaluate_policy(model, env, n_eval_episodes=10, render=True)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -312     |
| time/              |          |
|    fps             | 2008     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------


(-736.1110916137695, 0.0)