# Flight scheduling optimization with RL

## Imports

In [9]:
import polars as pl
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from gymnasium.wrappers import NormalizeObservation, NormalizeReward
from schedule_optimizer.model_evaluation import run_evaluation, RewardTracker
from datetime import datetime
import warnings

from schedule_optimizer.flight_scheduling import FlightSchedulingEnv
from schedule_optimizer.utils import generate_random_flight_schedule, generate_lambdas
from schedule_optimizer.performance_tests import benchmark_polars_vs_numpy
from schedule_optimizer.iterative_optimization import optimize_schedule
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from schedule_optimizer.flight_scheduling_optimized import FlightSchedulingEnvMasked

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Simple case: 2 connecting flights

### Generate flight schedule and lambdas

In [3]:
# Hand-written two-flight schedule used as the smallest test case.
# The *_minutes columns repeat the departure/arrival clock times as minutes
# after midnight (08:00 -> 480, 13:30 -> 810, 10:00 -> 600, 14:30 -> 870).
base_day = datetime(1900, 1, 1)

schedule_0 = pl.DataFrame(
    {
        "departure": [base_day.replace(hour=8), base_day.replace(hour=13, minute=30)],
        "arrival": [base_day.replace(hour=10), base_day.replace(hour=14, minute=30)],
        "way": [-1, 1],
        "airport": ["JFK", "MAD"],
        "departure_minutes": [480, 810],
        "arrival_minutes": [600, 870],
    }
)

# Lambdas are derived from the schedule by the project helper.
lambdas = generate_lambdas(schedule_0)

### Custom environment

In [4]:
# Environment settings for the two-flight schedule, gathered in one place.
simple_env_kwargs = dict(
    flight_schedule=schedule_0,
    lambdas=lambdas,
    max_steps=50,
    revenue_estimation='classic',
    obs_back="numpy",
)

# Build the raw environment and wrap it with observation normalisation in one go.
env = NormalizeObservation(FlightSchedulingEnv(**simple_env_kwargs))

# Validate the wrapped environment with the stable-baselines3 checker, then reset
# it before querying the revenue.
check_env(env)
env.reset()

# `env.env` is the FlightSchedulingEnv underneath the NormalizeObservation wrapper.
initial_revenue = env.env.calculate_revenue()
print("Initial revenue :", initial_revenue)

Initial revenue : 875.0


### Random Policy

In [5]:
# Roll out 10 uniformly sampled actions to eyeball observations and rewards.
#
# Gymnasium's reset() returns an (observation, info) pair; unpack it so `obs`
# holds the observation itself rather than the whole tuple.
obs, info = env.reset()

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
    print(action)
    print("Observation:", obs, "Reward:", reward)
    # Stepping a finished episode is undefined in the Gymnasium API, so start a
    # fresh episode as soon as the current one terminates or is truncated.
    if done or truncated:
        obs, info = env.reset()

# `env` is intentionally not closed here: the "RL Policy" cell below passes this
# same instance to PPO, and an environment must not be used after close().

2
Observation: [-0.5140178] Reward: 27.77777777777783
2
Observation: [-1.4600238] Reward: 27.777777777777715
0
Observation: [-0.3961249] Reward: -27.777777777777715
0
Observation: [0.6545185] Reward: -27.77777777777783
3
Observation: [1.5917014] Reward: -27.77777777777783
2
Observation: [0.53099865] Reward: 27.77777777777783
4
Observation: [0.51617306] Reward: 0.0
0
Observation: [1.4849986] Reward: -27.77777777777783
1
Observation: [0.4263473] Reward: 27.77777777777783
0
Observation: [1.4042425] Reward: -27.77777777777783


### RL Policy

In [6]:
# Training budget and evaluation cadence; the masked-PPO cell further down
# reuses both constants.
TOTAL_TIMESTEPS = 2000
EVAL_FREQ = 1000

# PPO hyperparameters kept together so they can be tuned in one spot.
ppo_hyperparams = dict(
    learning_rate=0.003,
    batch_size=32,
    n_epochs=10,
    clip_range=0.2,
    verbose=0,
)
model = PPO("MlpPolicy", env, **ppo_hyperparams)

# The fourth positional argument is an empty dict; its meaning is defined by
# run_evaluation and is not visible from this notebook.
# The call is the cell's last expression, so whatever it returns is displayed.
run_evaluation(TOTAL_TIMESTEPS, EVAL_FREQ, model, {}, env, masked=False)

  Step 1000: Reward = -177.78


<stable_baselines3.ppo.ppo.PPO at 0x7f961c5579e0>

## Random schedule

In [7]:
# Random 50-entry schedule and its lambdas.
random_schedule = generate_random_flight_schedule(50)
random_lambdas = generate_lambdas(random_schedule)

# Reference ("perfect") schedule: every row with way == -1 is pinned to
# 400 -> 680 minutes, every other row to 800 -> 1000 minutes.
# NOTE(review): why these particular times are optimal is not shown here.
way_is_minus_one = pl.col('way') == -1
base_date = datetime(1900, 1, 1, 0, 0, 0)

perfect_schedule = (
    random_schedule.clone()
    # Stage 1: overwrite the minute columns.
    .with_columns(
        departure_minutes=pl.when(way_is_minus_one).then(pl.lit(400)).otherwise(pl.lit(800)),
        arrival_minutes=pl.when(way_is_minus_one).then(pl.lit(680)).otherwise(pl.lit(1000)),
    )
    # Stage 2: rebuild the datetime columns from the new minutes. This must be a
    # separate with_columns call, because expressions inside one call read the
    # frame as it was before that call.
    .with_columns(
        [
            pl.lit(base_date)
            .dt.offset_by(pl.col(minutes_col).cast(pl.String) + "m")
            .alias(datetime_col)
            for datetime_col, minutes_col in (
                ('departure', 'departure_minutes'),
                ('arrival', 'arrival_minutes'),
            )
        ]
    )
)



In [8]:
# Settings shared by the random-schedule env and the reference env; only the
# flight schedule differs between the two.
shared_env_kwargs = dict(
    lambdas=random_lambdas,
    max_steps=50,
    revenue_estimation='classic',
)

env = FlightSchedulingEnv(flight_schedule=random_schedule, **shared_env_kwargs)
perfect_env = FlightSchedulingEnv(flight_schedule=perfect_schedule, **shared_env_kwargs)

# Only the random-schedule environment is wrapped and validated; perfect_env is
# used solely to read off its revenue.
env = NormalizeObservation(env)
check_env(env)

# `env.env` is the FlightSchedulingEnv underneath the wrapper.
initial_revenue = env.env.calculate_revenue()
print("Initial revenue :", initial_revenue)
perfect_revenue = perfect_env.calculate_revenue()
print("Perfect revenue :", perfect_revenue)

Initial revenue : 117066.6666666667
Perfect revenue : 423000.0


## Performance analysis with timers

In [10]:
# Fresh 100-entry random schedule used only for the benchmark.
small_schedule = generate_random_flight_schedule(100)
small_lambdas = generate_lambdas(small_schedule)

# Time the revenue calculation with both backends, 5 calculations each.
benchmark_results = benchmark_polars_vs_numpy(small_schedule, small_lambdas, num_calculations=5)

# Summarise the result dict: the winning backend and its speedup ratio.
print("📊 Résultats benchmark:")
winner = benchmark_results['winner']
print(f"   Gagnant: {winner}")
speedup = benchmark_results['speedup_ratio']
print(f"   Speedup: {speedup:.2f}x")

⚡ Benchmark Polars vs Numpy pour calcul de revenue...
   🔹 Test avec Polars...
   🔹 Test avec Numpy...
   ✅ Polars: 0.0743s total, 14.85ms/calcul
   ✅ Numpy:  0.0041s total, 0.82ms/calcul
   🏆 Gagnant: Numpy (speedup: 18.16x)
📊 Résultats benchmark:
   Gagnant: Numpy
   Speedup: 18.16x


## Masking environment

In [11]:
# Masked variant of the environment, built on the two-flight schedule with the
# same settings as the unmasked simple case.
masked_env_kwargs = dict(
    flight_schedule=schedule_0,
    lambdas=lambdas,
    max_steps=50,
    revenue_estimation='classic',
    obs_back="numpy",
)
env_base = FlightSchedulingEnvMasked(**masked_env_kwargs)

def mask_fn(env):
    """Return the action mask reported by ``env``.

    Used as the mask callback given to ``ActionMasker``; it simply delegates to
    the environment's own ``get_action_mask()`` method and returns its result
    unchanged.
    """
    action_mask = env.get_action_mask()
    return action_mask

# Wrap order: ActionMasker sits directly on the base env, with observation
# normalisation on top of it.
masked_env = ActionMasker(env_base, mask_fn)
env = NormalizeObservation(masked_env)

# Same hyperparameters as the unmasked PPO run, applied to MaskablePPO.
maskable_ppo_hyperparams = dict(
    learning_rate=0.003,
    batch_size=32,
    n_epochs=10,
    clip_range=0.2,
    verbose=0,
)
model = MaskablePPO("MlpPolicy", env, **maskable_ppo_hyperparams)

print("🚀 Entraînement avec Action Masking...")
# TOTAL_TIMESTEPS and EVAL_FREQ come from the earlier "RL Policy" cell.
# The call is the cell's last expression, so whatever it returns is displayed.
run_evaluation(TOTAL_TIMESTEPS, EVAL_FREQ, model, {}, env, masked=True)

🚀 Entraînement avec Action Masking...
  Step 1000: Reward =   0.00


<sb3_contrib.ppo_mask.ppo_mask.MaskablePPO at 0x7f95ae8400e0>

## Iterative optimization with schedule extraction

In [15]:
# Run the iterative optimiser on the random schedule. This is the long-running
# cell of the notebook: 20 iterations of 20000 timesteps each are requested.
optimization_settings = dict(
    num_iterations=20,
    timesteps_per_iteration=20000,
    n_samples=50,
    max_steps=500,
)
optimized_schedule, metrics = optimize_schedule(
    random_schedule, random_lambdas, **optimization_settings
)

# (format template, metrics key) pairs; a missing key is reported as 0.
summary_rows = (
    ("   Amélioration: {:.1f}%", 'improvement_percentage'),
    ("   Revenue initial: {:.2f}", 'initial_revenue'),
    ("   Revenue final: {:.2f}", 'best_revenue'),
    ("   Temps total: {:.1f}s", 'total_training_time'),
)

print("📊 Métriques d'optimisation:")
for template, metric_key in summary_rows:
    print(template.format(metrics.get(metric_key, 0)))

🎯 OPTIMISATION ITÉRATIVE AVEC MULTI-SAMPLING - 20 itérations
🔬 Paramètres: 50 échantillons par extraction
🔄 ITÉRATION 1
   Timesteps par itération: 20000
   Échantillons d'extraction: 50
   🚀 Entraînement en cours...
   📊 Évaluation post-entraînement: -27874.72
🔍 Extraction du planning optimisé avec 50 échantillons...
   ✅ MEILLEUR échantillon: reward=17155.56, revenue=145751.39, steps=501
   📈 Amélioration sur 50 essais
   ✅ ACCEPTÉ Revenue: 128595.83 → 145751.39 (+17155.56, +13.3%)
   ⏱️  Temps d'entraînement: 47.2s
   🏆 Meilleur revenue global: 145751.39
🔄 ITÉRATION 2
   Timesteps par itération: 20000
   Échantillons d'extraction: 50
   🚀 Entraînement en cours...
   📊 Évaluation post-entraînement: -24410.28
🔍 Extraction du planning optimisé avec 50 échantillons...
   ✅ MEILLEUR échantillon: reward=10809.72, revenue=156561.11, steps=501
   📈 Amélioration sur 50 essais
   ✅ ACCEPTÉ Revenue: 145751.39 → 156561.11 (+10809.72, +7.4%)
   ⏱️  Temps d'entraînement: 47.7s
   🏆 Meilleur reven

Nombre d'itérations: 10
Itérations acceptées: 9 (90.0%)
Itérations rejetées: 1 (10.0%)
Total timesteps: 200,000
Temps total: 460.4s
📈 ÉVOLUTION DU REVENUE:
  Itération  1: 128595.83 → 138904.17 (+10308.33,  +8.0%) ✅ eval=-16875.6
  Itération  2: 138904.17 → 149523.61 (+10619.44,  +7.6%) ✅ eval=-27053.9
  Itération  3: 149523.61 → 160704.17 (+11180.56,  +7.5%) ✅ eval=-27380.0
  Itération  4: 160704.17 → 178912.50 (+18208.33, +11.3%) ✅ eval=-16952.8
  Itération  5: 178912.50 → 194718.06 (+15805.56,  +8.8%) ✅ eval=-41305.6
  Itération  6: 194718.06 → 201286.11 (+6568.06,  +3.4%) ✅ eval=-61890.6
  Itération  7: 201286.11 → 217605.56 (+16319.44,  +8.1%) ✅ eval=-31476.4
  Itération  8: 217605.56 → 224318.06 (+6712.50,  +3.1%) ✅ eval=-65282.8
  Itération  9: 224318.06 → 219230.56 (-5087.50,  -2.3%) ❌ eval=-65468.1
  Itération 10: 224318.06 → 226000.00 (+1681.94,  +0.7%) ✅ eval=-16932.5
🎯 AMÉLIORATION TOTALE:
   Initial: 128595.83
   Meilleur: 226000.00
   Gain:    +97404.17 (+75.7%)
   ROI:     211.583 revenue/seconde
   Efficacité: 10822.69 revenue/itération acceptée
📊 Métriques d'optimisation:
   Amélioration: 75.7%
   Revenue initial: 128595.83
   Revenue final: 226000.00
   Temps total: 460.4s