In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, Baseline
from pytorch_forecasting.data import NaNLabelEncoder, GroupNormalizer
from pytorch_lightning import seed_everything, Trainer
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Step 1: Prepare Features

#### Load and preprocess

In [2]:
# Load your full dataset
df = pd.read_csv("ticket_price_dist.csv")

# Ensure dates are in datetime format
df["depart_date"] = pd.to_datetime(df["depart_date"])
df["extract_date"] = pd.to_datetime(df["extract_date"])
df['route'] = df['airport_from'] + "_" + df['airport_to']

# Optional: days to departure as a known future feature
df["days_to_departure"] = (df["depart_date"] - df["extract_date"]).dt.days

df["weekday"] = df["depart_date"].dt.day_name()
df["month"] = df["depart_date"].dt.month_name()

df["weekday"] = df["weekday"].astype("category")
df["month"] = df["month"].astype("category")

# Encode route as category
df["route"] = df["route"].astype("category")
df = df.sort_values(['route', 'depart_date']).reset_index(drop=True)

# Drop missing prices
df = df.dropna(subset=["best_price"])
df

Unnamed: 0.1,Unnamed: 0,extract_timestamp,origin,destination,depart_date,best_price,airport_from,airport_to,distance_km,flight_time_hour,...,weekday_name,calendar_day,calendar_year,calendar_month,week_number,extraction_period,route,days_to_departure,weekday,month
0,30078,2022-12-29 03:13:11.798742,JKTC,BDJ,2023-01-01,1414200.0,CGK,BDJ,946.44,2.02,...,Sunday,1,2023,1,52,3 days,CGK_BDJ,3,Sunday,January
1,30232,2022-12-31 01:49:22.466131,JKTC,BDJ,2023-01-01,1137800.0,CGK,BDJ,946.44,2.02,...,Sunday,1,2023,1,52,1 days,CGK_BDJ,1,Sunday,January
2,30077,2022-12-29 03:13:11.798742,JKTC,BDJ,2023-01-02,1345400.0,CGK,BDJ,946.44,2.02,...,Monday,2,2023,1,1,4 days,CGK_BDJ,4,Monday,January
3,30235,2022-12-31 01:49:22.466131,JKTC,BDJ,2023-01-02,1206600.0,CGK,BDJ,946.44,2.02,...,Monday,2,2023,1,1,2 days,CGK_BDJ,2,Monday,January
4,30438,2023-01-02 00:37:44.519977,JKTC,BDJ,2023-01-02,1348099.0,CGK,BDJ,946.44,2.02,...,Monday,2,2023,1,1,0 days,CGK_BDJ,0,Monday,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45390,10995,2023-02-01 22:44:34.976945,JKTC,YIA,2023-07-29,495660.0,CGK,YIA,455.91,1.33,...,Saturday,29,2023,7,30,178 days,CGK_YIA,178,Saturday,July
45391,10807,2023-02-01 09:17:51.827188,JKTC,YIA,2023-07-30,475660.0,CGK,YIA,455.91,1.33,...,Sunday,30,2023,7,30,179 days,CGK_YIA,179,Sunday,July
45392,10987,2023-02-01 22:44:34.976945,JKTC,YIA,2023-07-30,495660.0,CGK,YIA,455.91,1.33,...,Sunday,30,2023,7,30,179 days,CGK_YIA,179,Sunday,July
45393,10809,2023-02-01 09:17:51.827188,JKTC,YIA,2023-07-31,475660.0,CGK,YIA,455.91,1.33,...,Monday,31,2023,7,31,180 days,CGK_YIA,180,Monday,July


#### Select Columns

In [3]:
max_encoder_length = 14
max_prediction_length = 7

In [4]:
training_cutoff = df["depart_date"].max() - pd.Timedelta(days=max_prediction_length)
df.groupby("route")["depart_date"].nunique().sort_values()

route
CGK_MKQ    198
CGK_BIK    201
CGK_BDO    208
CGK_TRK    208
CGK_TNJ    212
CGK_TKG    212
CGK_SUB    212
CGK_SRG    212
CGK_SOC    212
CGK_PNK    212
CGK_PLM    212
CGK_PKU    212
CGK_PGK    212
CGK_PDG    212
CGK_MLG    212
CGK_BDJ    212
CGK_MDC    212
CGK_LOP    212
CGK_KNO    212
CGK_JOG    212
CGK_DPS    212
CGK_DJJ    212
CGK_DJB    212
CGK_BTJ    212
CGK_BTH    212
CGK_BPN    212
CGK_BKS    212
CGK_UPG    212
CGK_YIA    212
Name: depart_date, dtype: int64

#### Preprocessor

In [35]:
training_cutoff = df["depart_date"].max() - pd.Timedelta(days=max_prediction_length)

training = TimeSeriesDataSet(
    df[df["depart_date"] <= training_cutoff],
    time_idx="calendar_day",
    target="best_price",
    group_ids=["route"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["route"],
    time_varying_known_categoricals=["weekday", "month"],
    time_varying_known_reals=["days_to_departure"],
    time_varying_unknown_reals=["best_price"],
    target_normalizer=GroupNormalizer(groups=["route"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
    min_encoder_length=max_encoder_length,
)

# Validation
validation = TimeSeriesDataSet.from_dataset(training, df, predict=True, stop_randomization=True)

# Dataloaders
train_dataloader = training.to_dataloader(train=True, batch_size=64, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=64, num_workers=0)

### Step 3: Preprocess & Split

In [44]:
import torch
from pytorch_lightning import Trainer
from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning.loggers import CSVLogger

logger = CSVLogger("lightning_logs", name="tft")

tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    loss=QuantileLoss(),
    log_interval=-1,
    reduce_on_plateau_patience=4,
)

trainer = Trainer(
    max_epochs=10,
    gradient_clip_val=0.1,
    enable_model_summary=True,
    accelerator="cpu",
    devices=1,
)

trainer.fit(
    model=tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 389   
3  | prescalers                         | ModuleDict                      | 96    
4  | static_variable_selection          | VariableSelectionNetwork        | 1.9 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 2.0 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.4 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 

                                                                                                                                                                                                                

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:   9%|██████████▎                                                                                                      | 23/253 [00:38<06:20,  1.66s/it, loss=1.54e+05, v_num=3, train_loss_step=9.3e+4]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [25]:
trainer.save_checkpoint("model_checkpoint_v2.ckpt")

In [40]:
from pytorch_forecasting import TemporalFusionTransformer

# Load the model from checkpoint
tft = TemporalFusionTransformer.load_from_checkpoint("model_checkpoint_v2.ckpt", strict=False)
tft

  rank_zero_warn(
  rank_zero_warn(


TemporalFusionTransformer(
  	"attention_head_size":               1
  	"categorical_groups":                {}
  	"causal_attention":                  True
  	"dropout":                           0.1
  	"embedding_labels":                  {'route': {'CGK_BDJ': 0, 'CGK_BDO': 1, 'CGK_BIK': 2, 'CGK_BKS': 3, 'CGK_BPN': 4, 'CGK_BTH': 5, 'CGK_BTJ': 6, 'CGK_DJB': 7, 'CGK_DJJ': 8, 'CGK_DPS': 9, 'CGK_JOG': 10, 'CGK_KNO': 11, 'CGK_LOP': 12, 'CGK_MDC': 13, 'CGK_MKQ': 14, 'CGK_MLG': 15, 'CGK_PDG': 16, 'CGK_PGK': 17, 'CGK_PKU': 18, 'CGK_PLM': 19, 'CGK_PNK': 20, 'CGK_SOC': 21, 'CGK_SRG': 22, 'CGK_SUB': 23, 'CGK_TKG': 24, 'CGK_TNJ': 25, 'CGK_TRK': 26, 'CGK_UPG': 27, 'CGK_YIA': 28}, 'weekday': {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}, 'month': {'April': 0, 'February': 1, 'January': 2, 'July': 3, 'June': 4, 'March': 5, 'May': 6}}
  	"embedding_paddings":                []
  	"embedding_sizes":                   {'route': (29, 11), 'weekday': 

In [42]:
raw_predictions, x = tft.predict(val_dataloader, mode="raw", return_x=True)
raw_predictions

Output(prediction=tensor([[[1685387.1250, 1714049.5000, 1715069.5000,  ..., 1720715.6250,
          1727055.6250, 1738461.0000],
         [1680097.1250, 1709594.8750, 1711121.0000,  ..., 1716386.2500,
          1721814.5000, 1735234.7500],
         [1120225.8750, 1140477.7500, 1152882.2500,  ..., 1170817.8750,
          1183941.2500, 1201140.8750],
         ...,
         [1055876.0000, 1065794.0000, 1072573.5000,  ..., 1081937.1250,
          1088098.5000, 1099684.7500],
         [1054190.1250, 1067156.6250, 1075466.6250,  ..., 1087288.0000,
          1095221.2500, 1109225.3750],
         [1199799.0000, 1213622.7500, 1222238.3750,  ..., 1240516.2500,
          1252505.6250, 1265446.0000]],

        [[2314097.2500, 2338145.7500, 2355075.2500,  ..., 2384331.7500,
          2402361.2500, 2423334.0000],
         [2317056.5000, 2338808.5000, 2354875.2500,  ..., 2381410.0000,
          2397257.0000, 2416280.2500],
         [2322175.7500, 2344325.0000, 2360669.7500,  ..., 2386514.2500,
      

In [43]:
import pandas as pd
import matplotlib.pyplot as plt

# Load training logs
logs = pd.read_csv("lightning_logs/version_0/metrics.csv")

# Plot validation vs training loss
plt.plot(logs["epoch"], logs["train_loss_step"], label="Train Loss")
plt.plot(logs["epoch"], logs["val_loss"], label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.title("Training vs Validation Loss")
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'lightning_logs/version_0/metrics.csv'

In [36]:
from pytorch_lightning.tuner.tuning import Tuner

res = Tuner(trainer).lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)

print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
`Trainer.fit` stopped: `max_epochs=20` reached.
LR finder stopped early after 5040 steps due to diverging loss.
Restoring states from the checkpoint path at /Users/fjung/dev/ticket-price-forecast/.lr_find_80555e4c-b559-4793-ad74-2c551ca97fa2.ckpt


KeyError: 'radam_buffer'

In [33]:
# Predict using the validation set
raw_predictions, x = tft.predict(val_dataloader, mode="raw", return_x=True)

# Plot one example prediction
tft.plot_prediction(x, raw_predictions, idx=0)

ImportError: cannot import name 'Tuner' from 'pytorch_lightning' (/Users/fjung/miniconda3/envs/pytorch/lib/python3.10/site-packages/pytorch_lightning/__init__.py)

#### Predict

In [45]:
from pytorch_forecasting import TimeSeriesDataSet
from datetime import timedelta

def predict_future_prices(
    model,
    training_dataset,
    route: str,
    start_date: pd.Timestamp,
    days_ahead: int = 7,
):
    """
    Predict future best prices for a given route and starting date.

    Parameters:
        model (TemporalFusionTransformer): trained TFT model
        training_dataset (TimeSeriesDataSet): original training dataset (for from_dataset)
        route (str): route code, e.g., "CGK_DPS"
        start_date (pd.Timestamp): date to start forecasting from
        days_ahead (int): number of days to forecast

    Returns:
        pd.DataFrame with predictions
    """

    assert route in df["route"].cat.categories, f"Unknown route: {route}"
    base_date = df["depart_date"].min()


    # Get last known entry for the route
    past_data = training_dataset.data[training_dataset.data["route"] == route].copy()
    past_data = past_data.sort_values("depart_date")
    last_known_row = past_data.iloc[-1]

    # Use original DataFrame to get categories and reference
    future_dates = [start_date + timedelta(days=i) for i in range(days_ahead)]
    future_df = pd.DataFrame({
        "depart_date": future_dates,
        "calendar_day": [(d - base_date).days for d in future_dates],
        "extract_date": last_known_row["extract_date"],
        "route": route,
        "airport_from": last_known_row["airport_from"],
        "airport_to": last_known_row["airport_to"],
        "distance_km": last_known_row["distance_km"],
        "flight_time_hour": last_known_row["flight_time_hour"],
        "days_to_departure": [(d - last_known_row["extract_date"]).days for d in future_dates],
        "weekday": [d.strftime("%A") for d in future_dates],
        "month": [d.strftime("%B") for d in future_dates],
    })
    
    # Ensure categorical types
    future_df["weekday"] = pd.Categorical(future_df["weekday"], categories=df["weekday"].cat.categories)
    future_df["month"] = pd.Categorical(future_df["month"], categories=df["month"].cat.categories)
    future_df["route"] = pd.Categorical([route] * days_ahead, categories=df["route"].cat.categories)


    # Create prediction dataset
    prediction_dataset = TimeSeriesDataSet.from_dataset(training_dataset, future_df, predict=True, stop_randomization=True)

    # Predict
    raw_preds, x = model.predict(prediction_dataset, mode="raw", return_x=True)
    y_pred = model.to_prediction(raw_preds)

    # Return forecast as DataFrame
    result_df = pd.DataFrame({
        "date": future_df["depart_date"],
        "predicted_best_price": y_pred.flatten()
    })

    return result_df

In [46]:
future_prices = predict_future_prices(
    model=tft,
    training_dataset=training,
    route="CGK_DPS",
    start_date=pd.Timestamp("2023-08-01"),
    days_ahead=7
)
future_prices

KeyError: 'route'