<a href="https://colab.research.google.com/github/cswcjt/Dacon_Bike/blob/main/TFT_bike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the packages this notebook needs; the %%capture cell magic above
# keeps the installer output out of the notebook.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# releases newer than the ones this notebook was written against.
!pip install pytorch_forecasting
!pip install holidays
!pip install statsmodels --upgrade

In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be
# read with ordinary file paths (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import seaborn as sns
import matplotlib.pyplot as plt

# Rebind TensorFlow's `tf.io.gfile` to the stub implementation shipped with
# TensorBoard.
# NOTE(review): presumably a workaround for a TensorFlow/TensorBoard file-IO
# conflict when logging from PyTorch — confirm it is still required.
import tensorflow as tf 
import tensorboard as tb 
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [None]:
import itertools
import os
import holidays

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [None]:
# Folders on the mounted Drive. `base_path` holds the input CSV files;
# `save_path` is not used by the cells visible here (presumably an output
# folder — confirm).
base_path = "/content/drive/MyDrive/fastcamp/datas/bike/"
save_path = "/content/drive/MyDrive/fastcamp/datas/"

# Read the training table and the sample submission from `base_path`.
train, submission = (
    pd.read_csv(f"{base_path}{file_name}")
    for file_name in ("train.csv", "sample_submission.csv")
)

In [None]:
# Parse the yyyymmdd date column into datetimes and give every column a short
# ASCII name.
train = (
    train
    .assign(**{'일시': pd.to_datetime(train['일시'], format='%Y%m%d')})
    .rename(columns={"일시": "date", "광진구": "g", "동대문구": "d", "성동구": "s", "중랑구": "j"})
)

# Preview only (the result is not stored): number the distinct dates with a
# running integer `time_idx` and join that back onto the table.
display(
    train.merge(
        train[['date']]
        .drop_duplicates(ignore_index=True)
        .rename_axis('time_idx')
        .reset_index(),
        on=['date'],
    )
)

# From here on `train` is indexed by date, leaving one usage column per region.
train = train.set_index(keys=['date'], drop=True)
train

In [None]:
# data = (train.merge((train[['date']].drop_duplicates(ignore_index=True).rename_axis('time_idx')).reset_index(), on = ['date']))
# data
# holidays.Korea(years = 2019)

In [None]:
# Remember the first date in the data; later cells measure elapsed days
# relative to it.
earliest_time = train.index.min() 
earliest_time

In [None]:
# Reshape the wide table (one usage column per region, indexed by date) into a
# long table with one row per (region, date), adding time features derived
# from the date.
df_list = []

for region in train:  # iterating a DataFrame yields its column labels
    ts = train[region]

    # One frame per region; it keeps the datetime index of `train`, which is
    # the source of every time feature below.
    tmp = pd.DataFrame({'usage': ts})
    date = tmp.index

    # Elapsed-time feature: whole days since the first date in the data.
    tmp['days_from_start'] = (date - earliest_time).days
    tmp['date'] = date

    # Calendar features, stored as string-valued categories.
    tmp['day'] = date.day.astype(str).astype("category")
    tmp['day_of_week'] = date.dayofweek.astype(str).astype("category")
    # `DatetimeIndex.weekofyear` was deprecated in pandas 1.1 and removed in
    # pandas 2.0; `isocalendar().week` yields the same ISO week numbers.
    # `.to_numpy()` makes the assignment positional instead of index-aligned.
    tmp["week_of_year"] = pd.Categorical(
        date.isocalendar().week.astype(str).to_numpy()
    )
    tmp['month'] = date.month.astype(str).astype("category")
    tmp['year'] = date.year.astype(str).astype("category")

    # Identifier of the series this row belongs to.
    tmp['region'] = region

    # Stack all time series vertically.
    df_list.append(tmp)

demand_df = pd.concat(df_list).reset_index(drop=True)
demand_df

In [None]:
# Reference hyperparameters noted by the author:
#   batch size=64, number heads=4, hidden sizes=160, lr=0.001, gr_clip=0.1
# NOTE(review): these differ from the values actually used in this notebook;
# their origin is not stated.

# Forecast horizon (decoder length) in days.
max_prediction_length = 334

# Look-back window (encoder length) in days: three years.
max_encoder_length = 3 * 365

# Rows up to and including this day index form the training data; the final
# `max_prediction_length` days are held out.
training_cutoff = demand_df["days_from_start"].max() - max_prediction_length

training = TimeSeriesDataSet(
    # Training rows only.
    demand_df[demand_df["days_from_start"] <= training_cutoff],

    # Time index, target column, and the column that identifies each series.
    time_idx="days_from_start",
    target="usage",
    group_ids=["region"],

    # Allowed encoder (history) and prediction (horizon) lengths per sample.
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=30,
    max_prediction_length=max_prediction_length,

    # Role of each column.
    static_categoricals=["region"],
    time_varying_known_categoricals=["day", "day_of_week", "week_of_year", "month", "year"],
    time_varying_known_reals=["days_from_start"],
    time_varying_unknown_reals=["usage"],

    # Extra features generated by the dataset itself.
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

In [None]:
# Validation set built from the full frame with the same configuration as
# `training`; predict=True means predicting the last max_prediction_length
# points in time for each series.
validation = TimeSeriesDataSet.from_dataset(
    training,
    demand_df,
    predict=True,
    stop_randomization=True,
)

# Dataloaders for the model; the validation loader uses a 10x larger batch.
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=0,
)
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=batch_size * 10,
    num_workers=0,
)

In [None]:
# Inspect the training dataset's `index` attribute.
# NOTE(review): presumably one row per sample sub-sequence — confirm.
training.index

In [None]:
# Inspect the validation dataset's `index` attribute.
# NOTE(review): presumably one row per series, since it was built with
# predict=True — confirm.
validation.index

In [None]:
# Let's see how a naive model does: score pytorch_forecasting's `Baseline`
# model on the validation set as a reference point for the trained network.

# Metric object reused by the following cells.
sm = MAE()

# Ground-truth targets of every validation batch, concatenated.
actuals = torch.cat([y for _, (y, _weight) in val_dataloader])
baseline_predictions = Baseline().predict(val_dataloader)

# Mean absolute error of the baseline. It must be the last expression in the
# cell so the notebook displays it; previously another statement followed it
# and the value was silently discarded.
(actuals - baseline_predictions).abs().mean().item()

In [None]:
# Per-element absolute error, averaged along axis 1, then the median of those
# averages.
# NOTE(review): assumes axis 1 is the prediction horizon, giving one mean per
# series — confirm.
print(f"Median loss for naive prediction on validation: {sm.loss(actuals, baseline_predictions).mean(axis = 1).median().item()}")

In [None]:
# Training configuration used by the cells below.
PATIENCE = 30  # early-stopping patience passed to EarlyStopping
MAX_EPOCHS = 120  # upper bound on training epochs for the Trainer
LEARNING_RATE = 0.03  # learning rate of the TemporalFusionTransformer
OPTUNA = True  # when True, the Optuna hyperparameter search cell is executed

In [None]:
# Stop training once the monitored metric has failed to improve by at least
# `min_delta` for `PATIENCE` consecutive checks.
# NOTE(review): this monitors the *training* loss although a validation loader
# is passed to `trainer.fit`; monitoring "val_loss" is the usual way to stop
# before over-fitting — confirm that "train_loss" is intentional.
early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=1e-2, patience=PATIENCE, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    # The device is selected with `accelerator`/`devices` only. The legacy
    # `gpus=1` flag duplicated this request, is deprecated, and is no longer
    # accepted by Lightning 2.x. Fall back to CPU when no GPU is attached so
    # the notebook still runs on a CPU runtime.
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    enable_model_summary=True,
    gradient_clip_val=0.25,
    limit_train_batches=10,  # use at most 10 training batches per epoch
    # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)

# Model configured from the training dataset's metadata.
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=LEARNING_RATE,
    lstm_layers=2,
    hidden_size=16,
    attention_head_size=2,
    dropout=0.2,
    hidden_continuous_size=8,
    output_size=1,  # one point forecast per step, matching the MAE loss (no quantiles)
    loss=MAE(),
    log_interval=10,  # log every 10 batches
    reduce_on_plateau_patience=4,
)

print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Train the TFT on the training loader, evaluating on the validation loader.
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

In [None]:
# Optional hyperparameter search with Optuna, enabled by the OPTUNA flag.
# NOTE(review): the returned `study` is not read by the cells visible after
# this one; the evaluation cell loads the checkpoint produced by `trainer`.
if OPTUNA:

    from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

    study = optimize_hyperparameters(
        train_dataloader,
        val_dataloader,
        model_path="optuna_test",
        # Search budget.
        n_trials=50,
        max_epochs=50,
        trainer_kwargs=dict(limit_train_batches=30),
        # Search ranges.
        learning_rate_range=(0.001, 0.1),
        gradient_clip_val_range=(0.01, 1.0),
        dropout_range=(0.1, 0.3),
        hidden_size_range=(8, 128),
        hidden_continuous_size_range=(8, 128),
        attention_head_size_range=(1, 4),
        # Fixed settings.
        reduce_on_plateau_patience=4,
        use_learning_rate_finder=False,  # let Optuna pick the learning rate instead of the in-built finder
    )

In [None]:
# Reload the model from the checkpoint path recorded by the trainer's
# checkpoint callback.
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# Ground truth: the first element of each batch's target tuple, concatenated.
actuals = torch.cat([target[0] for _features, target in val_dataloader])

# Point predictions for scoring, plus raw network output and inputs for plots.
predictions = best_tft.predict(val_dataloader, mode="prediction")
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

sm = MAE()
print(f"Validation median MAE loss: {sm.loss(actuals, predictions).mean(axis = 1).median().item()}")

In [None]:
# Median over axis-1 means of the absolute error between `actuals` and
# `predictions`.
# NOTE(review): this repeats the print at the end of the preceding cell.
print(f"Validation median MAE loss: {sm.loss(actuals, predictions).mean(axis = 1).median().item()}")

In [None]:
# Draw one prediction plot per entry along the first dimension of the raw
# prediction tensor; the trailing `;` suppresses the return value's repr.
# NOTE(review): presumably one entry per region — confirm.
for idx in range(raw_predictions.prediction.shape[0]):
    best_tft.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True);