In [1]:
import hydra
import pandas as pd
import pytorch_lightning as pl
from optuna.integration import PyTorchLightningPruningCallback
from pytorch_lightning.callbacks import EarlyStopping, GPUStatsMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from src import DATA_DIR, LOGGING_DIR, MODEL_CHECKPOINTS_DIR, TRACK1_DIR
from src.configs import register_configs
from src.configs.train import TrainConfig
from src.data import LenaDataModuleExtra
from src.models import LenaTransExtra
from src.system import LenaSystemExtra
from src.utils.torch import get_embeddings_projections


In [2]:
def get_datamodule(batch_size, num_workers):
    """Build the datamodule from the ``features_extra.csv`` file under ``DATA_DIR``.

    Args:
        batch_size: Forwarded unchanged to ``LenaDataModuleExtra``.
        num_workers: Forwarded unchanged to ``LenaDataModuleExtra``.

    Returns:
        A freshly constructed ``LenaDataModuleExtra`` wrapping the loaded frame.
    """
    features_path = DATA_DIR / "features_extra.csv"
    return LenaDataModuleExtra(
        features_df=pd.read_csv(features_path),
        batch_size=batch_size,
        num_workers=num_workers,
    )


In [3]:
def train(cfg: TrainConfig, trial=None):
    """Assemble logger, callbacks, data, model and system from ``cfg``, then fit.

    Args:
        cfg: Training configuration. This function reads ``name``, ``version``,
            ``batch_size``, ``num_workers``, ``gpus``, ``max_epochs``,
            ``rnn_units``, ``top_classifier_units``, ``alpha``, ``gamma``,
            ``lr`` and ``weight_decay`` from it.
        trial: Optional Optuna trial. When truthy, the early-stopping callback
            is swapped for a ``PyTorchLightningPruningCallback`` bound to it.

    Returns:
        The datamodule instance that was handed to ``trainer.fit``.
    """
    tb_logger = TensorBoardLogger(
        str(LOGGING_DIR),
        name=cfg.name,
        version=cfg.version,
        log_graph=False,
        default_hp_metric=True,
    )

    # Checkpoints go to a per-run-name directory; save_top_k=-1 is passed so
    # no checkpoint is discarded by the top-k filter.
    checkpoint_cb = ModelCheckpoint(
        dirpath=str(MODEL_CHECKPOINTS_DIR / cfg.name),
        monitor="hp_metric",
        verbose=True,
        mode="max",
        save_top_k=-1,
    )

    # NOTE(review): EarlyStopping is created without an explicit ``mode`` while
    # the checkpoint above maximises "hp_metric" — confirm the intended
    # direction for "Val/score".
    stopping_cb = EarlyStopping(monitor="Val/score")
    if trial:
        # Under an Optuna study the pruning callback takes the early-stopping slot.
        stopping_cb = PyTorchLightningPruningCallback(monitor="Val/score", trial=trial)  # type: ignore

    gpu_stats_cb = GPUStatsMonitor()

    dm = get_datamodule(batch_size=cfg.batch_size, num_workers=cfg.num_workers)

    # trainer
    trainer_kwargs = dict(
        logger=tb_logger,
        callbacks=[gpu_stats_cb, checkpoint_cb, stopping_cb],
        profiler="simple",
        benchmark=True,
        gpus=cfg.gpus,
        max_epochs=cfg.max_epochs,
        # enable_pl_optimizer=True,
    )
    trainer = pl.Trainer(**trainer_kwargs)

    projections = get_embeddings_projections(
        categorical_features=dm.categorical_features,
        features_df=dm.features_df,
    )

    net = LenaTransExtra(
        cat_features=dm.categorical_features,
        embeddings_projections=projections,
        numerical_features=dm.numerical_features,
        target_cols=dm.target_cols,
        station_col_name="hydro_fixed_station_id_categorical",
        day_col_name="day_target_categorical",
        rnn_units=cfg.rnn_units,
        top_classifier_units=cfg.top_classifier_units,
    )

    lightning_system = LenaSystemExtra(
        model=net,
        alpha=cfg.alpha,
        gamma=cfg.gamma,
        lr=cfg.lr,
        weight_decay=cfg.weight_decay,
    )

    trainer.fit(lightning_system, datamodule=dm)

    return dm


In [4]:
from src.configs.train import TrainConfig

In [5]:
# Instantiate the training config with no overrides (whatever defaults TrainConfig declares).
cfg = TrainConfig()

In [6]:
# Run training; train() returns the datamodule it fitted with, kept here for the cells below.
datamodule = train(cfg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.target_df["full_date"] = self.target_df[["year", "day"]].apply(make_full_date, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.target_df["day_target_categorical"] = self.target_df["hydro_fixed_day_categorical"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.target_df["full_dat

In [14]:
# Call setup() by hand (outside a Trainer) before asking the datamodule for dataloaders.
datamodule.setup()

In [16]:
# Grab the training dataloader so it can be iterated manually.
train_dl = datamodule.train_dataloader()

In [17]:
# Smoke test: exhaust the training dataloader without using the batches, so that
# any per-item loading error surfaces. The recorded output below shows an
# IndexError raised from src/data.py __getitem__ (empty `features_mask` selection).
for batch in train_dl:
    pass

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/dan/.cache/pypoetry/virtualenvs/emergency-hack-xcMZg9e2-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/dan/.cache/pypoetry/virtualenvs/emergency-hack-xcMZg9e2-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/dan/.cache/pypoetry/virtualenvs/emergency-hack-xcMZg9e2-py3.8/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/dan/Projects/emergency_datahack/src/data.py", line 120, in __getitem__
    encoded_station_id = self.full_df[features_mask]["hydro_fixed_station_id_categorical"].values[0]
IndexError: index 0 is out of bounds for axis 0 with size 0


In [18]:
# Post-mortem debugger for the most recent exception; leftover from an interactive
# session — consider removing this cell before sharing the notebook.
%debug

> [0;32m/home/dan/.cache/pypoetry/virtualenvs/emergency-hack-xcMZg9e2-py3.8/lib/python3.8/site-packages/torch/_utils.py[0m(429)[0;36mreraise[0;34m()[0m
[0;32m    427 [0;31m            [0;31m# have message field[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    428 [0;31m            [0;32mraise[0m [0mself[0m[0;34m.[0m[0mexc_type[0m[0;34m([0m[0mmessage[0m[0;34m=[0m[0mmsg[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 429 [0;31m        [0;32mraise[0m [0mself[0m[0;34m.[0m[0mexc_type[0m[0;34m([0m[0mmsg[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    430 [0;31m[0;34m[0m[0m
[0m[0;32m    431 [0;31m[0;34m[0m[0m
[0m
ipdb> q
