# Iterator dataset

We need an iterable version of the dataset that yields fewer rows per batch.
This will allow us to study more variations of the EMOS model.

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import hydra
import itertools
import os
import pathlib
import pandas as pd

from smc01.postprocessing.dataset import SMCParquetDataset
from smc01.postprocessing.util import load_checkpoint_from_run, PandasBatchIteratorDataset

In [None]:
# Compose the 'train' config from the packaged Hydra config module, selecting
# the emos_gdps_metar experiment through a command-line style override.
with hydra.initialize_config_module(config_module='smc01.postprocessing.conf'):
    cfg = hydra.compose(
        config_name='train',
        overrides=['experiment=emos_gdps_metar'],
    )

In [None]:
# Build the dataset object described by the experiment's `dataset` config node.
dataset = hydra.utils.instantiate(cfg.experiment.dataset)

In [None]:
# Display the first item as a quick check of what the dataset returns.
dataset[0]

In [None]:
# Root of the data tree, read from the DATA_DIR environment variable.
# os.getenv returns None when the variable is unset, and pathlib.Path(None)
# would then fail with an opaque TypeError, so check explicitly and fail with
# a message that names the missing variable.
_data_dir_env = os.getenv("DATA_DIR")
if _data_dir_env is None:
    raise RuntimeError(
        "The DATA_DIR environment variable must be set to the data root directory."
    )
DATA_DIR = pathlib.Path(_data_dir_env)
# Dataset directory used by the cells below, relative to the data root.
DATASET_DIR = DATA_DIR / '2021-12-20-gdps-metar/'

In [None]:
# Rebuild `dataset` directly from the parquet directory, replacing the
# Hydra-instantiated one. years=[2021] presumably restricts the data to 2021
# -- confirm against SMCParquetDataset.
dataset = SMCParquetDataset(DATASET_DIR, years=[2021])

In [None]:
# Wrap the dataset in the iterator variant. The second argument (50000) is
# presumably the maximum number of rows per yielded batch -- confirm against
# PandasBatchIteratorDataset.
iterator_dataset = PandasBatchIteratorDataset(dataset, 50000)

In [None]:
# Pull only the first batch out of the iterator dataset and report how many
# index entries (rows) it carries.
batch = next(iter(iterator_dataset))
n_rows_in_batch = len(batch.index)
print('len of batch', n_rows_in_batch)

# Test implementation

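I want to check that we get the same number of examples whether or not we use the iterator. The two runs below differ only in `experiment.limit_dataframe_size`: the first sets it to 5000, the second removes it.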

In [None]:
from smc01.postprocessing.train import make_datasets, make_dataloader

In [None]:
# Compose a training config that reads the small interpolated sample and caps
# the dataframe size at 5000 via experiment.limit_dataframe_size.
sample_dir = DATA_DIR / 'interpolated/2022-02-08-sample'
limited_overrides = [
    'experiment=emos_gdps_metar',
    'experiment.limit_dataframe_size=5000',
    'num_workers=8',
    'experiment.batch_size=1',
    f'experiment.dataset.dataset_dir={sample_dir!s}',
]
with hydra.initialize_config_module(config_module='smc01.postprocessing.conf'):
    cfg = hydra.compose(config_name='train', overrides=limited_overrides)

In [None]:
# make_datasets yields several datasets; keep only the first one (used here as
# the training set) and discard the rest into `_`.
train_dataset, *_ = make_datasets(cfg)

In [None]:
# Build the data loader for the size-limited training dataset from the same config.
loader = make_dataloader(cfg, train_dataset)


In [None]:
# Walk the whole loader once and total the leading dimension of each batch's
# 'station_id' entry, i.e. the number of examples seen with the size limit on.
n_examples = sum(batch['station_id'].shape[0] for batch in loader)

print(n_examples)

In [None]:
# Compose the same training config on the same sample, but delete
# experiment.limit_dataframe_size (Hydra's `~key` override removes the key).
sample_dir = DATA_DIR / 'interpolated/2022-02-08-sample'
no_limit_overrides = [
    'experiment=emos_gdps_metar',
    '~experiment.limit_dataframe_size',
    'num_workers=8',
    'experiment.batch_size=1',
    f'experiment.dataset.dataset_dir={sample_dir!s}',
]
with hydra.initialize_config_module(config_module='smc01.postprocessing.conf'):
    cfg_no_limit = hydra.compose(config_name='train', overrides=no_limit_overrides)

In [None]:
# Same as for the limited run: keep only the first dataset returned (used here
# as the training set), this time built from the config without the size limit.
train_dataset_no_limit, *_ = make_datasets(cfg_no_limit)

In [None]:
# Rebind `loader` to a data loader over the unlimited training dataset; the
# loader from the size-limited run is no longer reachable under this name.
loader = make_dataloader(cfg_no_limit, train_dataset_no_limit)

In [None]:
# Total the leading dimension of each batch's 'station_id' entry for the run
# without the size limit. This overwrites the earlier n_examples, so the two
# printed totals have to be compared by eye.
n_examples = sum(batch['station_id'].shape[0] for batch in loader)
print(n_examples)