In [6]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from jre_utils.datapath import model_ready_data_paths

# Notebook-wide settings: silence every warning and let pandas render all
# columns of wide frames instead of eliding the middle ones.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None

In [42]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [8]:
# Short aggregate name -> name of the corresponding unit-price column.
metrics = dict(
    weighted_mean="unit_price_wmean",
    weighted_median="unit_price_wmedian",
    mean="unit_price_mean",
    median="unit_price_median",
)

# Aggregate selected for this notebook and its percentage-change column name.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"

In [50]:
# Split boundaries: training rows satisfy start_year <= year < test_start_year,
# test rows satisfy year >= test_start_year.
start_year = 2006
test_start_year = 2021  # test_years = [2021]

df = pd.read_csv(model_ready_data_paths["sequence"])

train_df = df.loc[df["year"].ge(start_year) & df["year"].lt(test_start_year)]
test_df = df.loc[df["year"].ge(test_start_year)]

In [84]:
# https://datascience.stackexchange.com/questions/48796/how-to-feed-lstm-with-different-input-array-sizes

class TimeSeriesDataset(Dataset):
    """Dataset pairing each row of ``df`` with a window of earlier rows.

    Item ``idx`` is built from row ``idx`` of ``df`` (positional lookup). Its
    target is that row restricted to ``metrics``; its window is made of the
    rows of ``complete_df`` with the same ``area_code`` and a ``year`` of at
    most ``row["year"] - shift``, sorted by year and cut to the last
    ``window_length`` rows. The window keeps every column of ``complete_df``
    and may contain fewer than ``window_length`` rows (possibly none).

    Args:
        complete_df: frame the history windows are drawn from; must have
            ``area_code`` and ``year`` columns.
        df: frame whose rows define the samples; must have ``area_code``,
            ``year`` and the ``metrics`` columns.
        metrics: column name(s) used as the target.
        shift: minimum gap, in years, between a target row and the most
            recent row allowed in its window.
        window_length: maximum number of rows kept in a window.
        transform: optional callable applied to the sample dict.
    """

    def __init__(
        self,
        complete_df,
        df,
        metrics=[metric_pct_chg],
        shift=1,
        window_length=5,
        transform=None,
    ):
        self.complete_df = complete_df
        self.df = df
        self.transform = transform
        # Store a private copy: the default list in the signature is created
        # once and would otherwise be shared (and mutable) across instances,
        # and a caller-supplied list would be aliased. A plain string is kept
        # as given so single-column selection behaves as before.
        self.metrics = metrics if isinstance(metrics, str) else list(metrics)
        self.shift = shift
        self.window_length = window_length

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in ``df``."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"window": DataFrame, "target": ...}`` for row ``idx``.

        The dict is passed through ``transform`` first when one was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.df.iloc[idx]
        target = row[self.metrics]
        area_code, year = row["area_code"], row["year"]
        # Only rows at least `shift` years before the target year are eligible,
        # so with shift >= 1 the target's own year never enters the window.
        area_df = (
            self.complete_df[
                (self.complete_df["area_code"] == area_code)
                & (self.complete_df["year"] <= year - self.shift)
            ]
            .sort_values(by="year")  # sorting just to be safe
            .tail(self.window_length)
        )

        sample = {
            "window": area_df,
            "target": target,
        }

        if self.transform:
            sample = self.transform(sample)

        return sample


class ToNumpy(object):
    """Replace the pandas objects of a sample with their NumPy values."""

    def __call__(self, sample):
        """Return a new dict holding the ``window`` and ``target`` arrays."""
        window_array = sample["window"].values
        # squeeze() drops length-1 axes, so a one-element target becomes 0-d.
        target_array = sample["target"].values.squeeze()
        return dict(window=window_array, target=target_array)


class ToTensor(object):
    """Convert ndarrays in sample to Tensors.

    ``window`` and ``target`` are required and are always converted. Any other
    entry of the sample (for example a ``mask`` added by an earlier transform)
    is carried over instead of being dropped: ndarrays are converted to
    tensors, everything else is passed through unchanged.
    """

    def __call__(self, sample):
        """Return a new dict with the sample's arrays turned into tensors."""
        converted = {
            "window": torch.from_numpy(sample["window"]),
            "target": torch.from_numpy(sample["target"]),
        }
        # Keep additional keys so earlier transforms' outputs survive this step.
        for key, value in sample.items():
            if key in converted:
                continue
            if isinstance(value, np.ndarray):
                converted[key] = torch.from_numpy(value)
            else:
                converted[key] = value
        return converted
    
class PadAndMask(object):
    """Pad all inputs to be of the same length and create a mask.

    The window is extended with zero rows appended after its existing rows
    until it has ``pad_length`` rows. The mask has ``pad_length`` entries: 1
    for the positions holding original rows, 0 for the padded positions.

    Args:
        pad_length: number of rows every returned window has.

    Raises:
        ValueError: if the incoming window has more than ``pad_length`` rows.
    """

    def __init__(self, pad_length=5):
        self.pad_length = pad_length

    def __call__(self, sample):
        """Return the sample with a padded ``window`` and an added ``mask``."""
        window, target = sample["window"], sample["target"]

        n_rows = window.shape[0]
        if n_rows > self.pad_length:
            # np.pad would otherwise fail on the negative pad width with a
            # message that does not point at the misconfiguration.
            raise ValueError(
                f"window has {n_rows} rows but pad_length is {self.pad_length}; "
                "pad_length must be at least the number of rows in the window"
            )

        # the first n elements of the mask are 1, the rest are 0
        mask = np.zeros(self.pad_length)
        mask[:n_rows] = 1

        # Pad only along the row axis, and only after the existing rows.
        padded_window = np.pad(
            window, ((0, self.pad_length - n_rows), (0, 0)), "constant"
        )
        return {"window": padded_window, "target": target, "mask": mask}

In [80]:
# Preview the first four untransformed samples (window shape, target shape,
# target value).
time_series_dataset = TimeSeriesDataset(complete_df=df, df=train_df, shift=1)

# zip() stops as soon as range(4) is exhausted, so only four samples are built.
for i, sample in zip(range(4), time_series_dataset):
    print(i, sample["window"].shape, sample["target"].shape, sample["target"])

0 (0, 12) (1,) unit_price_wmedian_pct_chg   -0.426053
Name: 0, dtype: float64
1 (1, 12) (1,) unit_price_wmedian_pct_chg   -0.087079
Name: 1, dtype: float64
2 (2, 12) (1,) unit_price_wmedian_pct_chg    0.389261
Name: 2, dtype: float64
3 (3, 12) (1,) unit_price_wmedian_pct_chg   -0.231734
Name: 3, dtype: float64


In [88]:
# Same preview with the transform pipeline attached: pandas -> NumPy,
# pad to a fixed length with a mask, then NumPy -> tensors.
transformed_dataset = TimeSeriesDataset(
    complete_df=df,
    df=train_df,
    transform=transforms.Compose([ToNumpy(), PadAndMask(), ToTensor()]),
)

# zip() stops as soon as range(4) is exhausted, so only four samples are built.
for i, sample in zip(range(4), transformed_dataset):
    print(i, sample["window"].shape, sample["target"].shape, sample["target"])

0 torch.Size([5, 12]) torch.Size([]) tensor(-0.4261, dtype=torch.float64)
1 torch.Size([5, 12]) torch.Size([]) tensor(-0.0871, dtype=torch.float64)
2 torch.Size([5, 12]) torch.Size([]) tensor(0.3893, dtype=torch.float64)
3 torch.Size([5, 12]) torch.Size([]) tensor(-0.2317, dtype=torch.float64)


In [90]:
# Batch the transformed samples in dataset order (no shuffling, loading in the
# main process) and print the tensor sizes of every batch.
dataloader = DataLoader(
    dataset=transformed_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched["window"].shape, sample_batched["target"].shape)

0 torch.Size([64, 5, 12]) torch.Size([64])
1 torch.Size([64, 5, 12]) torch.Size([64])
2 torch.Size([64, 5, 12]) torch.Size([64])
3 torch.Size([64, 5, 12]) torch.Size([64])
4 torch.Size([64, 5, 12]) torch.Size([64])
5 torch.Size([64, 5, 12]) torch.Size([64])
6 torch.Size([64, 5, 12]) torch.Size([64])
7 torch.Size([64, 5, 12]) torch.Size([64])
8 torch.Size([64, 5, 12]) torch.Size([64])
9 torch.Size([64, 5, 12]) torch.Size([64])
10 torch.Size([64, 5, 12]) torch.Size([64])
11 torch.Size([64, 5, 12]) torch.Size([64])
12 torch.Size([64, 5, 12]) torch.Size([64])
13 torch.Size([64, 5, 12]) torch.Size([64])
14 torch.Size([64, 5, 12]) torch.Size([64])
15 torch.Size([64, 5, 12]) torch.Size([64])
16 torch.Size([64, 5, 12]) torch.Size([64])
17 torch.Size([64, 5, 12]) torch.Size([64])
18 torch.Size([64, 5, 12]) torch.Size([64])
19 torch.Size([64, 5, 12]) torch.Size([64])
20 torch.Size([64, 5, 12]) torch.Size([64])
21 torch.Size([64, 5, 12]) torch.Size([64])
22 torch.Size([64, 5, 12]) torch.Size([64]