In [1]:
import itertools
import math
import os
import re
from collections import OrderedDict

from petastorm import make_batch_reader, TransformSpec
from petastorm.pytorch import DataLoader as PetaDataLoader
from sklearn import preprocessing
from torch import tensor
from torch.utils.data import TensorDataset, DataLoader as TorchDataLoader, IterableDataset

In [2]:
# URL scheme prefix carried by the dataset paths handled in this notebook
# (stripped before local directory checks, re-added to filtered file paths).
FILE_PREFIX = 'file:'

# Set by patch_leaking_fd() to a binary-mode file opener; None until it has run.
pre_open_fds = None
def patch_leaking_fd():
    """Monkey-patch ``pyarrow.parquet.ParquetFile`` so it closes its source.

    The patched ``__init__`` remembers the ``source`` it was given, and the
    installed ``__exit__`` / ``__del__`` close that source when it exposes a
    ``close`` method. The patch is applied at most once per process: the
    original initialiser is kept as ``ParquetFile.__old_init__`` and its
    presence is used as the "already patched" marker.

    Side effects:
        * Sets the module-level ``pre_open_fds`` to a callable that opens a
          path in binary read mode.
        * Prints "Patching" or "Already patched".
    """
    global pre_open_fds
    from pyarrow.parquet import ParquetFile

    def _patched_init(self, source, **kwargs):
        # Keep a handle on the source so it can be closed later.
        self.source = source
        return ParquetFile.__old_init__(self, source, **kwargs)

    def _exit(self, *args, **kwargs):
        # Installed as both __exit__ and __del__, so it can run more than once
        # for the same object (and on objects whose __init__ never completed).
        # Look the attribute up defensively instead of assuming it exists.
        source = getattr(self, 'source', None)
        if hasattr(source, 'close'):
            source.close()
            del self.source

    def _bopen(fn):
        return open(fn, 'rb')

    pre_open_fds = _bopen
    if not hasattr(ParquetFile, '__old_init__'):
        print("Patching")
        ParquetFile.__old_init__ = ParquetFile.__init__

        ParquetFile.__init__ = _patched_init
        ParquetFile.__exit__ = _exit
        ParquetFile.__del__ = _exit

    else:
        print("Already patched")

# Apply the ParquetFile patch when this cell runs (prints "Patching" or "Already patched").
patch_leaking_fd()



class MyIterableDataset(IterableDataset):
    """Iterable dataset that streams per-row samples from a parquet store via petastorm.

    Each yielded sample is a dict with:
        * ``'features'``: ``tensor([price_or_zero, price_is_nan])``
        * ``'targets'``:  ``tensor([sales_dollars])``

    Args:
        filename: dataset URL (``FILE_PREFIX`` + path). Passed to petastorm
            unchanged when ``rex`` is None.
        rex: optional regular expression. When given, ``filename`` must point
            at a local directory; it is walked recursively and only files whose
            full path matches ``rex`` (``re.match`` semantics, i.e. anchored at
            the start) are kept.

    Raises:
        ValueError: if ``rex`` is given and the path is not a directory, or if
            the regex filters out every file.
    """

    # Columns requested from the parquet store.
    SCHEMA_FIELDS = ('id', 'item_id', 'dept_id', 'cat_id', 'day_id',
                     'sales', 'day_date_str', 'month_id', 'date', 'wm_yr_wk',
                     'snap_flag', 'sell_price', 'sales_dollars', 'store_id', 'state_id')
    # Batch size and shuffle buffer used by the inner petastorm loader.
    PETA_BATCH_SIZE = 128
    SHUFFLING_QUEUE_CAPACITY = 100000

    def __init__(self, filename, rex=None):
        # Zero-argument super() actually invokes the parent initialiser;
        # `super(MyIterableDataset).__init__()` only re-initialised an unbound
        # super object and never reached IterableDataset.
        super().__init__()
        self._filename_param = filename
        self.filename = self._init_filenames(filename, rex)

    def _init_filenames(self, filename, rex):
        """Return ``filename`` as-is, or the list of prefixed file URLs matching ``rex``."""
        if rex is None:
            return filename

        # Only strip the scheme when it is really there; blindly slicing would
        # eat the first characters of a plain local path.
        if filename.startswith(FILE_PREFIX):
            filename = filename[len(FILE_PREFIX):]
        if not os.path.isdir(filename):
            raise ValueError(f"Filtering only possible for dirs, {filename} is not a one")

        paths = [os.path.join(dp, f) for dp, _, fn in os.walk(filename) for f in fn]
        pattern = re.compile(rex)
        res = [FILE_PREFIX + p for p in paths if pattern.match(p) is not None]
        if not res:
            raise ValueError(f"0 files remained out ot {len(paths)} - seems regex is too restrictive")

        return res

    def _init_petaloader(self):
        """Build a petastorm batch reader over ``self.filename`` wrapped in a petastorm DataLoader."""
        reader = make_batch_reader(self.filename,
                                   schema_fields=list(self.SCHEMA_FIELDS),
                                   workers_count=1)
        return PetaDataLoader(reader=reader,
                              batch_size=self.PETA_BATCH_SIZE,
                              shuffling_queue_capacity=self.SHUFFLING_QUEUE_CAPACITY)

    def __len__(self):
        # Can be an arbitrarily large value to prevent WARN logs; seems to be ignored anyway.
        # Presumably rows-per-series x number-of-series of the full dataset - TODO confirm.
        return 1913*30490

    def __iter__(self):
        """Yield one sample dict per row; only single-process loading is supported.

        Raises:
            ValueError: when iterated inside a DataLoader worker process.
        """
        # Imported locally so this cell does not rely on a later `import torch`.
        from torch.utils.data import get_worker_info

        print(f"Iterator created on {self._filename_param}")
        if get_worker_info() is not None:
            raise ValueError("Not implemented for multithreading")

        count_cells = 0
        count_batches = 0
        with self._init_petaloader() as loader:
            if pre_open_fds:
                # Route file opening through the opener installed by patch_leaking_fd().
                loader.reader.dataset.fs.open = pre_open_fds
            for batch in loader:
                count_batches += 1
                for price, sales_dollars in zip(batch['sell_price'], batch['sales_dollars']):
                    # Missing prices become 0 plus an explicit "was NaN" indicator feature.
                    price_is_nan = math.isnan(price)
                    price_or_zero = 0. if price_is_nan else price
                    count_cells += 1
                    yield {'features': tensor([price_or_zero, price_is_nan]),
                           'targets': tensor([sales_dollars])}

        print(f'Done iterating: {count_batches} batches / ({count_cells} cells) ')

Patching


In [3]:
import torch
from torch import nn
import torch.nn.functional as F

from catalyst.dl import SupervisedRunner
from catalyst.utils import set_global_seed

  from pandas import Panel

numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject



In [4]:
# Fix the global random seed via catalyst's helper so runs are repeatable.
SEED=42
set_global_seed(SEED)

In [5]:
# Mini-batch size used by both torch loaders.
batch = 128

# One partition directory of the parquet store per split.
train_ds = MyIterableDataset('file:./sales_series_melt.parquet/parquet_partition=2')
valid_ds = MyIterableDataset('file:./sales_series_melt.parquet/parquet_partition=1')

# Identical loader settings for both splits; the datasets are iterable, so no
# loader-level shuffling and no worker processes.
train_dl, valid_dl = (
    TorchDataLoader(ds, batch_size=batch, shuffle=False, num_workers=0, drop_last=False)
    for ds in (train_ds, valid_ds)
)

# Ordered mapping of split name -> loader, later passed to runner.train(loaders=...).
data = OrderedDict([("train", train_dl), ("valid", valid_dl)])

In [50]:
class Net(nn.Sequential):
    """Fully connected network: a stack of ``Linear`` layers each followed by ``ReLU``.

    Weights are Xavier-uniform initialised and biases are zeroed.

    Args:
        num_features: width of the input feature vector.
        hidden_dims: widths of the hidden layers, in order. Defaults to the
            original fixed architecture ``(200, 200, 20, 20)``.
        out_features: width of the output layer. Defaults to 1.
        final_relu: when True (default, the original behaviour) a ReLU follows
            the output layer, so predictions are clamped to be non-negative.
            Pass False for an unconstrained linear output.
    """
    def __init__(self, num_features, hidden_dims=(200, 200, 20, 20), out_features=1,
                 final_relu=True):
        layer_dims = [num_features, *hidden_dims, out_features]
        layers = []
        for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
            linear = nn.Linear(fan_in, fan_out)
            # Note to self: loss @ init is quite important!
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

            layers.append(linear)
            layers.append(nn.ReLU())
        if not final_relu:
            # Drop the activation that trails the output layer.
            layers.pop()
        super().__init__(*layers)

class MyLoss(nn.MSELoss):
    """Mean-squared-error criterion; a thin subclass kept as a hook for custom losses.

    Takes no constructor arguments, so the parent's default settings apply.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inp, target):
        """Return the parent MSE loss of ``inp`` against ``target``."""
        loss = nn.MSELoss.forward(self, inp, target)
        return loss

In [65]:
# Two inputs per sample: the price (NaN replaced by 0) and the price-is-NaN flag
# emitted by MyIterableDataset as 'features'.
model = Net(num_features=2)

In [66]:
# Training components handed to runner.train(): Adam over all model parameters,
# the MSE-based criterion, and catalyst's supervised runner.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = MyLoss()
runner = SupervisedRunner()

In [67]:
# Keep one training batch around for before/after comparisons and show the
# model's predictions for it (transposed to a single row for compact display).
trn_batch = next(iter(train_dl))
# Call the module itself rather than .forward() so nn.Module hooks are honoured.
model(trn_batch['features']).transpose(1, 0)

Iterator created on file:./sales_series_melt.parquet/parquet_partition=2


tensor([[0.0081, 0.0373, 0.0135, 0.0975, 0.0367, 0.0526, 0.0225, 0.0646, 0.0324,
         0.0121, 0.0308, 0.0184, 0.0077, 0.0544, 0.0204, 0.0482, 0.0730, 0.0808,
         0.0000, 0.0000, 0.0235, 0.0000, 0.0243, 0.0568, 0.0081, 0.0000, 0.0568,
         0.0251, 0.0000, 0.0267, 0.0446, 0.0135, 0.0081, 0.0163, 0.0511, 0.0081,
         0.0000, 0.0243, 0.0000, 0.0242, 0.0468, 0.0204, 0.0000, 0.0079, 0.0000,
         0.1178, 0.0104, 0.0169, 0.0242, 0.0894, 0.0161, 0.0267, 0.0323, 0.0590,
         0.0284, 0.0161, 0.0204, 0.0161, 0.0243, 0.0588, 0.0204, 0.0397, 0.0470,
         0.0321, 0.0313, 0.0812, 0.0161, 0.0000, 0.0266, 0.0243, 0.0080, 0.0178,
         0.0000, 0.0000, 0.0365, 0.0000, 0.0000, 0.0650, 0.0243, 0.0299, 0.0266,
         0.1220, 0.0080, 0.0813, 0.0000, 0.0568, 0.0161, 0.0000, 0.0402, 0.0323,
         0.0385, 0.0000, 0.0242, 0.0000, 0.0000, 0.0997, 0.0000, 0.0145, 0.0227,
         0.0000, 0.0243, 0.0000, 0.0650, 0.0323, 0.0484, 0.0235, 0.0000, 0.0315,
         0.0000, 0.0209, 0.0

In [68]:
# Keep one validation batch around for before/after comparisons and show the
# model's predictions for it (transposed to a single row for compact display).
valid_batch = next(iter(valid_dl))
# Call the module itself rather than .forward() so nn.Module hooks are honoured.
model(valid_batch['features']).transpose(1, 0)

Iterator created on file:./sales_series_melt.parquet/parquet_partition=1


tensor([[0.0204, 0.0226, 0.0093, 0.0284, 0.0000, 0.0894, 0.0000, 0.0161, 0.0000,
         0.0080, 0.0402, 0.0363, 0.0204, 0.0000, 0.0161, 0.0000, 0.0088, 0.0285,
         0.0079, 0.0000, 0.0732, 0.0511, 0.0142, 0.0000, 0.0000, 0.0227, 0.0715,
         0.0640, 0.0324, 0.1057, 0.0000, 0.0234, 0.0284, 0.0210, 0.0204, 0.0894,
         0.1220, 0.0715, 0.0055, 0.0349, 0.0201, 0.0227, 0.0161, 0.0258, 0.0161,
         0.0000, 0.0728, 0.0080, 0.0161, 0.1628, 0.1290, 0.0487, 0.0772, 0.0649,
         0.0044, 0.0324, 0.0242, 0.0201, 0.0202, 0.0243, 0.0731, 0.0227, 0.0313,
         0.0079, 0.0079, 0.0568, 0.0242, 0.0161, 0.0019, 0.0000, 0.0406, 0.0077,
         0.0363, 0.0242, 0.0047, 0.0079, 0.0895, 0.0402, 0.0487, 0.0202, 0.0638,
         0.0101, 0.0000, 0.0243, 0.0000, 0.0487, 0.0000, 0.0405, 0.0242, 0.0975,
         0.0081, 0.0275, 0.0161, 0.0649, 0.1058, 0.0308, 0.0034, 0.0405, 0.0242,
         0.0243, 0.0161, 0.0528, 0.0486, 0.0000, 0.0186, 0.0243, 0.0299, 0.0292,
         0.0079, 0.0202, 0.0

In [69]:
# Train for a single epoch on the "train"/"valid" loaders. Logs and checkpoints
# are written under "run"; load_best_on_end=True asks the runner to load the
# best checkpoint once training finishes.
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=data,
    logdir="run",
    load_best_on_end=True,
    num_epochs=1)

Iterator created on file:./sales_series_melt.parquet/parquet_partition=2
Done iterating: 3047 batches / (389957 cells) 
Iterator created on file:./sales_series_melt.parquet/parquet_partition=1
Done iterating: 3042 batches / (389299 cells) 
[2020-06-04 07:28:56,298] 
1/1 * Epoch 1 (_base): lr=0.0100 | momentum=0.9000
1/1 * Epoch 1 (train): loss=94.0748
1/1 * Epoch 1 (valid): loss=89.3520
Top best models:
run/checkpoints/train.1.pth	89.3520
=> Loading checkpoint run/checkpoints/best_full.pth
loaded state checkpoint run/checkpoints/best_full.pth (global epoch 1, epoch 1, stage train)


In [70]:
# Predictions for the stored training batch, for comparison with the earlier preview.
# Call the module itself rather than .forward() so nn.Module hooks are honoured.
model(trn_batch['features']).transpose(1, 0)

tensor([[3.4297, 3.3355, 3.9812, 5.1528, 3.2807, 3.8872, 2.7261, 4.2225, 2.7297,
         4.3285, 2.7261, 2.7261, 3.0483, 3.9366, 2.7261, 3.7669, 4.4599, 4.6809,
         0.0000, 0.0000, 2.7261, 0.0000, 2.7261, 4.0017, 3.4297, 0.0000, 4.0017,
         2.7261, 0.0000, 2.7261, 3.6705, 3.9812, 3.4297, 3.0932, 3.8445, 3.4297,
         0.0000, 2.7261, 0.0000, 2.7261, 3.7284, 2.7261, 0.0000, 3.2584, 0.0000,
         5.7260, 4.5804, 2.7261, 2.7261, 4.9226, 3.1384, 2.7261, 2.7261, 4.0627,
         2.7261, 3.1384, 2.7261, 3.1384, 2.7261, 4.0580, 2.7261, 3.5187, 3.7348,
         2.7261, 2.7261, 4.6924, 3.1581, 0.0000, 2.7261, 2.7261, 3.3166, 2.7261,
         0.0000, 0.0000, 3.2669, 0.0000, 0.0000, 4.2341, 2.7261, 2.7261, 2.7261,
         5.8434, 3.3166, 4.6947, 0.0000, 4.0017, 3.1581, 0.0000, 3.5567, 2.7261,
         3.4284, 0.0000, 2.7261, 0.0000, 0.0000, 5.2149, 0.0000, 3.6469, 2.7261,
         0.0000, 2.7261, 0.0000, 4.2341, 2.7261, 3.7712, 2.7261, 0.0000, 2.7261,
         0.0000, 2.7261, 3.1

In [71]:
# Predictions for the stored validation batch, for comparison with the earlier preview.
# Call the module itself rather than .forward() so nn.Module hooks are honoured.
model(valid_batch['features']).transpose(1, 0)

tensor([[2.7261, 2.7261, 4.1698, 2.7261, 0.0000, 4.9226, 0.0000, 3.1384, 0.0000,
         3.3166, 3.5567, 3.2532, 2.7261, 0.0000, 3.1384, 0.0000, 3.8799, 2.7261,
         3.2584, 0.0000, 4.4645, 3.8445, 3.7573, 0.0000, 0.0000, 2.7261, 4.4162,
         4.2063, 2.7297, 5.3830, 0.0000, 2.7261, 2.7261, 2.7261, 2.7261, 4.9226,
         5.8434, 4.4162, 2.7261, 3.1372, 2.7261, 2.7261, 3.1581, 2.7261, 3.1581,
         0.0000, 4.4553, 3.3166, 3.1384, 6.9966, 6.0413, 3.7798, 4.5796, 4.2318,
         2.7261, 2.7297, 2.7261, 2.7261, 2.7261, 2.7261, 4.4622, 2.7261, 2.7261,
         3.2584, 3.2584, 4.0017, 2.7261, 3.1384, 2.7261, 0.0000, 3.5656, 3.0483,
         3.2532, 2.7261, 2.7261, 3.2584, 4.9249, 3.5567, 3.7798, 2.7261, 4.1994,
         4.5478, 0.0000, 2.7261, 0.0000, 3.7798, 0.0000, 3.5634, 2.7261, 5.1528,
         3.4297, 2.7261, 3.1384, 4.2318, 5.3853, 2.7261, 2.7261, 3.5634, 2.7261,
         2.7261, 3.1384, 3.8917, 3.7777, 0.0000, 2.7261, 2.7261, 2.7261, 2.7261,
         3.2584, 2.7261, 3.2

In [72]:
# Run the runner's inference path on only the first validation batch
# (itertools is imported with the other imports at the top of the notebook).
# A dedicated loop variable avoids overwriting the integer `batch` size above.
for predicted in runner.predict_loader(loader=itertools.islice(data['valid'], 1)):
    print(predicted['logits'].transpose(1,0))

Iterator created on file:./sales_series_melt.parquet/parquet_partition=1
tensor([[3.5955, 5.1528, 2.7261, 3.1384, 2.7261, 2.7261, 3.2584, 2.7261, 4.2248,
         2.7261, 2.7261, 2.7261, 3.0483, 3.6469, 2.7261, 3.7584, 3.8782, 0.0000,
         3.5567, 2.7261, 4.2341, 4.1160, 3.3714, 3.7027, 3.7573, 0.0000, 4.9249,
         3.4297, 3.0483, 4.2318, 3.6384, 2.7261, 0.0000, 3.7798, 3.4019, 2.7261,
         3.5656, 2.7261, 2.7261, 2.7261, 2.7297, 3.5567, 3.5634, 3.9793, 3.2584,
         2.7261, 2.7261, 3.7712, 2.7297, 0.0000, 4.3285, 0.0000, 3.3287, 2.7261,
         3.4683, 2.7261, 3.1581, 3.7798, 4.1183, 3.1384, 0.0000, 3.9276, 3.5634,
         0.0000, 2.7261, 3.2584, 2.7261, 2.7261, 2.7261, 2.7261, 2.7261, 0.0000,
         0.0000, 3.2584, 0.0000, 2.7261, 2.7261, 4.0039, 2.7297, 3.8378, 3.7070,
         3.6194, 0.0000, 3.7798, 3.8872, 3.8799, 2.7261, 2.7261, 3.2807, 3.1581,
         2.7261, 3.1372, 2.7261, 2.7261, 2.7261, 3.5634, 0.0000, 3.1384, 3.7798,
         4.0017, 2.7261, 3.5242, 3.2