In [34]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from typing import Union, Dict, List
import numpy as np
from sklearn import preprocessing
from torch.utils.data import Dataset
from datetime import datetime

from utils import load_config

In [107]:
class ChildInstituteDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        date_num = int(self.data[idx].get('date').replace('-', ''))
        return {
            'X': torch.from_numpy(self.data[idx].get('X')),
            'y': torch.tensor(self.data[idx]['event'], dtype=torch.float32),
            'series_id': torch.Tensor([self.data[idx].get('series_id')]),
            'date': date_num,
            'step': torch.Tensor([self.data[idx].get('step')]),
        }
    # def __getitem__(self, idx):
    #     date = datetime.strptime(self.data[idx].get('date'), '%Y-%m-%d')
    #     year = date.year
    #     month = date.month
    #     day = date.day
    #     return {
    #         'X': torch.from_numpy(self.data[idx].get('X')),
    #         'y': torch.tensor(self.data[idx]['event'], dtype=torch.float32),
    #         'series_id': torch.Tensor([self.data[idx].get('series_id')]),
    #         'year': torch.tensor([year], dtype=torch.int32),
    #         'month': torch.tensor([month], dtype=torch.int32), 
    #         'day': torch.tensor([day], dtype=torch.int32),
    #         'step': torch.Tensor([self.data[idx].get('step')]),
    #     }

def preprocess(data, key: List[str] = ['series_id'], **kwargs) -> pd.DataFrame:
    data.rename(columns={'timestamp': 'date'}, inplace=True)
    data['date'] = pd.to_datetime(data['date'], utc=True)
    data['date'] = data['date'].dt.date.astype(str)

    return data

def scale(data, config: Dict[str, str]) -> None:
    excluding_columnns = config.get('general').get('data').get('excluding_columns')
    target_columns = list(set(data.columns) - set(excluding_columnns))
    scaler = getattr(preprocessing, config.get('train').get('data').get('scaler'))()

    data.loc[:, target_columns] = scaler.fit_transform(data.filter(items=target_columns))


def to_list(data, window_size: int, config: Dict[str, str], step: int = 1, key: List[str] = ['series_id']) -> List[pd.DataFrame]:
    data = [datum[1] for datum in data.groupby(key)]
    for datum in data:
        scale(datum, config)

    start_of_feature_index = np.where(data[0].columns.str.find('event') == 0)[0].item()
    slided_window = [
            np.lib.stride_tricks.sliding_window_view(
                datum.iloc[:, start_of_feature_index:],
                window_size,
                axis=0)[::step]
            for datum
            in data]

    return np.concatenate(slided_window, dtype=np.float32)

def extract_keys(data, window_size: int, step: int = 1, key: List[str] = ['series_id']):
    return (
        data.groupby(key).apply(lambda x: x.iloc[window_size-1:])
        .drop(columns=key)
        .reset_index()
        .drop(columns=[f'level_{len(key)}', 'anglez', 'enmo'])
        .to_dict('records')
    )[::step]


In [108]:
test_data = pd.read_parquet('./data/test_series.parquet')
test_data['event'] = 0
test_data = test_data[['series_id', 'timestamp', 'step', 'event', 'anglez', 'enmo']]

In [109]:
test_data['series_id'], unique_series = pd.factorize(test_data['series_id'])
id_map = {idx: id_ for idx, id_ in enumerate(unique_series)}
id_map

{0: '038441c925bb', 1: '03d92c9f6f8a', 2: '0402a003dae9'}

In [110]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

config = load_config('./configs')

window_size = config.get('inference').get('window_size')
step = config.get('inference').get('step')

preprocessed_data = preprocess(test_data)

test_list = to_list(preprocessed_data, window_size, config, step)
test_keys = extract_keys(preprocessed_data, window_size, step)

for i, test_key in enumerate(test_keys):
        test_key['X'] = test_list[i]
test_list = test_keys

test_dataset = ChildInstituteDataset(test_list)

batch_size = config.get('inference').get('batch_size')
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [111]:
test_list

[{'series_id': 0,
  'date': '2018-08-14',
  'step': 19,
  'event': 0,
  'X': array([[0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
          0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
          0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
          0.       , 0.       ],
         [1.6763793, 1.676382 , 1.6763872, 1.676382 , 1.676382 , 1.6763793,
          1.6763793, 1.6763793, 1.6805546, 1.6879761, 1.6800939, 1.6733716,
          1.6705861, 1.6705861, 1.6705861, 1.6705861, 1.6705861, 1.6705941,
          2.8991497, 3.0279403],
         [1.1931645, 1.1583107, 1.1757376, 1.1234565, 1.1583107, 1.1931645,
          1.1931645, 1.2105914, 1.2977259, 1.1931645, 1.4022878, 1.1757376,
          1.2105914, 1.1408838, 1.1408838, 1.1408838, 1.1583107, 1.1931645,
          4.29516  , 0.3043905]], dtype=float32)},
 {'series_id': 0,
  'date': '2018-08-14',
  'step': 29,
  'event': 0,
  'X': array([[ 0.        ,  0.        ,  0.        , 

In [112]:
batch = next(iter(test_dataloader))

In [113]:
batch

{'X': tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [1.6764, 1.6764, 1.6764, 1.6764, 1.6764, 1.6764, 1.6764, 1.6764,
           1.6806, 1.6880, 1.6801, 1.6734, 1.6706, 1.6706, 1.6706, 1.6706,
           1.6706, 1.6706, 2.8991, 3.0279],
          [1.1932, 1.1583, 1.1757, 1.1235, 1.1583, 1.1932, 1.1932, 1.2106,
           1.2977, 1.1932, 1.4023, 1.1757, 1.2106, 1.1409, 1.1409, 1.1409,
           1.1583, 1.1932, 4.2952, 0.3044]]]),
 'y': tensor([0.]),
 'series_id': tensor([[0.]]),
 'date': tensor([20180814]),
 'step': tensor([[19.]])}

In [None]:
submission['series_id'] = submission['series_id'].map(id_map)

In [39]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from typing import Union, Dict, List
import numpy as np
from datetime import datetime

from data import ChildInstituteDataset, to_list, extract_keys
from utils import load_config

In [40]:
config = load_config()
test_data_path = config.get('general').get('test_data').get('path')

In [41]:
test_data = pd.read_parquet(test_data_path)
test_data['event'] = 0
test_data = test_data[['series_id', 'timestamp', 'step', 'event', 'anglez', 'enmo']]
test_data['series_id'], unique_series = pd.factorize(test_data['series_id'])
id_map = {idx: id_ for idx, id_ in enumerate(unique_series)}

In [42]:
test_data.head()

Unnamed: 0,series_id,timestamp,step,event,anglez,enmo
0,0,2018-08-14T15:30:00-0400,0,0,2.6367,0.0217
1,0,2018-08-14T15:30:05-0400,1,0,2.6368,0.0215
2,0,2018-08-14T15:30:10-0400,2,0,2.637,0.0216
3,0,2018-08-14T15:30:15-0400,3,0,2.6368,0.0213
4,0,2018-08-14T15:30:20-0400,4,0,2.6368,0.0215


In [46]:
def preprocess(data, key: List[str] = ['series_id'], **kwargs) -> pd.DataFrame:
    data.rename(columns={'timestamp': 'date'}, inplace=True)

    if isinstance(data.date[0], (np.int0, np.int8, np.int16, np.int32, np.int64)):
        return data

    if not isinstance(data.date[0], datetime):
        data.date = data.date.astype(str).str.replace(r'[-+]\d{2}00$', '',regex=True)
        data.date = pd.to_datetime(
            data.date,
            format='%Y-%m-%dT%H:%M:%S',
            utc=True)
        data.date = data.date.dt.date
        data.date = data.date.astype('datetime64[ns]')

    if isinstance(data.date[0], datetime):
        data.date = (
            data.date
            .fillna(-1)
            .astype(str)
            .str.replace('-', '')
            .str.replace('^20', '', regex=True)
            .astype(np.int32)
        ) - 100000

    return data

preprocessed_data = preprocess(test_data)

In [47]:
preprocessed_data

Unnamed: 0,series_id,date,step,event,anglez,enmo
0,0,80814,0,0,2.636700,0.0217
1,0,80814,1,0,2.636800,0.0215
2,0,80814,2,0,2.637000,0.0216
3,0,80814,3,0,2.636800,0.0213
4,0,80814,4,0,2.636800,0.0215
...,...,...,...,...,...,...
445,2,81218,145,0,-59.696899,0.0601
446,2,81218,146,0,-35.656601,0.0427
447,2,81218,147,0,-21.582399,0.0309
448,2,81218,148,0,-42.616001,0.0328
