In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
class z24Dataset(Dataset):
    def __init__(self, window_size=100, skipstep=50, lookahead=1, include_environmental_data=False):
        self.window_size = window_size
        self.skipstep = skipstep
        self.lookahead = lookahead
        self.include_environmental_data = include_environmental_data        
        self.index_file = np.loadtxt('../data/z24/name_to_index.txt',dtype=str)
        self.name_index_dict = dict(zip(range(len(self.index_file)),list(self.index_file)))
        self.slices_per_file  = 65536//self.skipstep if (self.window_size+self.lookahead) < self.skipstep else \
                               (65536//self.skipstep) - (self.window_size+self.lookahead)
        
    def __len__(self):
        return len(self.index_file) * self.slices_per_file

    def __getitem__(self, index):
        index_to_read = index // self.slices_per_file
        file_to_read = self.name_index_dict[index_to_read]
        index_in_dataframe = index - index_to_read*self.slices_per_file
        
        df_list = []
        for end in ['03','05','06', '07', '10', '12', '14', '16']:
            df = pd.read_csv('../data/z24/permanent/'+file_to_read+end+'.aaa', sep=' ', nrows=65536, skiprows=1)
            df.columns = [end]    
            df_list.append(df)
        data = pd.concat(df_list, axis=1).as_matrix()
        
        return data[index_in_dataframe:index_in_dataframe+self.window_size,:]

In [36]:
test = z24Dataset(window_size=100, skipstep=50)

In [25]:
dataloader = DataLoader(test, batch_size=100,
                        shuffle=False, num_workers=2)

In [26]:
%%time
next(iter(dataloader))

CPU times: user 7 ms, sys: 15 ms, total: 22 ms
Wall time: 15.5 s



( 0 ,.,.) = 
  1.0000e-02  1.0000e-02  1.0000e-02  ...   1.0000e-02  1.0000e-02  1.0000e-02
 -3.1667e-05  5.3390e-04  3.5500e-05  ...   1.9184e-05 -4.4166e-05  8.8667e-05
 -3.5333e-05  5.3411e-04  3.1833e-05  ...   1.7445e-05 -4.4799e-05  8.8000e-05
                 ...                   ⋱                   ...                
 -3.4333e-05  5.3305e-04  2.5167e-05  ...   1.0330e-05 -4.9542e-05  8.2000e-05
 -4.1333e-05  5.3411e-04  3.0000e-05  ...   1.6444e-05 -4.8488e-05  8.2667e-05
 -4.0333e-05  5.3400e-04  3.4333e-05  ...   1.8710e-05 -4.6802e-05  9.0000e-05

( 1 ,.,.) = 
 -3.1667e-05  5.3390e-04  3.5500e-05  ...   1.9184e-05 -4.4166e-05  8.8667e-05
 -3.5333e-05  5.3411e-04  3.1833e-05  ...   1.7445e-05 -4.4799e-05  8.8000e-05
 -4.0333e-05  5.3348e-04  2.9833e-05  ...   1.5337e-05 -4.5853e-05  8.9667e-05
                 ...                   ⋱                   ...                
 -4.1333e-05  5.3411e-04  3.0000e-05  ...   1.6444e-05 -4.8488e-05  8.2667e-05
 -4.0333e-05  5.3400e-04

In [22]:
for i_batch, batch in enumerate(dataloader):
    print(i_batch)
    continue

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


KeyboardInterrupt: 

Process Process-26:
Process Process-25:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/dsteinar/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dsteinar/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/dsteinar/miniconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dsteinar/miniconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dsteinar/miniconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 42, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/dsteinar/miniconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 42, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/ds

In [70]:
%%timeit
skip=65000
tst=pd.read_csv('../data/z24/permanent/'+'12A18'+'03'+'.aaa', sep=' ', nrows=65536-skip, skiprows=skip)

5.84 ms ± 341 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [71]:
%%timeit
tst1=pd.read_csv('../data/z24/permanent/'+'12A18'+'03'+'.aaa', sep=' ', nrows=65536, skiprows=1)

18.9 ms ± 869 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [66]:
tst.shape

(536, 1)

In [67]:
tst1.shape

(6553, 1)