# Petastorm

In [None]:
import torch
from petastorm import make_reader, make_batch_reader
from petastorm.pytorch import DataLoader

In [1]:
DATA_DIR = 'file:///home/kachauha/Downloads/data_Q4_2018_parquet/part.0.parquet'
with DataLoader(make_batch_reader(DATA_DIR, num_epochs=10), batch_size=64) as train_loader:
    for batch in train_loader:
        print(batch)

TypeError: 'NoneType' object is not iterable

**RESULT** not ready for plug n play

# Dask + Pandas + Pytorch

- first need to filter out serial numbers that lasted less than window_size, then filter out columns not used in training, then group data by serial number and save those csvs
- For loading in batches of serial numbers, intuitively makes more sense to store data serial number wise instead of date wise

In [1]:
import os
import time
import glob

import numpy as np
import pandas as pd
import dask.dataframe as dd

import joblib
from joblib import Parallel, delayed, parallel_backend

In [2]:
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
    "smart_1_normalized": "float32",
    "smart_1_raw": "float32",
    "smart_2_normalized": "float32",
    "smart_2_raw": "float32",
    "smart_3_normalized": "float32",
    "smart_3_raw": "float32",
    "smart_4_normalized": "float32",
    "smart_4_raw": "float32",
    "smart_5_normalized": "float32",
    "smart_5_raw": "float32",
    "smart_7_normalized": "float32",
    "smart_7_raw": "float32",
    "smart_8_normalized": "float32",
    "smart_8_raw": "float32",
    "smart_9_normalized": "float32",
    "smart_9_raw": "float32",
    "smart_10_normalized": "float32",
    "smart_10_raw": "float32",
    "smart_11_normalized": "float32",
    "smart_11_raw": "float32",
    "smart_12_normalized": "float32",
    "smart_12_raw": "float32",
    "smart_13_normalized": "float32",
    "smart_13_raw": "float32",
    "smart_15_normalized": "float32",
    "smart_15_raw": "float32",
    "smart_16_normalized": "float32",
    "smart_16_raw": "float32",
    "smart_17_normalized": "float32",
    "smart_17_raw": "float32",
    "smart_22_normalized": "float32",
    "smart_22_raw": "float32",
    "smart_23_normalized": "float32",
    "smart_23_raw": "float32",
    "smart_24_normalized": "float32",
    "smart_24_raw": "float32",
    "smart_168_normalized": "float32",
    "smart_168_raw": "float32",
    "smart_170_normalized": "float32",
    "smart_170_raw": "float32",
    "smart_173_normalized": "float32",
    "smart_173_raw": "float32",
    "smart_174_normalized": "float32",
    "smart_174_raw": "float32",
    "smart_177_normalized": "float32",
    "smart_177_raw": "float32",
    "smart_179_normalized": "float32",
    "smart_179_raw": "float32",
    "smart_181_normalized": "float32",
    "smart_181_raw": "float32",
    "smart_182_normalized": "float32",
    "smart_182_raw": "float32",
    "smart_183_normalized": "float32",
    "smart_183_raw": "float32",
    "smart_184_normalized": "float32",
    "smart_184_raw": "float32",
    "smart_187_normalized": "float32",
    "smart_187_raw": "float32",
    "smart_188_normalized": "float32",
    "smart_188_raw": "float32",
    "smart_189_normalized": "float32",
    "smart_189_raw": "float32",
    "smart_190_normalized": "float32",
    "smart_190_raw": "float32",
    "smart_191_normalized": "float32",
    "smart_191_raw": "float32",
    "smart_192_normalized": "float32",
    "smart_192_raw": "float32",
    "smart_193_normalized": "float32",
    "smart_193_raw": "float32",
    "smart_194_normalized": "float32",
    "smart_194_raw": "float32",
    "smart_195_normalized": "float32",
    "smart_195_raw": "float32",
    "smart_196_normalized": "float32",
    "smart_196_raw": "float32",
    "smart_197_normalized": "float32",
    "smart_197_raw": "float32",
    "smart_198_normalized": "float32",
    "smart_198_raw": "float32",
    "smart_199_normalized": "float32",
    "smart_199_raw": "float32",
    "smart_200_normalized": "float32",
    "smart_200_raw": "float32",
    "smart_201_normalized": "float32",
    "smart_201_raw": "float32",
    "smart_218_normalized": "float32",
    "smart_218_raw": "float32",
    "smart_220_normalized": "float32",
    "smart_220_raw": "float32",
    "smart_222_normalized": "float32",
    "smart_222_raw": "float32",
    "smart_223_normalized": "float32",
    "smart_223_raw": "float32",
    "smart_224_normalized": "float32",
    "smart_224_raw": "float32",
    "smart_225_normalized": "float32",
    "smart_225_raw": "float32",
    "smart_226_normalized": "float32",
    "smart_226_raw": "float32",
    "smart_231_normalized": "float32",
    "smart_231_raw": "float32",
    "smart_232_normalized": "float32",
    "smart_232_raw": "float32",
    "smart_233_normalized": "float32",
    "smart_233_raw": "float32",
    "smart_235_normalized": "float32",
    "smart_235_raw": "float32",
    "smart_240_normalized": "float32",
    "smart_240_raw": "float32",
    "smart_241_normalized": "float32",
    "smart_241_raw": "float32",
    "smart_242_normalized": "float32",
    "smart_242_raw": "float32",
    "smart_250_normalized": "float32",
    "smart_250_raw": "float32",
    "smart_251_normalized": "float32",
    "smart_251_raw": "float32",
    "smart_252_normalized": "float32",
    "smart_252_raw": "float32",
    "smart_254_normalized": "float32",
    "smart_254_raw": "float32",
    "smart_255_normalized": "float32",
    "smart_255_raw": "float32",
}

## Reformat and save data

In [60]:
csv_paths = ['/home/kachauha/Downloads/data_Q4_2018/2018-10-01.csv',
             '/home/kachauha/Downloads/data_Q4_2018/2018-10-02.csv',
             '/home/kachauha/Downloads/data_Q4_2018/2018-10-03.csv']
ddf = dd.read_csv(csv_paths[0], dtype=custom_dtypes)
for path in csv_paths[1:]:
    ddf = ddf.append(dd.read_csv(path, dtype=custom_dtypes))

((298861, 129),)

In [5]:
# time window slice that will be fed to lstm
time_window = 2

In [137]:
# first 9 days worth of data
csv_paths = glob.glob('/home/kachauha/Downloads/data_Q4_2018/2018-10-0*.csv')
df = pd.read_csv(csv_paths[0])
for path in csv_paths[1:]:
    df = df.append(pd.read_csv(path), ignore_index=True)

# keep only a small subset for experimenting - keep 10% of each frequency
subset_sers = []
vc = df['serial_number'].value_counts()
for count in vc.unique():
    if count > time_window:
        subset_sers = np.append(subset_sers, vc[vc==count].sample(frac=0.1).index.values)
print(len(subset_sers), 'serial numbers kept')
        
df = df[df['serial_number'].isin(subset_sers)]
print('df shape =', df.shape)
df.head()

10084 serial numbers kept
df shape = (90212, 129)


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2018-10-05,Z305B2QN,ST4000DM000,4000787030016,0,119.0,221968208.0,,,91.0,...,,,,,,,,,,
21,2018-10-05,PL1331LAHD1HTH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,103.0,100.0,...,,,,,,,,,,
30,2018-10-05,ZA13YGBE,ST8000DM002,8001563222016,0,78.0,70714944.0,,,92.0,...,,,,,,,,,,
31,2018-10-05,ZA18CEB3,ST8000NM0055,8001563222016,0,78.0,57687512.0,,,96.0,...,,,,,,,,,,
58,2018-10-05,ZJV0T566,ST12000NM0007,12000138625024,0,77.0,45629552.0,,,98.0,...,,,,,,,,,,


In [138]:
def save_group(ser, ser_df):
    SAVE_DIR = '/home/kachauha/Downloads/data_Q4_2018_serials/'
    ser_df.to_csv(os.path.join(SAVE_DIR, ser + '.csv'), index=False)

In [139]:
start = time.time()
_ = Parallel(n_jobs=-1, prefer='threads')(
    delayed(save_group)(s, d) for s,d in df.groupby('serial_number'))
end = time.time()
print(end-start)

24.355740785598755


## Load in time window slices

In [22]:
batch_size = 5

In [6]:
import itertools
import random

In [17]:
# experiment
# total serials
num_total_sers = 100

# generate varying length (from min to 4*min) random number filled sequences
ts_data = []
for i in range(num_total_sers):
    curr_ts_len = random.randint(time_window, 4*time_window)
    ts_data.append(list(np.random.random(size=(curr_ts_len))))

In [19]:
ts_data[:3]

[[0.96310499342986,
  0.7591460584111961,
  0.8389684098703237,
  0.4648483219318159,
  0.5933514956859721,
  0.7100772916714108],
 [0.8530215355855947,
  0.557167740304858,
  0.45128951694050734,
  0.5501080968376889,
  0.7432796469530133,
  0.7006215439612169],
 [0.2566581185645632,
  0.2798830182815527,
  0.414283457119197,
  0.4162419398995375,
  0.38434181581618565,
  0.756601568539205,
  0.8950328881222918,
  0.7806650368943411]]

In [30]:
# itertool experiments
testlist =[[1,2,3,4], [5,6,7], [8,9,10], [11,12,13,14,15]]
for i in itertools.zip_longest(*testlist):
    print(i)

(1, 5, 8, 11)
(2, 6, 9, 12)
(3, 7, 10, 13)
(4, None, None, 14)
(None, None, None, 15)


In [35]:
# serial numbers in current batch
curr_batch_sers = 

Index(['Z30218JY', 'ZA16DWQK', 'ZA12Y7GP', 'VKGZ9VRX', 'ZA13E6KE', 'Z302A1D3',
       'PL2331LAHAWVGJ', 'ZCH083A5', 'PL1331LAGT9P1H', 'ZA16V3D3'],
      dtype='object')