## Data Ingestion + Cleanup

In [1]:
import os
import gc
import glob
import time
import utils
import joblib
import mlflow
import pyarrow
import datetime

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client

from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

from joblib import delayed, parallel_backend, Parallel

  class heapdict(collections.MutableMapping):


In [2]:
# reproducibility is major key
rand_seed = 42
np.random.seed(rand_seed)

glob_init_state = np.random.get_state()
rand_init_state = np.random.RandomState(rand_seed)

In [3]:
# client = Client()
pbar = ProgressBar()
pbar.register()

In [4]:
# define thresholds as timedelta
BAD_THRESHOLD_NDAYS = np.timedelta64(14, 'D')
WARNING_THRESHOLD_NDAYS = np.timedelta64(42, 'D')

In [5]:
# how many days of data in a chunk (that is passed to ceph)
time_window = 6

In [6]:
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
    "smart_1_normalized": "float32",
    "smart_1_raw": "float32",
    "smart_2_normalized": "float32",
    "smart_2_raw": "float32",
    "smart_3_normalized": "float32",
    "smart_3_raw": "float32",
    "smart_4_normalized": "float32",
    "smart_4_raw": "float32",
    "smart_5_normalized": "float32",
    "smart_5_raw": "float32",
    "smart_7_normalized": "float32",
    "smart_7_raw": "float32",
    "smart_8_normalized": "float32",
    "smart_8_raw": "float32",
    "smart_9_normalized": "float32",
    "smart_9_raw": "float32",
    "smart_10_normalized": "float32",
    "smart_10_raw": "float32",
    "smart_11_normalized": "float32",
    "smart_11_raw": "float32",
    "smart_12_normalized": "float32",
    "smart_12_raw": "float32",
    "smart_13_normalized": "float32",
    "smart_13_raw": "float32",
    "smart_15_normalized": "float32",
    "smart_15_raw": "float32",
    "smart_16_normalized": "float32",
    "smart_16_raw": "float32",
    "smart_17_normalized": "float32",
    "smart_17_raw": "float32",
    "smart_22_normalized": "float32",
    "smart_22_raw": "float32",
    "smart_23_normalized": "float32",
    "smart_23_raw": "float32",
    "smart_24_normalized": "float32",
    "smart_24_raw": "float32",
    "smart_168_normalized": "float32",
    "smart_168_raw": "float32",
    "smart_170_normalized": "float32",
    "smart_170_raw": "float32",
    "smart_173_normalized": "float32",
    "smart_173_raw": "float32",
    "smart_174_normalized": "float32",
    "smart_174_raw": "float32",
    "smart_177_normalized": "float32",
    "smart_177_raw": "float32",
    "smart_179_normalized": "float32",
    "smart_179_raw": "float32",
    "smart_181_normalized": "float32",
    "smart_181_raw": "float32",
    "smart_182_normalized": "float32",
    "smart_182_raw": "float32",
    "smart_183_normalized": "float32",
    "smart_183_raw": "float32",
    "smart_184_normalized": "float32",
    "smart_184_raw": "float32",
    "smart_187_normalized": "float32",
    "smart_187_raw": "float32",
    "smart_188_normalized": "float32",
    "smart_188_raw": "float32",
    "smart_189_normalized": "float32",
    "smart_189_raw": "float32",
    "smart_190_normalized": "float32",
    "smart_190_raw": "float32",
    "smart_191_normalized": "float32",
    "smart_191_raw": "float32",
    "smart_192_normalized": "float32",
    "smart_192_raw": "float32",
    "smart_193_normalized": "float32",
    "smart_193_raw": "float32",
    "smart_194_normalized": "float32",
    "smart_194_raw": "float32",
    "smart_195_normalized": "float32",
    "smart_195_raw": "float32",
    "smart_196_normalized": "float32",
    "smart_196_raw": "float32",
    "smart_197_normalized": "float32",
    "smart_197_raw": "float32",
    "smart_198_normalized": "float32",
    "smart_198_raw": "float32",
    "smart_199_normalized": "float32",
    "smart_199_raw": "float32",
    "smart_200_normalized": "float32",
    "smart_200_raw": "float32",
    "smart_201_normalized": "float32",
    "smart_201_raw": "float32",
    "smart_218_normalized": "float32",
    "smart_218_raw": "float32",
    "smart_220_normalized": "float32",
    "smart_220_raw": "float32",
    "smart_222_normalized": "float32",
    "smart_222_raw": "float32",
    "smart_223_normalized": "float32",
    "smart_223_raw": "float32",
    "smart_224_normalized": "float32",
    "smart_224_raw": "float32",
    "smart_225_normalized": "float32",
    "smart_225_raw": "float32",
    "smart_226_normalized": "float32",
    "smart_226_raw": "float32",
    "smart_231_normalized": "float32",
    "smart_231_raw": "float32",
    "smart_232_normalized": "float32",
    "smart_232_raw": "float32",
    "smart_233_normalized": "float32",
    "smart_233_raw": "float32",
    "smart_235_normalized": "float32",
    "smart_235_raw": "float32",
    "smart_240_normalized": "float32",
    "smart_240_raw": "float32",
    "smart_241_normalized": "float32",
    "smart_241_raw": "float32",
    "smart_242_normalized": "float32",
    "smart_242_raw": "float32",
    "smart_250_normalized": "float32",
    "smart_250_raw": "float32",
    "smart_251_normalized": "float32",
    "smart_251_raw": "float32",
    "smart_252_normalized": "float32",
    "smart_252_raw": "float32",
    "smart_254_normalized": "float32",
    "smart_254_raw": "float32",
    "smart_255_normalized": "float32",
    "smart_255_raw": "float32",
}

In [7]:
DATA_ROOT_DIR = '/home/kachauha/Downloads/'
MANUFACTURER = 'seagate'

In [8]:
# for now, keep the highly correlated ones but remove 194. remove 240, 242 as well (too low corr)
CRITICAL_STATS = [1, 5, 7, 10, 187, 188, 190, 193, 197, 198, 241]
crit_cols_raw = ['smart_{}_raw'.format(i) for i in CRITICAL_STATS]
crit_cols_normalized = ['smart_{}_normalized'.format(i) for i in CRITICAL_STATS]
keep_cols = ['date', 'serial_number', 'model', 'capacity_bytes', 'failure'] + crit_cols_raw + crit_cols_normalized

In [9]:
# read all the data into one dataframe
df = dd.read_parquet(os.path.join(DATA_ROOT_DIR, 'data_Q4_2018_parquet'),
                     columns=keep_cols,
                     engine='pyarrow',
                     index=False)
seagate_df = df[df['model'].str.startswith('S')]

In [10]:
# remove nans
seagate_df = seagate_df[(~seagate_df['smart_1_raw'].isna())\
                       & (~seagate_df['smart_5_raw'].isna())\
                       & (~seagate_df['smart_187_raw'].isna())\
                       & (~seagate_df['smart_193_raw'].isna())\
                       & (~seagate_df['smart_241_raw'].isna())]

In [11]:
# may be useful for saving differently
failed_sers = seagate_df[seagate_df['failure']==1]['serial_number'].compute()
failed_sers.head()

[########################################] | 100% Completed |  8.2s


29978    Z305B8DE
38504    ZA10CTP4
38544    ZA114N2J
40880    ZCH0CLFQ
84763    S30116JR
Name: serial_number, dtype: object

In [12]:
# convert from str to datetime
seagate_df['date'] = seagate_df['date'].astype('datetime64')

## Prepare + Preprocess

In [13]:
# =============================== FOR DASK =============================== #
# create meta of the resulting failed_df otherwise dask complains
rul_meta = seagate_df._meta
rul_meta = rul_meta.assign(rul_days=rul_meta['date'].max()-rul_meta['date'])
# ======================================================================== #

# get remaining useful life as diff(today, maxday)
# reset index coz result is multiindexed. drop=True coz serial_number already exists as a col
seagate_df = seagate_df.groupby('serial_number').apply(utils.append_rul_days_column, meta=rul_meta).reset_index(drop=True)

In [14]:
# remove working drive data that is recorded after [quarter end minus 6 weeks]
# because we dont know (as of quarter end) if those drives survived more than 6 weeks or not
seagate_df = seagate_df[(seagate_df['serial_number'].isin(failed_sers)) | (seagate_df['rul_days'] >= WARNING_THRESHOLD_NDAYS)]

In [17]:
# drop serial numbers where we have less than 6 days of data -- ceph upstream rejects these
seagate_df = seagate_df[seagate_df['rul_days'] >= np.timedelta64(time_window, 'D')]

In [18]:
# NOTE: assignment must be done in th
# df.head()is order otherwise it wont be correct. FIXME
# assign all as good initially
seagate_df['status'] = 0

# overwrite those which have rul less than 6 weeks as warning
seagate_df['status'] = seagate_df['status'].mask(seagate_df['rul_days'] < WARNING_THRESHOLD_NDAYS, 1)

# overwrite those which have rul less than 2 weeks as bad
seagate_df['status'] = seagate_df['status'].mask(seagate_df['rul_days'] < BAD_THRESHOLD_NDAYS, 2)

### Add month, day

In [19]:
seagate_df = seagate_df.assign(month=seagate_df['date'].dt.month)
seagate_df = seagate_df.assign(day=seagate_df['date'].dt.day)

## Save to Mem

### TODO
- pull all in memory like in cdfp then save drives
- there are several drives (15%) which lived less than 6 days. should these be removed since the upstream removes them too


In [20]:
SAVE_DIR = '/home/kachauha/Downloads/data_Q4_2018_serials/'
FAIL_DIR = os.path.join(SAVE_DIR, 'failed')
WORK_DIR = os.path.join(SAVE_DIR, 'working')
META_DIR = os.path.join(SAVE_DIR, 'meta')

In [21]:
def save_files(df, SAVE_DIR):
    """
    Splits input dataframe into serial number wise groups, then saves each serial number
    group as a csv file
    """
    def save_group(ser, ser_df):
        ser_df.to_csv(os.path.join(SAVE_DIR, ser + '.csv'), index=False)
    
    # spawn a thread for each serial
    start = time.time()
    _ = Parallel(n_jobs=-1, prefer='threads')(
        delayed(save_group)(s, d) for s,d in df.groupby('serial_number'))
    end = time.time()
    print(end-start)

### Failed

In [23]:
# save all failed ones
failed_df = seagate_df[seagate_df['serial_number'].isin(failed_sers)]

In [25]:
save_files(failed_df.compute(), FAIL_DIR)

[########################################] | 100% Completed |  3min 56.8s
1.3347220420837402


### Working

In [26]:
# get all working serials value counts (how many days of data did we have)
working_sers_vc = seagate_df['serial_number'][~seagate_df['serial_number'].isin(failed_sers)].value_counts().compute()

[########################################] | 100% Completed |  4min 12.2s


In [31]:
shorts = seagate_df[seagate_df['rul_days'] < np.timedelta64(time_window, 'D')].compute()
shorts

[########################################] | 100% Completed |  4min 11.7s


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_5_raw,smart_7_raw,smart_10_raw,smart_187_raw,...,smart_188_normalized,smart_190_normalized,smart_193_normalized,smart_197_normalized,smart_198_normalized,smart_241_normalized,rul_days,status,month,day


In [33]:
seagate_df[seagate_df['serial_number']=='ZJV03MPR'].compute()

[########################################] | 100% Completed |  3min 53.8s


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_5_raw,smart_7_raw,smart_10_raw,smart_187_raw,...,smart_188_normalized,smart_190_normalized,smart_193_normalized,smart_197_normalized,smart_198_normalized,smart_241_normalized,rul_days,status,month,day
69280,2018-10-02,ZJV03MPR,ST12000NM0007,12000140000000.0,0.0,83231976.0,0.0,1298041.0,0.0,0.0,...,100.0,66.0,100.0,100.0,100.0,100.0,90 days,0,10,2


# FIXME
some serial numbers occur <6 times even though we have filtered out data points with rul <6.
ZJV03MPR shows rul 90 but then thats it. no other data for ruls <90

In [32]:
working_sers_vc.sort_values()

ZJV0MJTM     1
ZJV03MPR     1
ZCH0DYQM     2
ZA21HJEA     3
ZCH04TBP     3
            ..
ZA11QS54    50
ZA11QQZJ    50
ZA11QNZ3    50
ZJV00P1J    50
ZA10BWTP    50
Name: serial_number, Length: 76955, dtype: int64

In [29]:
# we should have already dropped serial numbers for which there is <6 days of data
assert working_sers_vc.min() >= time_window

AssertionError: 

In [51]:
# keep only a small subset for experimenting - keep 10% of each frequency
subset_sers = []
for count in working_sers_vc.unique():
    subset_sers = np.append(subset_sers, working_sers_vc[working_sers_vc==count].sample(frac=0.4, random_state=rand_seed).index.values)
print(len(subset_sers), 'serial numbers kept')

15384 serial numbers kept


In [52]:
working_df = seagate_df[seagate_df['serial_number'].isin(subset_sers)]

In [64]:
save_files(working_df.compute(), WORK_DIR)

[########################################] | 100% Completed |  4min  4.3s
70.35569500923157


## Metadata

In [22]:
feat_cols = ['capacity_bytes'] + crit_cols_raw + crit_cols_normalized + ['month', 'day']

In [54]:
# concatenate the two
df = working_df.append(failed_df)

# convert from bytes to gigabytes
df['capacity_bytes'] /= 10**9

# metadata for normalizing
means =  df[feat_cols].mean().compute()
stds = df[feat_cols].std().compute()

[########################################] | 100% Completed |  3min  1.1s
[##################                      ] | 46% Completed | 47.7s

  x = np.divide(x1, x2, out)


[########################################] | 100% Completed |  3min 10.2s


In [66]:
# save all metadata
# NOTE: must keep index
means.to_csv(os.path.join(META_DIR, 'means.csv'))
stds.to_csv(os.path.join(META_DIR, 'stds.csv'))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


## Get serials already saved then compute meta

#### FIXME: capacity needs to be smaller numbers otherwise mean centering does not work

In [106]:
# get serial numbers
failed_sers = [f.split('.')[0] for f in os.listdir(FAIL_DIR) if os.path.isfile(os.path.join(FAIL_DIR, f)) and f.endswith('.csv')]
working_sers = [f.split('.')[0] for f in os.listdir(WORK_DIR) if os.path.isfile(os.path.join(WORK_DIR, f)) and f.endswith('.csv')]

In [107]:
relevant_df = df[(df['serial_number'].isin(failed_sers)) | (df['serial_number'].isin(working_sers))].compute()

[########################################] | 100% Completed |  4.7s


In [108]:
# convert from bytes to gigabytes
relevant_df['capacity_bytes'] /= 10**9

In [109]:
means = relevant_df[feat_cols].mean()
stds = relevant_df[feat_cols].std()

In [110]:
META_DIR = '/home/kachauha/Downloads/data_Q4_2018_serials/meta'
means.to_csv(os.path.join(META_DIR, 'means.csv'))
stds.to_csv(os.path.join(META_DIR, 'stds.csv'))

In [117]:
pd.read_csv(os.path.join(META_DIR, 'means.csv'), header=None).set_index(0).transpose()

Unnamed: 0,capacity_bytes,smart_1_raw,smart_5_raw,smart_7_raw,smart_10_raw,smart_187_raw,smart_188_raw,smart_190_raw,smart_193_raw,smart_197_raw,...,smart_5_normalized,smart_7_normalized,smart_10_normalized,smart_187_normalized,smart_188_normalized,smart_190_normalized,smart_193_normalized,smart_197_normalized,smart_198_normalized,smart_241_normalized
1,7956.243344,121972800.0,578.490021,2837305000.0,0.0,5.801774,288814900.0,28.788683,33090.953144,11.695257,...,99.797553,85.965448,100.0,96.777237,100.0,71.211317,87.061664,99.979469,99.979469,100.0


In [121]:
list(pd.read_csv(os.path.join(META_DIR, 'means.csv'), header=None)[0])

['capacity_bytes',
 'smart_1_raw',
 'smart_5_raw',
 'smart_7_raw',
 'smart_10_raw',
 'smart_187_raw',
 'smart_188_raw',
 'smart_190_raw',
 'smart_193_raw',
 'smart_197_raw',
 'smart_198_raw',
 'smart_241_raw',
 'smart_1_normalized',
 'smart_5_normalized',
 'smart_7_normalized',
 'smart_10_normalized',
 'smart_187_normalized',
 'smart_188_normalized',
 'smart_190_normalized',
 'smart_193_normalized',
 'smart_197_normalized',
 'smart_198_normalized',
 'smart_241_normalized']