## Data Ingestion + Cleanup

In [43]:
import os
import gc
import time
import utils
import joblib
import mlflow
import pyarrow
import datetime

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client

from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

from joblib import delayed, parallel_backend, Parallel

In [2]:
# distributed client is left disabled in this notebook
# client = Client()
# register a global progress bar so that dask computations triggered below
# (the .compute() calls) print their progress
pbar = ProgressBar()
pbar.register()

In [3]:
# Remaining-useful-life cut-offs used for labelling, stored as day-resolution
# numpy timedeltas so they can be compared directly against the rul_days column.
_DAYS_PER_WEEK = 7
BAD_THRESHOLD_NDAYS = np.timedelta64(2 * _DAYS_PER_WEEK, 'D')      # 2 weeks
WARNING_THRESHOLD_NDAYS = np.timedelta64(6 * _DAYS_PER_WEEK, 'D')  # 6 weeks

In [4]:
# Explicit column dtypes for the drive-stats data.
# Numeric columns are declared as float32 rather than left to inference: an
# inferred int32 column triggers an int-vs-float type mismatch as soon as dask
# meets a null value, because nulls cannot be represented as ints.

# SMART attribute ids for which both a "normalized" and a "raw" column exist
_SMART_ATTRIBUTE_IDS = (
    1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 22, 23, 24,
    168, 170, 173, 174, 177, 179, 181, 182, 183, 184,
    187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
    218, 220, 222, 223, 224, 225, 226, 231, 232, 233, 235,
    240, 241, 242, 250, 251, 252, 254, 255,
)

custom_dtypes = {
    # identifying columns stay as python objects (strings)
    **{col: "object" for col in ("date", "serial_number", "model")},
    # numeric bookkeeping columns
    **{col: "float32" for col in ("capacity_bytes", "failure")},
    # smart_<id>_normalized followed by smart_<id>_raw, for every attribute id
    **{
        "smart_{}_{}".format(smart_id, kind): "float32"
        for smart_id in _SMART_ATTRIBUTE_IDS
        for kind in ("normalized", "raw")
    },
}

In [5]:
# Directory that contains the input dataset folder (joined with the dataset
# name when reading). The machine-specific location can be overridden with the
# DATA_ROOT_DIR environment variable; the original path remains the default.
DATA_ROOT_DIR = os.environ.get('DATA_ROOT_DIR', '/home/kachauha/Downloads/')
# manufacturer this notebook is concerned with
MANUFACTURER = 'seagate'

In [6]:
# SMART attributes retained for modelling. Highly correlated ones are kept for
# now, except 194 which is dropped; 240 and 242 are dropped as well because
# their correlation is too low.
CRITICAL_STATS = [1, 5, 7, 10, 187, 188, 190, 193, 197, 198, 241]

# raw and normalized column names for each retained attribute
crit_cols_raw = ['smart_%d_raw' % stat_id for stat_id in CRITICAL_STATS]
crit_cols_normalized = ['smart_%d_normalized' % stat_id for stat_id in CRITICAL_STATS]

# full column selection: identifying/label columns first, then the SMART columns
_base_cols = ['date', 'serial_number', 'model', 'capacity_bytes', 'failure']
keep_cols = [*_base_cols, *crit_cols_raw, *crit_cols_normalized]

In [7]:
# load the whole dataset into a single dask dataframe, restricted to keep_cols
parquet_dir = os.path.join(DATA_ROOT_DIR, 'data_Q4_2018_parquet')
df = dd.read_parquet(parquet_dir, columns=keep_cols, engine='pyarrow', index=False)

# keep only the rows whose model string starts with 'S'
starts_with_s = df['model'].str.startswith('S')
seagate_df = df[starts_with_s]

In [8]:
# drop every row that is missing any of these raw SMART readings
required_raw_cols = ['smart_1_raw', 'smart_5_raw', 'smart_187_raw', 'smart_193_raw', 'smart_241_raw']

has_all_required = seagate_df[required_raw_cols[0]].notna()
for required_col in required_raw_cols[1:]:
    has_all_required = has_all_required & seagate_df[required_col].notna()

seagate_df = seagate_df[has_all_required]

In [10]:
# serial numbers taken from the rows flagged with failure == 1, pulled into
# memory as a pandas Series (may be useful for saving differently)
is_failure_row = seagate_df['failure'] == 1
failed_sers = seagate_df['serial_number'][is_failure_row].compute()
failed_sers.head()

[########################################] | 100% Completed | 11.3s


29978    Z305B8DE
38504    ZA10CTP4
38544    ZA114N2J
40880    ZCH0CLFQ
84763    S30116JR
Name: serial_number, dtype: object

In [9]:
# convert from str to datetime
# NOTE: the unit is spelled out because pandas >= 2.0 refuses to cast to the
# unit-less 'datetime64' dtype; 'datetime64[ns]' works on old and new versions
seagate_df['date'] = seagate_df['date'].astype('datetime64[ns]')

## Prepare + Preprocess

In [11]:
# =============================== FOR DASK =============================== #
# dask needs to be told the schema of the groupby-apply result up front,
# otherwise it complains. Build that `meta` from the current (empty) schema
# frame plus a `rul_days` column whose dtype comes from a date difference.
rul_meta = seagate_df._meta
rul_meta = rul_meta.assign(rul_days=rul_meta['date'].max()-rul_meta['date'])
# ======================================================================== #

# get remaining useful life by applying utils.append_rul_days_column to each drive
# (one group per serial_number)
# NOTE(review): the helper lives in utils and is not visible here; judging by the
# meta above it presumably computes rul_days = max(date) - date per drive -- confirm
# reset index because the result is multiindexed; drop=True because serial_number
# already exists as a column
seagate_df = seagate_df.groupby('serial_number').apply(utils.append_rul_days_column, meta=rul_meta).reset_index(drop=True)

In [12]:
# Rows of working drives recorded after [quarter end minus 6 weeks] are dropped:
# as of quarter end we cannot know whether those drives survived more than
# 6 weeks. Rows belonging to drives known to have failed are always kept.
belongs_to_failed_drive = seagate_df['serial_number'].isin(failed_sers)
outcome_is_known = seagate_df['rul_days'] >= WARNING_THRESHOLD_NDAYS
seagate_df = seagate_df[belongs_to_failed_drive | outcome_is_known]

In [13]:
# NOTE: the assignments must be done in this order (good -> warning -> bad),
# otherwise the labels won't be correct: every "bad" row also satisfies the
# "warning" condition, so the bad label has to be written last. FIXME

# assign all as good (0) initially
seagate_df['status'] = 0

below_warning = seagate_df['rul_days'] < WARNING_THRESHOLD_NDAYS
below_bad = seagate_df['rul_days'] < BAD_THRESHOLD_NDAYS

# rul less than 6 weeks -> warning (1), then rul less than 2 weeks -> bad (2)
seagate_df['status'] = seagate_df['status'].mask(below_warning, 1).mask(below_bad, 2)

In [14]:
# get all distinct serial numbers left after the filtering above
# (this triggers a dask computation) and count them
all_serials = seagate_df['serial_number'].unique().compute()
num_serials = len(all_serials)

[########################################] | 100% Completed |  2min 59.1s


## Save to Mem

### TODO
- pull all in memory like in cfp then save drives
- there are several drives (15%) which lived less than 6 days. Should these be kept?

In [51]:
def save_files(df, save_dir='/home/kachauha/Downloads/data_Q4_2018_serials/working'):
    """Write one CSV per drive, named ``<serial_number>.csv``, into ``save_dir``.

    Parameters
    ----------
    df : in-memory dataframe with a ``serial_number`` column; it is grouped on
        that column and every group is written to its own file (without index).
    save_dir : str, optional
        Target directory. Defaults to the original hard-coded "working"
        location; pass a different directory (e.g. for failed drives) so that
        separate sets of drives do not end up in the same folder.

    Prints the elapsed wall-clock time in seconds. Returns nothing.
    """
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(save_dir, exist_ok=True)

    def save_group(ser, ser_df):
        # one file per serial number
        ser_df.to_csv(os.path.join(save_dir, '{}.csv'.format(ser)), index=False)

    # one task per serial, run on a thread pool (n_jobs=-1 lets joblib pick the size)
    start = time.time()
    Parallel(n_jobs=-1, prefer='threads')(
        delayed(save_group)(ser, ser_df) for ser, ser_df in df.groupby('serial_number'))
    end = time.time()
    print(end - start)

### Failed

In [37]:
# pull every row belonging to a failed drive into memory so it can be saved
is_failed_drive = seagate_df['serial_number'].isin(failed_sers)
failed_df = seagate_df[is_failed_drive].compute()

[########################################] | 100% Completed |  4min  1.7s


In [None]:
# write one CSV per failed drive
# NOTE(review): no failed-specific directory is given here, so save_files writes to
# its '.../data_Q4_2018_serials/working' location, the same folder used for the
# working drives below -- confirm this is intended
save_files(failed_df)

### Working

In [48]:
# number of rows per serial number, counting only drives that are not in failed_sers
is_working_row = ~seagate_df['serial_number'].isin(failed_sers)
working_sers_vc = seagate_df['serial_number'][is_working_row].value_counts().compute()

[########################################] | 100% Completed |  3min 46.2s


In [49]:
# a working drive must have strictly more than this many rows to be eligible
# for the experiment subset built in the next cell
time_window = 6

In [50]:
# keep only a small subset for experimenting - keep 10% of each frequency,
# i.e. for every distinct row count above time_window, sample 10% of the
# serial numbers that have exactly that many rows
SUBSET_FRAC = 0.1
SUBSET_SEED = 42  # fixed seed so the sampled subset is the same on every run

# NOTE: the per-group sample size is rounded, so very small groups may contribute nothing
sampled_chunks = [
    working_sers_vc[working_sers_vc == count]
    .sample(frac=SUBSET_FRAC, random_state=SUBSET_SEED)
    .index.values
    for count in working_sers_vc.unique()
    if count > time_window
]

# join once instead of growing an array inside the loop; fall back to an empty
# array when no group qualifies
subset_sers = np.concatenate(sampled_chunks) if sampled_chunks else np.array([], dtype=object)
print(len(subset_sers), 'serial numbers kept')

7686 serial numbers kept


In [52]:
# pull the rows of the sampled working drives into memory and write one CSV per serial
save_files(seagate_df[seagate_df['serial_number'].isin(subset_sers)].compute())

[########################################] | 100% Completed |  3min 34.1s
32.27455759048462


In [None]:
# # downsample working ones and save those
# num_working_serials = 10000
# working_sers = seagate_df[~seagate_df['serial_number'].isin(failed_sers)]['serial_number'].unique()
# working_repr_sers = working_sers.sample(frac=(num_working_serials/len(working_sers))).compute()