This notebook reads in the data and saves it as a train/test/validate split. 

Next notebook will get us some summary s about each set. 

In [1]:
import gc
import os
import pdb
import random

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from matplotlib import pyplot as plt
import seaborn as sns

pbar = ProgressBar()
pbar.register()

In [2]:
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
    "smart_1_normalized": "float32",
    "smart_1_raw": "float32",
    "smart_2_normalized": "float32",
    "smart_2_raw": "float32",
    "smart_3_normalized": "float32",
    "smart_3_raw": "float32",
    "smart_4_normalized": "float32",
    "smart_4_raw": "float32",
    "smart_5_normalized": "float32",
    "smart_5_raw": "float32",
    "smart_7_normalized": "float32",
    "smart_7_raw": "float32",
    "smart_8_normalized": "float32",
    "smart_8_raw": "float32",
    "smart_9_normalized": "float32",
    "smart_9_raw": "float32",
    "smart_10_normalized": "float32",
    "smart_10_raw": "float32",
    "smart_11_normalized": "float32",
    "smart_11_raw": "float32",
    "smart_12_normalized": "float32",
    "smart_12_raw": "float32",
    "smart_13_normalized": "float32",
    "smart_13_raw": "float32",
    "smart_15_normalized": "float32",
    "smart_15_raw": "float32",
    "smart_16_normalized": "float32",
    "smart_16_raw": "float32",
    "smart_17_normalized": "float32",
    "smart_17_raw": "float32",
    "smart_22_normalized": "float32",
    "smart_22_raw": "float32",
    "smart_23_normalized": "float32",
    "smart_23_raw": "float32",
    "smart_24_normalized": "float32",
    "smart_24_raw": "float32",
    "smart_168_normalized": "float32",
    "smart_168_raw": "float32",
    "smart_170_normalized": "float32",
    "smart_170_raw": "float32",
    "smart_173_normalized": "float32",
    "smart_173_raw": "float32",
    "smart_174_normalized": "float32",
    "smart_174_raw": "float32",
    "smart_177_normalized": "float32",
    "smart_177_raw": "float32",
    "smart_179_normalized": "float32",
    "smart_179_raw": "float32",
    "smart_181_normalized": "float32",
    "smart_181_raw": "float32",
    "smart_182_normalized": "float32",
    "smart_182_raw": "float32",
    "smart_183_normalized": "float32",
    "smart_183_raw": "float32",
    "smart_184_normalized": "float32",
    "smart_184_raw": "float32",
    "smart_187_normalized": "float32",
    "smart_187_raw": "float32",
    "smart_188_normalized": "float32",
    "smart_188_raw": "float32",
    "smart_189_normalized": "float32",
    "smart_189_raw": "float32",
    "smart_190_normalized": "float32",
    "smart_190_raw": "float32",
    "smart_191_normalized": "float32",
    "smart_191_raw": "float32",
    "smart_192_normalized": "float32",
    "smart_192_raw": "float32",
    "smart_193_normalized": "float32",
    "smart_193_raw": "float32",
    "smart_194_normalized": "float32",
    "smart_194_raw": "float32",
    "smart_195_normalized": "float32",
    "smart_195_raw": "float32",
    "smart_196_normalized": "float32",
    "smart_196_raw": "float32",
    "smart_197_normalized": "float32",
    "smart_197_raw": "float32",
    "smart_198_normalized": "float32",
    "smart_198_raw": "float32",
    "smart_199_normalized": "float32",
    "smart_199_raw": "float32",
    "smart_200_normalized": "float32",
    "smart_200_raw": "float32",
    "smart_201_normalized": "float32",
    "smart_201_raw": "float32",
    "smart_218_normalized": "float32",
    "smart_218_raw": "float32",
    "smart_220_normalized": "float32",
    "smart_220_raw": "float32",
    "smart_222_normalized": "float32",
    "smart_222_raw": "float32",
    "smart_223_normalized": "float32",
    "smart_223_raw": "float32",
    "smart_224_normalized": "float32",
    "smart_224_raw": "float32",
    "smart_225_normalized": "float32",
    "smart_225_raw": "float32",
    "smart_226_normalized": "float32",
    "smart_226_raw": "float32",
    "smart_231_normalized": "float32",
    "smart_231_raw": "float32",
    "smart_232_normalized": "float32",
    "smart_232_raw": "float32",
    "smart_233_normalized": "float32",
    "smart_233_raw": "float32",
    "smart_235_normalized": "float32",
    "smart_235_raw": "float32",
    "smart_240_normalized": "float32",
    "smart_240_raw": "float32",
    "smart_241_normalized": "float32",
    "smart_241_raw": "float32",
    "smart_242_normalized": "float32",
    "smart_242_raw": "float32",
    "smart_250_normalized": "float32",
    "smart_250_raw": "float32",
    "smart_251_normalized": "float32",
    "smart_251_raw": "float32",
    "smart_252_normalized": "float32",
    "smart_252_raw": "float32",
    "smart_254_normalized": "float32",
    "smart_254_raw": "float32",
    "smart_255_normalized": "float32",
    "smart_255_raw": "float32",
}

In [3]:
csv_ids = ["Q3_2020"]
DATA_ROOT_DIR = 'Backblaze'
for csv_id in csv_ids:
    df = dd.read_csv(os.path.join(DATA_ROOT_DIR, 'data_{}'.format(csv_id), '*.csv'), dtype=custom_dtypes)

We want to filter our dataset so that it contains only wdc drives.

In [4]:
wdc  = df[df.model.str.startswith("W")]

We will then grab a list of drives that never failed in this time period and those that did in order to ensure that our final datasets include failed drives.  

In [5]:
failed_serials = wdc[wdc['failure'] == 1]['serial_number'].compute()

[########################################] | 100% Completed |  1min 35.3s


In [6]:
working_serials = wdc[~wdc['serial_number'].isin(failed_serials)]\
                    ['serial_number']\
                    .drop_duplicates(keep='last')\
                    .compute()

[########################################] | 100% Completed |  1min 33.5s


In [7]:
len(working_serials)

264

In [8]:
random.seed(45)
subset_working = random.sample(list(working_serials.values), 90)

In [9]:
new_wdc = list(failed_serials.values) + subset_working

In [10]:
len(new_wdc)

92

In [11]:
new_wdc = wdc[wdc.serial_number.isin(new_wdc)]

In [12]:
new_wdc.shape[0].compute()

[########################################] | 100% Completed |  1min 33.8s


8350

In [13]:
new_wdc.serial_number.nunique().compute()

[########################################] | 100% Completed |  1min 35.1s


92

In [14]:
new_wdc.head()

[########################################] | 100% Completed |  1.6s


Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
311,2020-07-01,WD-WX31A356P1DC,WDC WD5000LPVX,500107900000.0,0.0,200.0,0.0,,,152.0,...,,,,,,,,,,
661,2020-07-01,WD-WX41A3575J2P,WDC WD5000LPVX,500107900000.0,0.0,200.0,0.0,,,155.0,...,,,,,,,,,,
1210,2020-07-01,WD-WX81A1468540,WDC WD5000LPVX,500107900000.0,0.0,200.0,0.0,,,152.0,...,,,,,,,,,,
1542,2020-07-01,WD-WX41A356NL78,WDC WD5000LPVX,500107900000.0,0.0,200.0,0.0,,,154.0,...,,,,,,,,,,
3779,2020-07-01,WD-WXE1E84AEW27,WDC WD5000LPVX,500107900000.0,0.0,200.0,0.0,,,151.0,...,,,,,,,,,,


In [15]:
def random_partition(list_in,n):
    random.seed(10)
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]


In [16]:
failed_partition = random_partition(list(failed_serials),10)
working_partition = random_partition(list(subset_working),10)

In [17]:
training = working_partition[0:6] + failed_partition[0:6]
testing = working_partition[6:8] + failed_partition[6:8]
validation = working_partition[8:] + failed_partition[8:]

training = [item for sublist in training for item in sublist]
testing = [item for sublist in testing for item in sublist]
validation = [item for sublist in validation for item in sublist]

In [18]:
print(len(training))

56


In [19]:
print(len(testing))

18


In [20]:
print(len(validation))

18


In [21]:
len(training) + len(testing) + len(validation)

92

In [22]:
print(len(set(training).intersection(set(testing))) == 0)
print(len(set(training).intersection(set(validation))) == 0)
print(len(set(testing).intersection(set(validation))) == 0)

True
True
True


we have a 60/20/20 split for our data divided by individual hard drives with an similar distribution of failed and working drives in both. We will now save them as 3 separate csv files  

In [23]:
training_wdc = new_wdc[new_wdc.serial_number.isin(training)]
testing_wdc = new_wdc[new_wdc.serial_number.isin(testing)]
validation_wdc = new_wdc[new_wdc.serial_number.isin(validation)]

In [24]:
training_wdc = training_wdc.compute()
training_wdc.to_csv('train_backblaze_wdc_q2_2019.csv')
del (training_wdc)

[########################################] | 100% Completed |  1min 37.9s


In [25]:
testing_wdc = testing_wdc.compute()
testing_wdc.to_csv('test_backblaze_wdc_q2_2019.csv')
del (testing_wdc)

[########################################] | 100% Completed |  1min 36.4s


In [26]:
validation_wdc = validation_wdc.compute()
validation_wdc.to_csv('validation_backblaze_wdc_q2_2019.csv')
del (validation_wdc)

[########################################] | 100% Completed |  1min 36.4s
