# Get Dataset

In [6]:
import os
import shutil
import wget
import zipfile

In [13]:
# Root URL that every archive name below is appended to.
base_url = 'https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/'
# Local directory that receives the zips and the per-year folders.
base_path = os.path.dirname('../HDD_dataset/')
# All years covered by `suffixes`:
# ['2013', '2014', '2015', '2016', '2017', '2018', '2019']
years = ['2018', '2019']

# Archive name -> name of the top-level directory inside that archive, or
# None when the archive holds its files with no wrapping directory.
# The archives are not packaged consistently, which is why the extraction
# code has to special-case them.
suffixes = {
    # yearly archives
    'data_2013.zip': '2013',
    'data_2014.zip': '2014',
    'data_2015.zip': '2015',
    # quarterly archives
    'data_Q1_2016.zip': 'data_Q1_2016',
    'data_Q2_2016.zip': 'data_Q2_2016',
    'data_Q3_2016.zip': 'data_Q3_2016',
    'data_Q4_2016.zip': None,
    'data_Q1_2017.zip': None,
    'data_Q2_2017.zip': None,
    'data_Q3_2017.zip': None,
    'data_Q4_2017.zip': 'data_Q4_2017',
    'data_Q1_2018.zip': None,
    'data_Q2_2018.zip': None,
    'data_Q3_2018.zip': None,
    'data_Q4_2018.zip': None,
    'data_Q1_2019.zip': None,
    'data_Q2_2019.zip': None,
    'data_Q3_2019.zip': None,
}


In [14]:
def main(years, base_path):
    """Download and unpack the archives for the requested years.

    For every year, each archive in the module-level ``suffixes`` table
    whose name contains the year string is fetched from ``base_url`` (unless
    a file with that name already exists in ``base_path``) and extracted so
    that its contents end up in ``<base_path>/<year>``.

    Args:
        years: iterable of years; each item is converted with ``str``.
        base_path: directory that holds the zips and the per-year folders.
            It is created if it does not exist.
    """
    os.makedirs(base_path, exist_ok=True)
    # Accept ints as well as strings.
    year_labels = [str(item) for item in years]
    for year in year_labels:
        print("Year:", year)
        year_path = os.path.join(base_path, year)
        os.makedirs(year_path, exist_ok=True)

        for zip_name, unzip_dir in suffixes.items():
            if year not in zip_name:
                continue

            url = base_url + zip_name
            zip_path = os.path.join(base_path, zip_name)
            already_downloaded = os.path.exists(zip_path)
            if not already_downloaded:
                print("Downloading:", url)
                wget.download(url, out=base_path)

            print("\nUnzipping:", zip_path)
            # Archives without a wrapping directory go straight into the
            # year folder; the rest are unpacked next to the zip.
            if unzip_dir is None:
                dest_path = year_path
            else:
                dest_path = base_path
            with zipfile.ZipFile(zip_path, 'r') as archive:
                archive.extractall(dest_path)

            # Nothing to relocate when there was no wrapping directory or
            # when it is already named after the year.
            if unzip_dir is None or unzip_dir == year:
                continue

            unzip_path = os.path.join(dest_path, unzip_dir)
            for entry in os.listdir(unzip_path):
                source = os.path.join(unzip_path, entry)
                target = os.path.join(year_path, entry)
                shutil.move(source, target)
            # The wrapping directory is empty now.
            os.rmdir(unzip_path)



# Entry point: run the download/extract step with the module-level
# `years` and `base_path` settings.
if __name__ == "__main__":
    main(years, base_path)


Year: 2018

Unzipping: ../HDD_dataset/data_Q1_2018.zip

Unzipping: ../HDD_dataset/data_Q2_2018.zip

Unzipping: ../HDD_dataset/data_Q3_2018.zip

Unzipping: ../HDD_dataset/data_Q4_2018.zip
Year: 2019

Unzipping: ../HDD_dataset/data_Q1_2019.zip
Downloading: https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2019.zip

Unzipping: ../HDD_dataset/data_Q2_2019.zip
Downloading: https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2019.zip

Unzipping: ../HDD_dataset/data_Q3_2019.zip


# Find Failed

In [None]:
import os
import pandas as pd
import datetime
import numpy as np
# Collect the serial numbers of every row of one drive model found in the
# daily CSV files of the selected years, then save them under ../temp.
data_dir = os.path.dirname('../HDD_dataset/')
# Year label -> name of that year's sub-directory inside data_dir.
year_dir = {
    '2013': os.path.dirname('2013/'),
    '2014': os.path.dirname('2014/'),
    '2015': os.path.dirname('2015/'),
    '2016': os.path.dirname('2016/'),
    '2017': os.path.dirname('2017/'),
    '2018': os.path.dirname('2018/'),
    '2019': os.path.dirname('2019/'),
}
# years = ['2013','2014','2015','2016','2017']
years = ['2018', '2019']
model = 'ST3000DM001'
# Despite the name, this holds failed drives only when `failed` is True;
# otherwise it holds the serial of every matching row.
list_failed = []
failed = False

for year in years:
    first = 1  # not read anywhere in this cell
    old_time = datetime.datetime.strptime(year+'-01-01', '%Y-%m-%d')
    read_dir = os.path.join(data_dir, year_dir[year])
    for file in sorted(os.listdir(read_dir)):
        # Skip sub-directories and anything that is not a CSV.
        if not os.path.isfile(os.path.join(read_dir, file)):
            continue
        if 'csv' not in file:
            continue
        # File names are expected to start with a YYYY-MM-DD date.
        file_day = datetime.datetime.strptime(file.split('.')[0], '%Y-%m-%d')
        if file_day < old_time:
            continue

        file_r = pd.read_csv(os.path.join(read_dir, file))
        model_chosen = file_r[file_r['model'] == model]
        print('processing day ' + str(np.asarray(model_chosen['date'].values)))
        if failed:
            model_chosen = model_chosen[model_chosen['failure'] == 1]
        list_failed.extend(model_chosen['serial_number'].values)

os.makedirs('../temp', exist_ok=True)
# The output name records which of the two modes produced the list.
out_prefix = '../temp/HDD_failed_' if failed else '../temp/HDD_all_'
np.save(out_prefix + model, list_failed)


In [1]:
import os
import pandas as pd
import datetime
import numpy as np

In [None]:
def get_year_quarter(filename):
    """Return the year and quarter label for a date-named file.

    ``filename`` is expected to start with ``YYYY-MM-DD`` followed by an
    optional ``.ext``; the result is the year immediately followed by
    ``Q1``..``Q4``, e.g. ``'2018-05-17.csv'`` -> ``'2018Q2'``.

    Months 1-9 map to Q1-Q3; every other month number maps to Q4.
    """
    # Everything before the first dot is the date.
    stem = filename.split('.')[0]
    year, month, _ = stem.split('-')

    # Months of the first three quarters; anything else falls through to Q4.
    quarter_of_month = {
        1: 'Q1', 2: 'Q1', 3: 'Q1',
        4: 'Q2', 5: 'Q2', 6: 'Q2',
        7: 'Q3', 8: 'Q3', 9: 'Q3',
    }
    quarter = quarter_of_month.get(int(month), 'Q4')

    return year + quarter

In [None]:
# Tally, per drive model, how many rows carry that model across all daily
# CSV files of the selected years.
data_dir = os.path.dirname('../HDD_dataset/')
# Year label -> name of that year's sub-directory inside data_dir.
year_dir = {
    '2013': os.path.dirname('2013/'),
    '2014': os.path.dirname('2014/'),
    '2015': os.path.dirname('2015/'),
    '2016': os.path.dirname('2016/'),
    '2017': os.path.dirname('2017/'),
    '2018': os.path.dirname('2018/'),
    '2019': os.path.dirname('2019/'),
}
# years = ['2013','2014','2015','2016','2017']
years = ['2018', '2019']

# model name -> total number of rows with that model over every file read.
# NOTE(review): presumably each row is one drive on one day, which would
# make this a drive-days figure rather than a count of distinct drives -
# confirm against the dataset description.
hdd_models = dict()

for year in years:
    read_dir = os.path.join(data_dir, year_dir[year])
    for file in sorted(os.listdir(read_dir)):
        if os.path.isfile(os.path.join(read_dir, file)):
            if 'csv' in file:
                file_r = pd.read_csv(os.path.join(read_dir, file))
                # Count once per file instead of once per model.
                model_counts = file_r['model'].value_counts()
                # value_counts() leaves out missing values, so drop them
                # here too; otherwise the lookup below raises KeyError.
                for model in file_r['model'].dropna().unique():
                    hdd_models[model] = hdd_models.get(model, 0) + model_counts[model]
