# Dataset Creation

Convert to python: run `jupyter nbconvert --to python dataset_creation.ipynb`

In [None]:
import os
import shutil
import wget
import zipfile
import pandas as pd
import datetime
import numpy as np
import IPython
from glob import glob

## Config

Make sure to change these configs before running the whole notebook.

- Base URL: The BackBlaze dataset URL

In [None]:
# Root URL of the Backblaze bucket that hosts the yearly/quarterly archives.
base_url = 'https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/'

- Base Path: This should point to the directory that will hold the dataset.  
Note that the path is resolved relative to the directory the notebook was started from, which is usually where this ipynb is.

In [None]:
# Resolve paths against the directory the kernel was started in. When this file
# is executed as a plain script (e.g. after `jupyter nbconvert --to python`)
# there is no IPython shell and get_ipython() returns None, so fall back to the
# current working directory instead of failing with AttributeError.
_shell = IPython.get_ipython()
notebook_path = _shell.starting_dir if _shell is not None else os.getcwd()
# The dataset directory is a sibling of the notebook's directory.
base_path = os.path.abspath(os.path.join(notebook_path, '..', 'HDD_dataset'))
os.makedirs(base_path, exist_ok=True)
base_path

- Output Path: This should be pointing to where to output the database, used by the `Classification.py` script.

In [None]:
# Directory that receives the generated files; it sits next to the notebook's
# directory and is created on first use.
_output_parent = os.path.join(notebook_path, os.pardir)
output_dir = os.path.abspath(os.path.join(_output_parent, 'output'))
os.makedirs(output_dir, exist_ok=True)
output_dir

- Years: Years of data to download and analyze (From 2013 to 2019)

In [None]:
# Years to process, as strings: 2013 through 2019 inclusive.
years = list(map(str, range(2013, 2020)))

- Model: The specific HDD model we want to keep the data for

In [None]:
# Drive model whose rows are kept; compared against the CSV `model` column.
model = 'ST3000DM001'

- Find Failed: if `True`, keep only failed HDDs, otherwise keep all HDDs

In [None]:
# When True only failed drives are collected; otherwise every drive of the model.
find_failed = False
# Tag embedded in the output file names so both variants can coexist.
if find_failed:
    suffix = 'failed'
else:
    suffix = 'all'
suffix

- Define variables for the name of the output files

In [None]:
# One dataset sub-directory per configured year, keyed by the year string.
year_dirs = dict(zip(years, (os.path.join(base_path, y) for y in years)))
# Underscore-prefixed, underscore-joined tag of all years (e.g. "_2013_2014"),
# embedded in the output file names.
years_list = "_{}".format("_".join(years))
years_list

The zips contain different directory names or no directory at all, which causes unavoidable "spaghettiness" in the code.
For example, the data for year 2013 is in the directory "2013" inside its zip, so the value is `"2013"`. The data from 2016 onward is in the root of the zip file, hence the value is `None`.

In [None]:
# Maps each archive name to the directory its CSV files sit in inside the zip,
# or to None when the CSV files are at the archive root. The archives are not
# laid out uniformly, which is why the download step needs this lookup.
# NOTE(review): there is no "data_Q4_2019.zip" entry — confirm this is intentional.
suffixes = {
    # 2013-2015: one archive per year, wrapped in a directory named after the year.
    **{f"data_{yr}.zip": yr for yr in ('2013', '2014', '2015')},
    # 2016-2018: four quarterly archives per year, files at the archive root.
    **{f"data_Q{q}_{yr}.zip": None
       for yr in ('2016', '2017', '2018')
       for q in (1, 2, 3, 4)},
    # 2019: only the first three quarters are listed.
    **{f"data_Q{q}_2019.zip": None for q in (1, 2, 3)},
}

## Get Dataset

Download and unzip the dataset, moving the files to the correct directory.  
The dataset is structured as follows:
```
base_path  
├── 2018
│   ├── 2018-01-01.csv
│   ├── 2018-01-02.csv
│   └── 2018-01-03.csv
├── 2019
│   ├── 2019-01-01.csv
│   ├── 2019-01-02.csv
│   └── 2019-01-03.csv
├── data_Q1_2018.zip
├── data_Q1_2019.zip
├── data_Q2_2018.zip
├── data_Q2_2019.zip
├── data_Q3_2018.zip
├── data_Q3_2019.zip
└── data_Q4_2018.zip
```

In [None]:
# Normalise to strings in case the years were configured as integers.
years = [str(entry) for entry in years]
for y in years:
    print("Year:", y)
    year_path = os.path.join(base_path, y)
    os.makedirs(year_path, exist_ok=True)
    for zip_name, unzip_dir in suffixes.items():
        # Only handle archives whose file name mentions the current year.
        if y not in zip_name:
            continue

        url = base_url + zip_name
        zip_path = os.path.join(base_path, zip_name)
        # Download only when the archive is not already on disk.
        if not os.path.exists(zip_path):
            print("Downloading:", url)
            wget.download(url, out=base_path)
        print("\nUnzipping:", zip_path)

        # Archives with no inner directory are unpacked straight into the year
        # folder; the others are unpacked into base_path, where their inner
        # directory is created.
        if unzip_dir is None:
            dest_path = year_path
        else:
            dest_path = base_path
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(dest_path)

        # Nothing to relocate when the files already landed in the year folder.
        if unzip_dir is None or unzip_dir == y:
            continue

        # The inner directory has a different name than the year: move its
        # contents into the year folder and remove the now-empty directory.
        unzip_path = os.path.join(dest_path, unzip_dir)
        for entry_name in os.listdir(unzip_path):
            shutil.move(os.path.join(unzip_path, entry_name),
                        os.path.join(year_path, entry_name))
        os.rmdir(unzip_path)


## Collect all serial numbers of given HDD model

In [None]:
# Serial numbers of the chosen model (only the failed drives when find_failed
# is True, otherwise every drive). A dict is used as an insertion-ordered set
# so each serial is kept once, in first-seen order, instead of once per daily
# file in which the drive appears.
unique_serials = {}

# Only these columns are used below; restricting the parse to them avoids
# loading the SMART columns of every daily file.
needed_columns = ['model', 'serial_number']
if find_failed:
    needed_columns.append('failure')

# for each year
for year in years:
    year_dir = year_dirs[year]
    files = glob(os.path.join(year_dir, '*.csv'))

    # for each file, or day
    for file_path in sorted(files):
        try:
            file_r = pd.read_csv(file_path, usecols=needed_columns)
        except FileNotFoundError:
            print(f"Error: The file {file_path} does not exist.")
            continue

        # choose the HDD model we need
        model_chosen = file_r[file_r['model'] == model]

        # if that particular HDD model is not present, continue
        if model_chosen.empty:
            continue

        if find_failed:
            # choose only the failed hard drives
            model_chosen = model_chosen[model_chosen['failure'] == 1]

        # remember the serial numbers seen on this day (duplicates collapse)
        unique_serials.update(dict.fromkeys(model_chosen['serial_number'].values))

# Despite its name, this list holds *all* serials of the model unless
# find_failed is True.
list_failed = list(unique_serials)

# Save the list of failed or all hard drives
np.save(os.path.join(output_dir, f'HDD{years_list}_{suffix}_{model}.npy'), list_failed)

## Read to a DataFrame

In [None]:
# failed = list_failed
# Serial numbers to keep, loaded from the file written by the collection step.
failed = set(np.load(os.path.join(output_dir, f'HDD{years_list}_{suffix}_{model}.npy')))

# SMART attribute ids whose columns are dropped because they are not standard
# for all models.
non_standard_smart_ids = {22, 220, 222, 224, 226}

# Per-day frames are gathered here and concatenated once after the loops;
# concatenating inside the loop would re-copy the growing frame on every day.
daily_frames = []

# Iterate over each year
for year in years:
    year_path = year_dirs[year]
    files = sorted([f for f in os.listdir(year_path) if f.endswith('.csv')])
    # First day of the year; it does not depend on the file, so compute it once.
    old_time = datetime.datetime.strptime(f'{year}-01-01', '%Y-%m-%d')

    # Iterate over each file in the directory
    for file in files:
        file_path = os.path.join(year_path, file)
        # The file name (without extension) is expected to be a YYYY-MM-DD date.
        file_date = datetime.datetime.strptime(file.split('.')[0], '%Y-%m-%d')

        if file_date >= old_time:
            df = pd.read_csv(file_path)
            model_chosen = df[df['model'] == model]
            relevant_rows = model_chosen[model_chosen['serial_number'].isin(failed)]

            # Drop unnecessary columns since the following columns are not standard for all models
            drop_columns = [
                col for col in relevant_rows
                if 'smart_' in col and int(col.split('_')[1]) in non_standard_smart_ids
            ]
            # Return a new frame rather than dropping in place: relevant_rows
            # is a filtered slice of df, and mutating a slice in place
            # triggers SettingWithCopyWarning.
            relevant_rows = relevant_rows.drop(columns=drop_columns, errors='ignore')

            # Queue the rows of this day for the final concatenation
            daily_frames.append(relevant_rows)
            print('adding day ' + str(model_chosen['date'].values))

# Build the database in one pass; pd.concat rejects an empty list, so fall back
# to an empty frame when no file was read.
database = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

# Save the database to a pickle file
database.to_pickle(os.path.join(output_dir, f'HDD{years_list}_{suffix}_{model}_appended.pkl'))

# Check the most common models
# Counts the rows per model in `df` (the most recently read daily file) and
# orders them from most to least frequent. The count is taken on the GroupBy
# directly: selecting columns with a bare tuple (['model', 'date']) is rejected
# by pandas >= 2.0 and is not needed for size().
most_common_models = df.groupby(['model']).size()
most_common_models = most_common_models.sort_values(ascending=False)