# cscratch Disk Failures

Meng Wang and John Bent want a list of every failed cscratch drive, the time it failed, and its position (node, slot, SSU, rack, and rack units).  This notebook is my attempt to pick the drive failures out of all the service tickets that were exported from CrayPort.

In [1]:
%matplotlib inline

In [2]:
import os
import re

import numpy
import pandas

# pandas.set_option('display.max_rows', None)
# pandas.set_option('display.max_colwidth', None)

In [3]:
# Describe the layout of the cscratch file system: one row per node
# (snx11168n004 .. snx11168n251), keyed by the node's hostname.
node_ids = list(range(4, 252))
offsets = range(len(node_ids))  # 0-based position of each node in the list

layout_df = pandas.DataFrame(
    {
        "node": node_ids,
        # two consecutive nodes share one SSU
        "ssu": [pos // 2 for pos in offsets],
        # 16 nodes per rack; the first 128 nodes sit in racks 1-8, the
        # remainder are numbered from rack 13 upwards
        "rack": [pos // 16 + 1 if pos < 128 else pos // 16 + 5 for pos in offsets],
    },
    index=[f"snx11168n{x:03d}" for x in node_ids],
)

# Each SSU spans five rack units; eight SSUs are stacked per rack.
layout_df["ru_low"] = (layout_df["ssu"] % 8) * 5 + 1
layout_df["ru_high"] = layout_df["ru_low"] + 4
layout_df.tail()

Unnamed: 0,node,ssu,rack,ru_low,ru_high
snx11168n247,247,121,20,6,10
snx11168n248,248,122,20,11,15
snx11168n249,249,122,20,11,15
snx11168n250,250,123,20,16,20
snx11168n251,251,123,20,16,20


In [4]:
# Load all vaguely disk-like tickets.  The first run reads the raw CrayPort
# TSV export, keeps only the subject line and the three timestamp columns,
# and caches that subset as a CSV; later runs read the cached CSV directly.
anon_csv_path = "crayport-scratch-dump-anon.csv"
if not os.path.isfile(anon_csv_path):
    print("Anonymizing raw TSV")
    raw_df = pandas.read_csv("crayport-cscratch-dump.tsv", sep="\t")
    df = raw_df[["Subject", "Date Created", "Date Completed", "Last Modified"]].copy()
    df.to_csv(anon_csv_path, index=False)
else:
    print("Reading from pre-anonymized CSV")
    df = pandas.read_csv(anon_csv_path)
df.head()

Reading from pre-anonymized CSV


Unnamed: 0,Subject,Date Created,Date Completed,Last Modified
0,2202052303 snx11168n185 Z4F05WV3 S41 disk driv...,2/7/2022 7:03 AM,2/10/2022 1:21 PM,2/10/2022 1:21 PM
1,2201310000 snx11168n208 Z4F04FCF S48 disk driv...,1/31/2022 6:32 AM,2/3/2022 9:03 AM,2/3/2022 9:03 AM
2,2201211916 snx11168n173 Z4F05VST S82 disk driv...,1/24/2022 6:49 AM,1/31/2022 8:52 AM,1/31/2022 8:52 AM
3,2201201529 snx11168n143 Z4F04W8H S64 drive was...,1/21/2022 6:55 AM,,2/9/2022 4:45 PM
4,2201200208 snx11168n159 Z4F061R2 S57 disk driv...,1/20/2022 6:30 AM,1/24/2022 9:01 AM,1/24/2022 9:02 AM


In [5]:
# Keyword heuristic: keep only tickets whose subject mentions a disk, drive,
# or slot, or contains an "S" followed by two digits (e.g. "S41").  Matching
# is case-insensitive.
REX_DISK_FAILURES = re.compile(r"(disk|drive|slot|s\d\d)", flags=re.IGNORECASE)
def is_drive_error(line):
    """Return True if a ticket's subject looks like a drive failure.

    Parameters
    ----------
    line : mapping
        A ticket record (e.g. a DataFrame row) with a "Subject" entry.

    Returns
    -------
    bool
        True when the subject matches REX_DISK_FAILURES; False otherwise,
        including when the subject is missing or not a string.
    """
    subject = line["Subject"]
    # An empty subject is read from CSV as NaN (a float); passing that to
    # Pattern.search would raise TypeError, so treat it as "not a drive error".
    if not isinstance(subject, str):
        return False
    return REX_DISK_FAILURES.search(subject) is not None

# Evaluate the predicate once per ticket row, then keep an independent copy
# of the matching rows so later column additions do not touch `df`.
drive_mask = df.apply(is_drive_error, axis=1)
failures_df = df[drive_mask].copy()
print("Looks like there were {:d} failures".format(len(failures_df)))

Looks like there were 364 failures


In [6]:
# Group 1 captures the node number that follows "snx11168n"; group 3 captures
# the digits of an "S<nn>" / "s<nn>" / "slot <nn>" token that is preceded by a
# non-letter.  Because ".*" is greedy, the right-most such token is tried first.
# The pattern is a raw string so that \d and \s reach the regex engine without
# relying on Python's handling of invalid string escapes (which warns, and is
# a SyntaxWarning on Python 3.12+).
REX_NODE_AND_SLOT = re.compile(r"snx11168n(\d+).*[^a-zA-Z][sS](lot\s*)?(\d+)")
def find_node_and_slot(line):
    """Extract the (node, slot) pair from a ticket's subject line.

    Underscores in the subject are treated as spaces before matching.

    Parameters
    ----------
    line : mapping
        A ticket record (e.g. a DataFrame row) with a "Subject" entry.

    Returns
    -------
    tuple of (int, int)
        (node, slot) when the subject matches REX_NODE_AND_SLOT and both
        numbers are in range (0 <= node <= 252, 0 <= slot < 84); otherwise
        the sentinel (-1, -1).  A missing or non-string subject also yields
        (-1, -1).
    """
    subject = line["Subject"]
    # Missing subjects arrive as NaN (a float), which has no .replace().
    if not isinstance(subject, str):
        return (-1, -1)
    match = REX_NODE_AND_SLOT.search(subject.replace("_", " "))
    if match:
        node = int(match.group(1))
        slot = int(match.group(3))
        # NOTE(review): layout_df only describes nodes 4..251, so nodes 0-3
        # and 252 pass this check but will not join to a position - confirm
        # whether the wider bound is intentional.
        if 0 <= node <= 252 and 0 <= slot < 84:
            return (node, slot)
    return (-1, -1)

# Parse every subject once into a (node, slot) tuple, then unpack the tuple
# positions into their own integer columns on failures_df.
tmp = failures_df.apply(find_node_and_slot, axis=1)
for position, column in enumerate(["node", "slot"]):
    failures_df[column] = tmp.apply(lambda pair, i=position: int(pair[i]))
failures_df.head()

Unnamed: 0,Subject,Date Created,Date Completed,Last Modified,node,slot
0,2202052303 snx11168n185 Z4F05WV3 S41 disk driv...,2/7/2022 7:03 AM,2/10/2022 1:21 PM,2/10/2022 1:21 PM,185,41
1,2201310000 snx11168n208 Z4F04FCF S48 disk driv...,1/31/2022 6:32 AM,2/3/2022 9:03 AM,2/3/2022 9:03 AM,208,48
2,2201211916 snx11168n173 Z4F05VST S82 disk driv...,1/24/2022 6:49 AM,1/31/2022 8:52 AM,1/31/2022 8:52 AM,173,82
3,2201201529 snx11168n143 Z4F04W8H S64 drive was...,1/21/2022 6:55 AM,,2/9/2022 4:45 PM,143,64
4,2201200208 snx11168n159 Z4F061R2 S57 disk driv...,1/20/2022 6:30 AM,1/24/2022 9:01 AM,1/24/2022 9:02 AM,159,57


In [7]:
# Attach the physical position of each failure by looking its node number up
# in the layout table.  Rows whose node has no layout entry (including the -1
# sentinel for unparsed subjects) get -1 in every position column, which also
# lets those columns be stored as integers.
position_columns = ("ssu", "rack", "ru_low", "ru_high")
result_df = (
    failures_df
    .join(layout_df.set_index("node"), on="node")
    .fillna({column: -1 for column in position_columns})
    .astype({column: int for column in position_columns})
)
result_df.index.name = "id"
result_df

Unnamed: 0_level_0,Subject,Date Created,Date Completed,Last Modified,node,slot,ssu,rack,ru_low,ru_high
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2202052303 snx11168n185 Z4F05WV3 S41 disk driv...,2/7/2022 7:03 AM,2/10/2022 1:21 PM,2/10/2022 1:21 PM,185,41,90,16,11,15
1,2201310000 snx11168n208 Z4F04FCF S48 disk driv...,1/31/2022 6:32 AM,2/3/2022 9:03 AM,2/3/2022 9:03 AM,208,48,102,17,31,35
2,2201211916 snx11168n173 Z4F05VST S82 disk driv...,1/24/2022 6:49 AM,1/31/2022 8:52 AM,1/31/2022 8:52 AM,173,82,84,15,21,25
3,2201201529 snx11168n143 Z4F04W8H S64 drive was...,1/21/2022 6:55 AM,,2/9/2022 4:45 PM,143,64,69,13,26,30
4,2201200208 snx11168n159 Z4F061R2 S57 disk driv...,1/20/2022 6:30 AM,1/24/2022 9:01 AM,1/24/2022 9:02 AM,159,57,77,14,26,30
...,...,...,...,...,...,...,...,...,...,...
753,1509011812 R01 snx11168n006/007 Z4F05J0Q S81 f...,9/22/2015 10:19 AM,,1/3/2018 1:37 PM,6,81,1,1,6,10
754,1508311600 R02 snx11168n030/031 Z4F05E9P S26 f...,9/22/2015 10:13 AM,,1/3/2018 1:37 PM,30,26,13,2,26,30
755,1508281258 R07 snx11168n104/105 Z4F04GQR S08 f...,9/22/2015 10:03 AM,,1/3/2018 1:37 PM,104,8,50,7,11,15
756,1508280915 R06 snx11168n099/100 Z3E01BLA SSD S...,9/22/2015 9:54 AM,,1/3/2018 1:37 PM,99,28,47,6,36,40


In [8]:
# Export the timestamps and position columns only; the free-text Subject
# column is left out of the CSV.  The "id" index is written as the first column.
export_columns = [
    "Date Created",
    "Date Completed",
    "Last Modified",
    "node",
    "slot",
    "ssu",
    "rack",
    "ru_low",
    "ru_high",
]
result_df[export_columns].to_csv("cscratch_drive_failures.csv")