In [2]:
%run -i "preamble.py"

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from tueplots import bundles
from tueplots.constants.color import rgb
# Load the pipe-delimited job export into the working frame `df`.
df = pd.read_csv(
    "data/jobs_ferranti.csv",
    sep="|",
)

In [5]:
import pandas as pd
import re

# Function to classify the structure of the JobID
def get_job_id_type(jid):
    """Return a human-readable label describing the format of a JobID.

    The value is converted to ``str`` first, so non-string inputs are accepted.
    Checks are applied in this order and the first match wins:

    1. digits only                      -> "Standard Integer"
    2. contains a '.'                   -> "Job Step" (numeric or string,
       decided by the text after the last '.')
    3. contains both '[' and ']'        -> "Array Range Summary"
    4. contains '_'                     -> "Array Task Instance"
    5. anything else                    -> "Other / Unknown"

    For string job steps the actual suffix is embedded in the label, so each
    distinct suffix (e.g. ``batch``, ``extern``) yields its own category.
    """
    text = str(jid)

    if text.isdigit():
        return "Standard Integer (e.g. 633)"

    # `dot` is empty when there is no '.', otherwise `step` is whatever
    # follows the last '.' in the ID.
    _, dot, step = text.rpartition('.')
    if dot:
        if step.isdigit():
            return "Job Step: Numeric (e.g. 633.0)"
        return f"Job Step: String (e.g. 633.{step})"

    if '[' in text and ']' in text:
        return "Array Range Summary (e.g. 5419_[0-10])"

    if '_' in text:
        return "Array Task Instance (e.g. 5419_1)"

    return "Other / Unknown"

# Step 1: label every row by the format of its JobID.
# This runs on the original, still unfiltered `df` so that every kind of
# JobID shows up in the summary.
job_types = df['JobID'].apply(get_job_id_type)


def _example_job_ids(ids):
    """Return three reproducibly sampled JobIDs, or all of them if <= 3."""
    if len(ids) > 3:
        return ids.sample(3, random_state=1).tolist()
    return ids.tolist()


# Step 2: per category, count the rows and collect a few example IDs.
summary = (
    df.groupby(job_types)['JobID']
    .agg(Count='count', Examples=_example_job_ids)
    .reset_index()
)

print(summary)

                                     JobID   Count  \
0   Array Range Summary (e.g. 5419_[0-10])      82   
1        Array Task Instance (e.g. 5419_1)  221909   
2           Job Step: Numeric (e.g. 633.0)  249476   
3        Job Step: String (e.g. 633.batch)  261845   
4       Job Step: String (e.g. 633.extern)  267505   
5  Job Step: String (e.g. 633.interactive)      93   
6              Standard Integer (e.g. 633)   74959   

                                            Examples  
0   [36906_[0-2%3], 275400_[11-21], 76697_[0-39%40]]  
1                  [256050_11, 257219_25, 184756_10]  
2               [238784_8.0, 251986_3.0, 277515_5.0]  
3  [106500_0.batch, 107646_20.batch, 182475_19.ba...  
4  [36859_6.extern, 291294_47.extern, 283504_64.e...  
5  [222626.interactive, 243082.interactive, 22238...  
6                              [10627, 42452, 39677]  


In [6]:
# Keep only "Real Allocations": rows 1 and 6 of the summary table, i.e. array
# task instances and standard integer JobIDs (see Notion).
# A row qualifies when its JobID contains neither a dot (job steps) nor an
# opening bracket (array range summaries).
job_id_has_no_dot = ~df['JobID'].str.contains(r'\.', regex=True)
job_id_has_no_bracket = ~df['JobID'].str.contains(r'\[', regex=True)
mask_real_allocations = job_id_has_no_dot & job_id_has_no_bracket

print(f"Total Rows in data before cutting: {len(df)}")

# NOTE: `df` is rebound to the filtered copy; the unfiltered frame is no
# longer reachable under this name after this cell.
df = df[mask_real_allocations].copy()

print(f"Total Unique Allocations (Types 1 & 6): {len(df)}")
df.head()

Total Rows in data before cutting: 1075869
Total Unique Allocations (Types 1 & 6): 296868


Unnamed: 0,JobID,Partition,State,ExitCode,Submit,Start,End,Elapsed,AllocTRES,ConsumedEnergyRaw
0,633,h100-ferranti,FAILED,1:0,2024-10-15T13:36:43,2024-10-15T13:36:43,2024-10-15T13:36:43,00:00:00,"billing=208,cpu=1,gres/gpu:h100=8,gres/gpu=8,m...",0.0
2,634,h100-ferranti,FAILED,1:0,2024-10-15T13:37:07,2024-10-15T13:37:07,2024-10-15T13:37:07,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
4,635,h100-ferranti,FAILED,1:0,2024-10-15T13:37:12,2024-10-15T13:37:12,2024-10-15T13:37:12,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
6,636,h100-ferranti,FAILED,1:0,2024-10-15T13:38:27,2024-10-15T13:38:27,2024-10-15T13:38:27,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
8,637,h100-ferranti,FAILED,127:0,2024-10-15T13:42:37,2024-10-15T13:42:37,2024-10-15T13:42:48,00:00:11,"billing=208,cpu=1,mem=2063800M,node=1",0.0


In [7]:
# Drop rows whose partition is not one of the known, valid partitions.
# First show what is present so the filter below can be sanity-checked.
print("--- Unique Partitions ---")
print(df['Partition'].unique())

print("\n--- Partition Counts ---")
print(df['Partition'].value_counts())

# Only exact matches are kept; any other value in the column (including
# comma-separated combinations of partitions) is dropped by the filter.
valid_partitions = [
    'h100-ferranti',
    'cpu-ferranti',
    'h100-preemptable-ferranti',
    # TODO: decide whether to keep the partition below; it seems like a real
    # (albeit rare/misnamed) partition.
    # 'h1001-ferranti'
]

is_valid_partition = df['Partition'].isin(valid_partitions)
df = df[is_valid_partition].copy()

print("\n--- Filtered Partition Counts ---")
print(df['Partition'].value_counts())
print(f"Total Rows after partition filtering: {len(df)}")

--- Unique Partitions ---
['h100-ferranti' 'cpu-ferranti' 'h100-preemptable-ferranti'
 'h1001-ferranti' 'h100-preemptable-ferranti,h100-ferranti'
 'h100-ferranti,h100-preemptable-ferranti']

--- Partition Counts ---
Partition
h100-ferranti                              222581
cpu-ferranti                                68963
h100-preemptable-ferranti                    5254
h1001-ferranti                                 67
h100-preemptable-ferranti,h100-ferranti         2
h100-ferranti,h100-preemptable-ferranti         1
Name: count, dtype: int64

--- Filtered Partition Counts ---
Partition
h100-ferranti                222581
cpu-ferranti                  68963
h100-preemptable-ferranti      5254
Name: count, dtype: int64
Total Rows after partition filtering: 296798


In [8]:
# Inspect the raw `State` values, then collapse every "CANCELLED by <id>"
# variant into a single "CANCELLED" state.
#
# NOTE(review): the numeric suffix in "CANCELLED by <id>" looks like a user ID.
# Confirm whether it is sensitive before sharing this notebook's outputs.
print("\n--- Top 10 Most Common States ---")  # label now matches .head(10)
print(df['State'].value_counts().head(10))

print("\n--- Sample of 'Weird' States ---")
# States longer than 15 characters are treated as unusual; in practice these
# are the ones carrying a " by <id>" suffix.
is_long_state = df['State'].astype(str).str.len() > 15
weird_states = df.loc[is_long_state, 'State'].unique()
print(weird_states[:10])

# Unify all the distinct "cancels" into one "cancel": keep only the first
# whitespace-separated token of each state.
df['State'] = df['State'].astype(str).str.split().str[0]

print("\n--- Top 20 Most Common States After Unifying ---")
print(df['State'].value_counts().head(20))
df.head()


--- Top 20 Most Common States ---
State
COMPLETED            221989
FAILED                27020
CANCELLED by 5074     17380
TIMEOUT                5272
CANCELLED by 4789      3882
CANCELLED by 4673      2356
CANCELLED by 4562      2301
CANCELLED by 4400      1695
CANCELLED by 5003      1634
OUT_OF_MEMORY           982
Name: count, dtype: int64

--- Sample of 'Weird' States ---
['CANCELLED by 4573' 'CANCELLED by 4635' 'CANCELLED by 4321'
 'CANCELLED by 4559' 'CANCELLED by 4532' 'CANCELLED by 4537'
 'CANCELLED by 4814' 'CANCELLED by 4272' 'CANCELLED by 4872'
 'CANCELLED by 4429']

--- Top 20 Most Common States After Unifying ---
State
COMPLETED        221989
CANCELLED         41249
FAILED            27020
TIMEOUT            5272
OUT_OF_MEMORY       982
NODE_FAIL           114
PENDING              91
PREEMPTED            44
RUNNING              37
Name: count, dtype: int64


Unnamed: 0,JobID,Partition,State,ExitCode,Submit,Start,End,Elapsed,AllocTRES,ConsumedEnergyRaw
0,633,h100-ferranti,FAILED,1:0,2024-10-15T13:36:43,2024-10-15T13:36:43,2024-10-15T13:36:43,00:00:00,"billing=208,cpu=1,gres/gpu:h100=8,gres/gpu=8,m...",0.0
2,634,h100-ferranti,FAILED,1:0,2024-10-15T13:37:07,2024-10-15T13:37:07,2024-10-15T13:37:07,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
4,635,h100-ferranti,FAILED,1:0,2024-10-15T13:37:12,2024-10-15T13:37:12,2024-10-15T13:37:12,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
6,636,h100-ferranti,FAILED,1:0,2024-10-15T13:38:27,2024-10-15T13:38:27,2024-10-15T13:38:27,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
8,637,h100-ferranti,FAILED,127:0,2024-10-15T13:42:37,2024-10-15T13:42:37,2024-10-15T13:42:48,00:00:11,"billing=208,cpu=1,mem=2063800M,node=1",0.0
