In [1]:
%run -i "preamble.py"

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from tueplots import bundles
from tueplots.constants.color import rgb
# Load the raw job records from the pipe-delimited export.
# JobID is an identifier, not a number: values such as "633", "633.0",
# "633.batch" and "5419_[0-10]" all live in this one column. Pinning it to
# str keeps pandas' type inference from parsing part of the column as
# ints/floats, which would break the `.str`-based filtering applied to it.
df = pd.read_csv("data/jobs_ferranti.csv", sep="|", dtype={"JobID": str})

In [3]:
import pandas as pd
import re

# Function to classify the structure of the JobID
def get_job_id_type(jid):
    """Return a human-readable label describing the format of a job identifier.

    The value is converted with ``str`` first and then checked in this order
    (the first matching rule wins):

    1. only digits               -> standard integer job
    2. contains a ``.``          -> job step; numeric or string, decided by
                                    the text after the last dot
    3. contains ``[`` and ``]``  -> array range summary
    4. contains ``_``            -> array task instance
    5. anything else             -> other / unknown

    For string job steps the label embeds the actual suffix (e.g. ``batch``),
    so every distinct suffix produces its own label.
    """
    text = str(jid)

    if text.isdigit():
        return "Standard Integer (e.g. 633)"

    if '.' in text:
        # Only the part after the last dot decides which kind of step it is.
        _, _, step = text.rpartition('.')
        if step.isdigit():
            return "Job Step: Numeric (e.g. 633.0)"
        return f"Job Step: String (e.g. 633.{step})"

    if '[' in text and ']' in text:
        return "Array Range Summary (e.g. 5419_[0-10])"

    if '_' in text:
        return "Array Task Instance (e.g. 5419_1)"

    return "Other / Unknown"

# Step 1: label every row with its JobID format.
# The unfiltered 'df' is used on purpose so that every format present in the
# raw data shows up in the overview.
job_types = df['JobID'].map(get_job_id_type)


def _pick_examples(ids):
    """Return up to three JobIDs of a group (a seeded sample if it has more than three)."""
    if len(ids) > 3:
        return ids.sample(3, random_state=1).tolist()
    return ids.tolist()


# Step 2: one row per format with its row count and a few example JobIDs.
summary = (
    df.groupby(job_types)['JobID']
    .agg(Count='count', Examples=_pick_examples)
    .reset_index()
)

print(summary)

                                     JobID   Count  \
0   Array Range Summary (e.g. 5419_[0-10])      82   
1        Array Task Instance (e.g. 5419_1)  221909   
2           Job Step: Numeric (e.g. 633.0)  249476   
3        Job Step: String (e.g. 633.batch)  261845   
4       Job Step: String (e.g. 633.extern)  267505   
5  Job Step: String (e.g. 633.interactive)      93   
6              Standard Integer (e.g. 633)   74959   

                                            Examples  
0   [36906_[0-2%3], 275400_[11-21], 76697_[0-39%40]]  
1                  [256050_11, 257219_25, 184756_10]  
2               [238784_8.0, 251986_3.0, 277515_5.0]  
3  [106500_0.batch, 107646_20.batch, 182475_19.ba...  
4  [36859_6.extern, 291294_47.extern, 283504_64.e...  
5  [222626.interactive, 243082.interactive, 22238...  
6                              [10627, 42452, 39677]  


In [4]:
# Keep only the "real allocations" (types 1 and 6 of the summary; see Notion).
# A row qualifies when its JobID contains neither a dot nor an opening bracket.
has_no_dot = ~df['JobID'].str.contains(r'\.', regex=True)
has_no_bracket = ~df['JobID'].str.contains(r'\[', regex=True)
mask_real_allocations = has_no_dot & has_no_bracket

# Work on an independent copy so later edits never touch the raw frame.
df_analysis = df.loc[mask_real_allocations].copy()

print(f"Total Rows in raw data: {len(df)}")
print(f"Total Unique Allocations (Types 1 & 6): {len(df_analysis)}")
df_analysis.head()

Total Rows in raw data: 1075869
Total Unique Allocations (Types 1 & 6): 296868


Unnamed: 0,JobID,Partition,State,ExitCode,Submit,Start,End,Elapsed,AllocTRES,ConsumedEnergyRaw
0,633,h100-ferranti,FAILED,1:0,2024-10-15T13:36:43,2024-10-15T13:36:43,2024-10-15T13:36:43,00:00:00,"billing=208,cpu=1,gres/gpu:h100=8,gres/gpu=8,m...",0.0
2,634,h100-ferranti,FAILED,1:0,2024-10-15T13:37:07,2024-10-15T13:37:07,2024-10-15T13:37:07,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
4,635,h100-ferranti,FAILED,1:0,2024-10-15T13:37:12,2024-10-15T13:37:12,2024-10-15T13:37:12,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
6,636,h100-ferranti,FAILED,1:0,2024-10-15T13:38:27,2024-10-15T13:38:27,2024-10-15T13:38:27,00:00:00,"billing=208,cpu=1,mem=2063800M,node=1",0.0
8,637,h100-ferranti,FAILED,127:0,2024-10-15T13:42:37,2024-10-15T13:42:37,2024-10-15T13:42:48,00:00:11,"billing=208,cpu=1,mem=2063800M,node=1",0.0
