In [2]:
%pip install pandas numpy

Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.2.5-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------  11.5/11.6 MB 144.5 MB/s eta 0:00:01
   ---------------------------------------- 11.6/11.6 MB 48.2 MB/s eta 0:00:00
Downloading numpy-2.2.5-cp310-cp310-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 12.9/12.9 MB 134.9 MB/s eta 0:00:00
Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Insta

In [2]:
import pandas as pd
import numpy as np
import re


file_path = "dataset\dcgm.csv"

print("Loading the dataset")

try:
    dataset = pd.read_csv(file_path)
    print("Dataset loaded successfully")
except FileNotFoundError:
    print("File was not found")
    exit()

#display info
print("Printing dataset information")
print(dataset.info())
#display few items from top
print("Printing few items from top")
print(dataset.head())

Loading the dataset
Dataset loaded successfully
Printing dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96893 entries, 0 to 96892
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Node                           96893 non-null  object 
 1   avgmemoryutilization_pct       96893 non-null  float64
 2   avgsmutilization_pct           96893 non-null  float64
 3   energyconsumed_joules          96893 non-null  float64
 4   gpu_id                         96893 non-null  int64  
 5   maxgpumemoryused_bytes         96893 non-null  float64
 6   memoryutilization_pct_avg      96893 non-null  float64
 7   memoryutilization_pct_max      96893 non-null  float64
 8   memoryutilization_pct_min      96893 non-null  float64
 9   pcierxbandwidth_megabytes_avg  96893 non-null  float64
 10  pcierxbandwidth_megabytes_max  96893 non-null  float64
 11  pcierxbandwidth_megabytes_min

In [4]:
#Cleaingin the data
print("Cleaning the data")

#Handle missing values, i.e. filling NaN with 0
dataset.fillna(0,inplace=True)

#Remove redundant rows
dataset.drop_duplicates(inplace=True)

#Remove columns that are not required
if 'gpu_id' in dataset.columns:
    dataset.drop(columns=['gpu_id'], inplace=True)


print("dataset is cleaned")
print(dataset.info())

Cleaning the data
dataset is cleaned
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96893 entries, 0 to 96892
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Node                           96893 non-null  object 
 1   avgmemoryutilization_pct       96893 non-null  float64
 2   avgsmutilization_pct           96893 non-null  float64
 3   energyconsumed_joules          96893 non-null  float64
 4   maxgpumemoryused_bytes         96893 non-null  float64
 5   memoryutilization_pct_avg      96893 non-null  float64
 6   memoryutilization_pct_max      96893 non-null  float64
 7   memoryutilization_pct_min      96893 non-null  float64
 8   pcierxbandwidth_megabytes_avg  96893 non-null  float64
 9   pcierxbandwidth_megabytes_max  96893 non-null  float64
 10  pcierxbandwidth_megabytes_min  96893 non-null  float64
 11  pcietxbandwidth_megabytes_avg  96893 non-null  float64
 12  pcietxban

In [7]:
print("Formatting the dataset now")

#change the type of 'totalexecutiontime_sec' to float
if 'totalexecutiontime_sec' in dataset.columns:
    dataset['totalexecutiontime_sec'] = dataset['totalexecutiontime_sec'].astype(float)

#change the datatype of all numerical column to float32 to optmize memory
numeric_columns = dataset.select_dtypes(include=['int64','float64']).columns
dataset[numeric_columns] = dataset[numeric_columns].astype('float32')

print("DataSet formatted successfully")
print(dataset.info())

Formatting the dataset now
DataSet formatted successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96893 entries, 0 to 96892
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Node                           96893 non-null  object 
 1   avgmemoryutilization_pct       96893 non-null  float32
 2   avgsmutilization_pct           96893 non-null  float32
 3   energyconsumed_joules          96893 non-null  float32
 4   maxgpumemoryused_bytes         96893 non-null  float32
 5   memoryutilization_pct_avg      96893 non-null  float32
 6   memoryutilization_pct_max      96893 non-null  float32
 7   memoryutilization_pct_min      96893 non-null  float32
 8   pcierxbandwidth_megabytes_avg  96893 non-null  float32
 9   pcierxbandwidth_megabytes_max  96893 non-null  float32
 10  pcierxbandwidth_megabytes_min  96893 non-null  float32
 11  pcietxbandwidth_megabytes_avg  96893 non-null  f

In [10]:
print("Annotate dataset")

def annotate_cpu_utilization(value) -> str:
    if value > 80:
        return "High"
    if value > 50:
        return "Medium"
    if value > 20:
        return "Low"
    else:
        return "Idle"
    
if "avgsmutilization_pct" in dataset.columns:
    dataset['gpu_utilization_category'] = dataset['avgsmutilization_pct'].apply(annotate_cpu_utilization)

print("Display the data set")
dataset.head()

Annotate dataset
Display the data set


Unnamed: 0,Node,avgmemoryutilization_pct,avgsmutilization_pct,energyconsumed_joules,maxgpumemoryused_bytes,memoryutilization_pct_avg,memoryutilization_pct_max,memoryutilization_pct_min,pcierxbandwidth_megabytes_avg,pcierxbandwidth_megabytes_max,...,pcietxbandwidth_megabytes_min,powerusage_watts_avg,powerusage_watts_max,powerusage_watts_min,smutilization_pct_avg,smutilization_pct_max,smutilization_pct_min,totalexecutiontime_sec,id_job,gpu_utilization_category
0,r9170217-n750018,0.0,1.0,157521.0,2645557000.0,3.0,24.0,0.0,1646.0,1748.0,...,676.0,44.0546,156.962006,25.468,11.0,64.0,0.0,4452.470215,4393785000.0,Idle
1,r44132-n469339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54,18177100000.0,Idle
2,r5354770-n172107,0.0,3.0,94816.0,29171710.0,0.0,0.0,0.0,1846.0,1849.0,...,1346.0,26.372499,27.483,25.979,6.0,43.0,0.0,8398.299805,39529490000.0,Idle
3,r1900814-n772143,0.0,0.0,69451.0,1527587000.0,0.0,1.0,0.0,539.0,542.0,...,293.0,37.5481,52.039001,26.676001,1.0,25.0,0.0,1851.719971,19994330000.0,Idle
4,r9515777-n469339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1991.0,1991.0,...,1109.0,25.032,25.032,25.032,0.0,0.0,0.0,0.99,397881700.0,Idle


In [None]:
print("Writing the prepared dataset to a new file")

outputfilename= "dataset/prepared_dcgm.csv"

dataset.to_csv(outputfilename, index=False)

print(f"prepared dataset is at {outputfilename}")

Writing the prepared dataset to a new file
prepared dataset is at dataset/prepared_dcgm.csv
