In [None]:
# Exploratory Data Analysis (CPU-Bound)

# In this notebook I perform EDA to explain the feature columns,
# their relationships, and everything else we should know about the
# given dataset.

In [None]:
# Feature Columns

# time_ms = Timestamp of measurement in milliseconds

# cpu_psi_some_* = Shows how much time tasks are delayed because CPU is saturated. 

# cpu_psi_full_* = Percentage of time all non-idle tasks are stalled, waiting for CPU.  

# io_psi_some_* = Shows delay caused by storage / disk IO.

# io_psi_full_* = Shows time during which all runnable tasks are blocked on IO.

# mem_psi_some_* = Shows delay caused by memory pressure (at least one task stalled waiting for memory).

# mem_psi_full_* = Shows all tasks stalled due to memory pressure.

# load_avg_* = Average number of runnable or IO-waiting tasks over time windows.

# procs_running = Number of processes currently runnable.

# procs_blocked = Number of processes blocked waiting for IO completion.

# procs_disk_io = Processes specifically blocked on disk IO.

# Some -> At least 1 task waiting.
# Full -> Entire workload stalled.

In [6]:
# Read CSV files
import pandas as pd
from pathlib import Path

# Dataset locations are resolved from the parent of the current working
# directory, so the kernel's cwd determines where the files are looked up.
PROJECT_ROOT = Path.cwd().parent
DATASET_DIR = PROJECT_ROOT.joinpath("datasets")
PRIO_DIR, RR_DIR = (
    DATASET_DIR / csv_name for csv_name in ("prio-cpu.csv", "rr-cpu.csv")
)

# Load each CSV file into its own DataFrame.
pd_prio, pd_rr = map(pd.read_csv, (PRIO_DIR, RR_DIR))

In [13]:
# Prepare both runs for the join: suffix every metric column with its scheduler
# tag, then add a scheduler label and a 1-based row id to join on.

# Columns that receive the scheduler suffix. One shared list replaces two
# hand-written 25-entry mappings that had to be kept in sync manually.
METRIC_COLUMNS = (
    "time_ms",
    "cpu_psi_some_10", "cpu_psi_some_60", "cpu_psi_some_300",
    "cpu_psi_full_10", "cpu_psi_full_60", "cpu_psi_full_300",
    "io_psi_some_10", "io_psi_some_60", "io_psi_some_300",
    "io_psi_full_10", "io_psi_full_60", "io_psi_full_300",
    "mem_psi_some_10", "mem_psi_some_60", "mem_psi_some_300",
    "mem_psi_full_10", "mem_psi_full_60", "mem_psi_full_300",
    "load_avg_1", "load_avg_5", "load_avg_15",
    "procs_running", "procs_blocked", "procs_disk_io",
)


def tag_scheduler_run(frame: pd.DataFrame, suffix: str, scheduler: str) -> pd.DataFrame:
    """Return a labelled copy of ``frame`` for one scheduler run.

    Args:
        frame: Measurements of a single run.
        suffix: Tag appended to each name in ``METRIC_COLUMNS`` as
            ``<name>_<suffix>``.
        scheduler: Value stored in the ``scheduler`` column of every row.

    Returns:
        A new DataFrame with the suffixed metric columns plus a ``scheduler``
        column and an ``id`` column equal to the row index + 1. Names from
        ``METRIC_COLUMNS`` that are absent from ``frame`` are skipped by
        ``rename``, so re-running the cell on an already tagged frame only
        rewrites ``scheduler`` and ``id``.
    """
    renamed = frame.rename(
        columns={name: f"{name}_{suffix}" for name in METRIC_COLUMNS}
    )
    # The id is positional (index + 1): row k of one run is later paired with
    # row k of the other run.
    return renamed.assign(scheduler=scheduler, id=renamed.index + 1)


pd_prio = tag_scheduler_run(pd_prio, suffix="prio", scheduler="PRIORITY")
pd_rr = tag_scheduler_run(pd_rr, suffix="rr", scheduler="RR")

In [17]:
# Join the two runs side by side (wide format) on the positional row id.
# - suffixes: "scheduler" is the only overlapping non-key column; label it
#   _prio/_rr like every other column instead of pandas' default _x/_y.
# - validate: raise if either frame contains duplicate ids, which would
#   otherwise silently multiply rows.
# NOTE: the inner join drops the trailing rows of the longer run when the two
# frames have different lengths.
df = pd.merge(
    pd_prio,
    pd_rr,
    on="id",
    how="inner",
    suffixes=("_prio", "_rr"),
    validate="one_to_one",
)

print(df.shape)
df.head(10)

(962, 53)


Unnamed: 0,time_ms_prio,cpu_psi_some_10_prio,cpu_psi_some_60_prio,cpu_psi_some_300_prio,cpu_psi_full_10_prio,cpu_psi_full_60_prio,cpu_psi_full_300_prio,io_psi_some_10_prio,io_psi_some_60_prio,io_psi_some_300_prio,...,mem_psi_full_10_rr,mem_psi_full_60_rr,mem_psi_full_300_rr,load_avg_1_rr,load_avg_5_rr,load_avg_15_rr,procs_running_rr,procs_blocked_rr,procs_disk_io_rr,scheduler_y
0,0,3.14,2.19,0.72,0,0,0,0.01,0.15,0.13,...,0,0,0,3.46,7.2,4.26,2,0,0,RR
1,1000,4.2,2.41,0.78,0,0,0,0.01,0.14,0.13,...,0,0,0,3.19,7.08,4.23,4,0,0,RR
2,2000,4.2,2.41,0.78,0,0,0,0.01,0.14,0.13,...,0,0,0,3.19,7.08,4.23,1,0,0,RR
3,3000,4.52,2.53,0.82,0,0,0,0.01,0.14,0.13,...,0,0,0,3.19,7.08,4.23,2,0,0,RR
4,4000,4.52,2.53,0.82,0,0,0,0.01,0.14,0.13,...,0,0,0,3.19,7.08,4.23,2,0,0,RR
5,5000,5.51,2.78,0.88,0,0,0,0.01,0.13,0.12,...,0,0,0,3.19,7.08,4.23,2,0,0,RR
6,6000,5.51,2.78,0.88,0,0,0,0.01,0.13,0.12,...,0,0,0,3.25,7.02,4.23,3,0,0,RR
7,7000,6.15,2.98,0.94,0,0,0,0.0,0.13,0.12,...,0,0,0,3.25,7.02,4.23,2,0,0,RR
8,8000,6.15,2.98,0.94,0,0,0,0.0,0.13,0.12,...,0,0,0,3.25,7.02,4.23,3,0,0,RR
9,9000,6.66,3.18,0.99,0,0,0,0.0,0.12,0.12,...,0,0,0,3.25,7.02,4.23,2,0,0,RR


In [None]:
# TODO Basic Cleaning

In [None]:
# TODO Summary Stats per Scheduler