✅ Loads the cleaned dataset

✅ Adds regression + classification labels

✅ Creates sliding-window features

✅ Saves final ML-ready datasets for your Java models

In [1]:
import pandas as pd
import numpy as np
import os

# Input: the cleaned dataset CSV; "timestamp" is parsed into datetimes on load.
CLEAN_PATH = "/Users/azka/Downloads/Java/data/bitbrains_clean_all.csv"

df = pd.read_csv(CLEAN_PATH, parse_dates=["timestamp"])

# shift() below moves values by row position, not by time. Put every VM's
# rows in chronological order first so "FUTURE_STEPS rows later" really is
# a later sample of the same VM.
df = df.sort_values(["vm_id", "timestamp"]).reset_index(drop=True)
df.head()

FUTURE_STEPS = 6  # 6 * 5-min = 30 min
HIGH_LOAD_THRESHOLD = 80  # CPU % above which a sample counts as "high load"

# Regression label: the same VM's CPU usage FUTURE_STEPS rows later.
# The last FUTURE_STEPS rows of each VM have no such row and become NaN.
df["cpu_future_30min"] = (
    df.groupby("vm_id")["cpu_usage_percent"].shift(-FUTURE_STEPS)
)

# Classification label for the current sample.
df["high_load"] = (df["cpu_usage_percent"] > HIGH_LOAD_THRESHOLD).astype(int)

# Classification label for the future sample.
# NOTE: NaN > threshold is False, so rows without a future value get 0 here;
# they must not be used as training labels.
df["high_load_future_30min"] = (
    df["cpu_future_30min"] > HIGH_LOAD_THRESHOLD
).astype(int)

df.head()

WINDOW = 12  # 1 hour of history

def create_sliding_windows(df, window=12, future_steps=6):
    """Build per-VM sliding-window samples with regression and class labels.

    Rows are grouped by ``vm_id``. Within each group, every run of ``window``
    consecutive ``cpu_usage_percent`` values becomes one feature vector. Its
    labels are read from the ``cpu_future_30min`` and
    ``high_load_future_30min`` columns at the row immediately after the
    window. Rows are used in the order they appear in ``df``.

    Windows whose regression target is missing (NaN) are skipped, so no
    sample is emitted with an undefined target and a class label that was
    derived from that missing value.

    NOTE(review): the label row is ``i + window``, i.e. one row past the last
    feature value. If ``cpu_future_30min`` was built with
    ``shift(-future_steps)``, the target lies ``future_steps + 1`` rows after
    the last feature -- confirm this is the intended horizon.

    Args:
        df: Frame with columns ``vm_id``, ``cpu_usage_percent``,
            ``cpu_future_30min`` and ``high_load_future_30min``.
        window: Number of consecutive CPU values per feature vector.
        future_steps: Number of trailing rows per VM that are excluded from
            the window start positions (in addition to ``window``).

    Returns:
        DataFrame with columns ``vm_id``, ``features`` (one array of
        ``window`` values per row), ``target_cpu_future`` and
        ``target_high_load``. Empty (columns only) when no VM has enough rows.
    """
    X_list = []
    y_list = []
    class_list = []
    vm_list = []

    for vm_id, group in df.groupby("vm_id"):
        cpu_vals = group["cpu_usage_percent"].values
        future_vals = group["cpu_future_30min"].values
        class_vals = group["high_load_future_30min"].values

        # A non-positive count makes range() empty, so short groups
        # simply contribute no samples.
        n_windows = len(cpu_vals) - window - future_steps

        for i in range(n_windows):
            label_row = i + window

            # future CPU (regression target)
            y_val = future_vals[label_row]
            if pd.isna(y_val):
                # No known future value: the class label on this row was
                # computed from NaN and is meaningless, so drop the sample.
                continue

            # past `window` CPU values
            X_list.append(cpu_vals[i:label_row])
            y_list.append(y_val)
            # classification label
            class_list.append(class_vals[label_row])
            vm_list.append(vm_id)

    return pd.DataFrame({
        "vm_id": vm_list,
        "features": X_list,
        "target_cpu_future": y_list,
        "target_high_load": class_list
    })

# Build all sliding-window samples once; the exports below are column
# subsets of this single frame.
window_df = create_sliding_windows(df, window=WINDOW, future_steps=FUTURE_STEPS)
window_df.head()

# Destination files for the three exports.
OUTPUT_FULL = "/Users/azka/Downloads/Java/data/bitbrains_ml_windows.csv"
OUTPUT_REG = "/Users/azka/Downloads/Java/data/bitbrains_regression.csv"
OUTPUT_CLS = "/Users/azka/Downloads/Java/data/bitbrains_classification.csv"

# Regression export keeps only the numeric target; the classification
# export keeps only the 0/1 target.
regression_df = window_df[["vm_id", "features", "target_cpu_future"]]
classification_df = window_df[["vm_id", "features", "target_high_load"]]

# NOTE(review): "features" holds one array per row, so to_csv writes each
# cell as that array's text form -- confirm the Java readers expect this.
for _message, _frame, _path in (
    ("Saved full ML dataset:", window_df, OUTPUT_FULL),
    ("Saved regression dataset:", regression_df, OUTPUT_REG),
    ("Saved classification dataset:", classification_df, OUTPUT_CLS),
):
    _frame.to_csv(_path, index=False)
    print(_message, _path)


Saved full ML dataset: /Users/azka/Downloads/Java/data/bitbrains_ml_windows.csv
Saved regression dataset: /Users/azka/Downloads/Java/data/bitbrains_regression.csv
Saved classification dataset: /Users/azka/Downloads/Java/data/bitbrains_classification.csv
