In [19]:
# ============================================
# 0. Imports & Paths
# ============================================
import os
import re
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    accuracy_score,
    f1_score,
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ---- Adjust paths if needed ----
# DATA_DIR is the single folder that holds both the input dataset and the
# prediction file written for CloudSim; both file paths are derived from it.
DATA_DIR = "/Users/azka/Downloads/Java/data"
ML_DATA_PATH, PRED_OUTPUT_PATH = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        "bitbrains_ml_windows.csv",
        "bitbrains_predictions_for_cloudsim.csv",
    )
)

# Echo the resolved locations so a wrong DATA_DIR is noticed before the load.
print("ML_DATA_PATH:", ML_DATA_PATH)
print("PRED_OUTPUT_PATH:", PRED_OUTPUT_PATH)

# ============================================
# 1. Load ML dataset
# ============================================
# Load the windowed ML dataset from ML_DATA_PATH into memory.
df = pd.read_csv(ML_DATA_PATH)

# Quick structural summary: size, per-column dtypes, and column names.
print("Shape:", df.shape)
print(df.dtypes)
print("Columns:", df.columns.tolist())
# A bare `df.head()` in the middle of a cell is evaluated and thrown away
# (only a cell's final expression is displayed), so print the preview instead.
print(df.head())


# ============================================
# 2. Parse 'features' column (NumPy-style strings)
#    Example string: "[93.23 85.11 77.92]"
# ============================================

def parse_numpy_array(x: str):
    """
    Convert a stringified 1-D numeric array into a Python list of floats.

    Accepted string forms (surrounding whitespace and embedded line breaks
    are ignored):
        "[93.23333333 93.05 89.15]"     whitespace-separated (NumPy-style)
        "[93.23333333, 93.05, 89.15]"   comma-separated (Python-list-style)

    Parameters
    ----------
    x : str, list or numpy.ndarray
        The raw cell value. A list/ndarray is treated as already parsed and
        is returned as a plain list with its elements unchanged.

    Returns
    -------
    list
        The parsed values in order; an empty list for "[]".

    Raises
    ------
    ValueError
        If `x` is not a str/list/ndarray, if the string is not wrapped in
        square brackets, or if any token is not a valid float literal
        (e.g. a "..." placeholder from an abbreviated array repr).
    """
    if isinstance(x, (list, np.ndarray)):
        # Already parsed
        return list(x)

    if not isinstance(x, str):
        raise ValueError(f"Unexpected features type: {type(x)}")

    s = x.strip()
    if not (s.startswith("[") and s.endswith("]")):
        raise ValueError(f"Invalid array string: {x[:50]}...")

    # Split on any run of whitespace and/or commas so both separator styles
    # (and multi-line reprs) are handled; empty tokens produced by leading or
    # trailing separators, or by an empty "[]", are dropped.
    tokens = re.split(r"[\s,]+", s[1:-1])
    return [float(tok) for tok in tokens if tok]

# Apply parser
# Parse every row's "features" string into a list of floats. The timer spans
# both the per-row parse and the stacking step below.
t0 = time.time()
feat_list = df["features"].apply(parse_numpy_array)

# Show the first parsed vector as a sanity check on the parser.
_first_vector = feat_list.iloc[0]
print("Parsed one example:", _first_vector)
print("Length of example feature vector:", len(_first_vector))

# Combine the per-row lists into one 2-D array: (n_samples, n_features).
# np.stack needs every row to have the same length.
feat_array = np.stack(feat_list.values)
t1 = time.time()

_parse_seconds = t1 - t0
print("Feature matrix shape:", feat_array.shape)
print(f"Feature parsing time: {_parse_seconds:.2f} seconds")


# ============================================
# 3. Prepare targets (regression + classification)
# ============================================
# Fail fast, in a fixed order, if either label column is absent.
for _required in ("target_cpu_future", "target_high_load"):
    if _required not in df.columns:
        raise ValueError(f"Column '{_required}' not found in dataset!")

# One shared feature matrix feeds both tasks; only the label vector differs.
X = feat_array
y_reg = df["target_cpu_future"].values              # regression target
y_cls = df["target_high_load"].values.astype(int)   # classification target, cast to int

for _label, _array in (("X", X), ("y_reg", y_reg), ("y_cls", y_cls)):
    print(f"{_label} shape:", _array.shape)


# ============================================
# 4. Train / Test split (time-ordered)
#    We assume the rows are already in time order.
#    We'll use the first 80% for training, last 20% for testing.
# ============================================

# Positional split: the leading 80% of rows train the models, the trailing
# 20% are held out. No shuffling is performed.
# NOTE(review): the dataset carries a vm_id column, so the row order may be
# grouped per VM rather than globally chronological - confirm that a purely
# positional cut really is a time-ordered split.
n = X.shape[0]
split_idx = int(n * 0.8)

_train_rows = slice(None, split_idx)
_test_rows = slice(split_idx, None)

# Both tasks use the same rows; they differ only in the label vector.
X_train_reg, y_train_reg = X[_train_rows], y_reg[_train_rows]
X_test_reg, y_test_reg = X[_test_rows], y_reg[_test_rows]

X_train_cls, y_train_cls = X[_train_rows], y_cls[_train_rows]
X_test_cls, y_test_cls = X[_test_rows], y_cls[_test_rows]

print("Train size (reg):", X_train_reg.shape, "Test size (reg):", X_test_reg.shape)
print("Train size (cls):", X_train_cls.shape, "Test size (cls):", X_test_cls.shape)


ML_DATA_PATH: /Users/azka/Downloads/Java/data/bitbrains_ml_windows.csv
PRED_OUTPUT_PATH: /Users/azka/Downloads/Java/data/bitbrains_predictions_for_cloudsim.csv
Shape: (9640028, 4)
vm_id                  int64
features              object
target_cpu_future    float64
target_high_load       int64
dtype: object
Columns: ['vm_id', 'features', 'target_cpu_future', 'target_high_load']
Parsed one example: [93.23333333, 93.05, 89.15, 90.05, 93.56666667, 93.25, 92.75, 86.78333333, 89.51666667, 95.08333333, 94.25, 55.41666667]
Length of example feature vector: 12
Feature matrix shape: (9640028, 12)
Feature parsing time: 35.53 seconds
X shape: (9640028, 12)
y_reg shape: (9640028,)
y_cls shape: (9640028,)
Train size (reg): (7712022, 12) Test size (reg): (1928006, 12)
Train size (cls): (7712022, 12) Test size (cls): (1928006, 12)


In [20]:
# ============================================
# 5. Define models (Regressor + Classifier)
# ============================================

# Settings common to both forests: a fixed seed, and n_jobs=-1 for parallel fitting.
_rf_shared = {"random_state": 42, "n_jobs": -1}

# Regressor for target_cpu_future: standardise the features, then fit a
# 100-tree random forest.
reg_model = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor(n_estimators=100, **_rf_shared)),
])

# Classifier for target_high_load: same preprocessing, a 200-tree forest, and
# class_weight="balanced" to reweight the classes.
cls_model = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        **_rf_shared,
    )),
])

# ============================================
# 6. Train models
# ============================================

# ---- Regression model ----
# Fit each pipeline on the training rows and report wall-clock fit time.
# The status strings previously contained mis-decoded bytes ("âœ…"); they are
# restored to the intended check-mark character.

# ---- Regression model ----
t0 = time.time()
reg_model.fit(X_train_reg, y_train_reg)
t1 = time.time()
print(f"✅ Trained RandomForestRegressor in {t1 - t0:.2f} seconds")

# ---- Classification model ----
t0 = time.time()
cls_model.fit(X_train_cls, y_train_cls)
t1 = time.time()
print(f"✅ Trained RandomForestClassifier in {t1 - t0:.2f} seconds")


# ============================================
# 7. Evaluate on test split
# ============================================

# ---- Regression ----
# Score both models on the held-out rows only. The headers and the "R²"
# label previously contained mis-decoded bytes; they are restored here.

# ---- Regression ----
y_pred_reg = reg_model.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print("\n📈 Regression Metrics (target_cpu_future):")
print(f"  MAE: {mae:.4f}")
print(f"  R² : {r2:.4f}")

# ---- Classification ----
# f1_score is called with its defaults, so it reports the score for the
# positive class rather than an average over classes.
y_pred_cls = cls_model.predict(X_test_cls)
acc = accuracy_score(y_test_cls, y_pred_cls)
f1 = f1_score(y_test_cls, y_pred_cls)

print("\n📊 Classification Metrics (target_high_load):")
print(f"  Accuracy: {acc:.4f}")
print(f"  F1-score: {f1:.4f}")


# ============================================
# 8. Generate predictions for ALL rows (for CloudSim)
# ============================================

# Run both fitted models over the full feature matrix.
# NOTE: X includes the rows the models were fitted on (the first 80%), so
# predictions for those rows are in-sample; only the trailing 20% are
# genuinely out-of-sample.
t0 = time.time()
pred_cpu_future_all = reg_model.predict(X)
pred_high_load_all = cls_model.predict(X)
t1 = time.time()
print(f"Predicted all rows in {t1 - t0:.2f} seconds")

# One output row per input row: identifier, ground truth, and predictions.
pred_df = pd.DataFrame({
    "vm_id": df["vm_id"].values,
    "target_cpu_future": df["target_cpu_future"].values,
    "target_high_load": df["target_high_load"].values,
    "pred_cpu_future": pred_cpu_future_all,
    "pred_high_load": pred_high_load_all,
})

print("Prediction dataframe shape:", pred_df.shape)
# A bare `pred_df.head()` in the middle of a cell is evaluated and thrown
# away (only a cell's final expression is displayed), so print the preview.
print(pred_df.head())

# ============================================
# 9. Save predictions to CSV for CloudSim
# ============================================
# Make sure the output folder exists, then write the predictions without the
# pandas index column.
os.makedirs(DATA_DIR, exist_ok=True)
pred_df.to_csv(PRED_OUTPUT_PATH, index=False)
# The status message previously contained mis-decoded bytes ("âœ…"); restored.
print("✅ Saved predictions for CloudSim to:", PRED_OUTPUT_PATH)


✅ Trained RandomForestRegressor in 2291.16 seconds
✅ Trained RandomForestClassifier in 563.48 seconds

📈 Regression Metrics (target_cpu_future):
  MAE: 0.5698
  R² : 0.6356

📊 Classification Metrics (target_high_load):
  Accuracy: 0.9998
  F1-score: 0.6859
Predicted all rows in 148.08 seconds
Prediction dataframe shape: (9640028, 5)
✅ Saved predictions for CloudSim to: /Users/azka/Downloads/Java/data/bitbrains_predictions_for_cloudsim.csv


In [21]:
import os
import pandas as pd
import numpy as np

# Paths are re-declared here so this cell can be run without re-running the
# training cells; they must match the paths the predictions were saved under.
DATA_DIR = "/Users/azka/Downloads/Java/data"
PRED_OUTPUT_PATH = os.path.join(DATA_DIR, "bitbrains_predictions_for_cloudsim.csv")

# Load existing predictions
pred_df = pd.read_csv(PRED_OUTPUT_PATH)
print("Before:", pred_df.shape, pred_df.columns)

# Add a synthetic time index (slot): a running row number over the whole
# file (0 .. n_rows-1), not a per-VM counter.
slot_values = np.arange(len(pred_df))
if "slot" in pred_df.columns:
    # This cell overwrites its own input file, so on a second run the column
    # already exists and DataFrame.insert would raise ValueError. Refresh the
    # values in place instead so the cell is safe to re-run.
    pred_df["slot"] = slot_values
else:
    pred_df.insert(0, "slot", slot_values)  # add as first column

print("After:", pred_df.shape, pred_df.head())

# Save back (overwrite)
pred_df.to_csv(PRED_OUTPUT_PATH, index=False)
# The status message previously contained mis-decoded bytes ("âœ…"); restored.
print("✅ Re-saved with slot column at:", PRED_OUTPUT_PATH)


Before: (9640028, 5) Index(['vm_id', 'target_cpu_future', 'target_high_load', 'pred_cpu_future',
       'pred_high_load'],
      dtype='object')
After: (9640028, 6)    slot  vm_id  target_cpu_future  target_high_load  pred_cpu_future  \
0     0      1           0.666667                 0        15.810750   
1     1      1           0.583333                 0        14.845571   
2     2      1           0.583333                 0         9.758000   
3     3      1           0.583333                 0         9.291167   
4     4      1           0.650000                 0         8.572417   

   pred_high_load  
0               0  
1               0  
2               0  
3               0  
4               0  
✅ Re-saved with slot column at: /Users/azka/Downloads/Java/data/bitbrains_predictions_for_cloudsim.csv
