In [2]:
import os
import glob
import pandas as pd
import numpy as np

In [3]:
# 1. Setup
data_dir = 'HR_Data'
traffic = 'TC-VM-01'
perf_file = os.path.join(data_dir, 'workload_metrics.csv')

In [4]:
# 2. Load performance metrics
if os.path.exists(perf_file):
    df_perf = pd.read_csv(perf_file)
    print("Loaded performance metrics:")
    display(df_perf.head())
else:
    print(f"Performance file not found at {perf_file}. Please verify path.")
    df_perf = pd.DataFrame()

# Helper to clean column names
def clean_col(col):
    col = col.replace('System - ', '')
    col = col.replace('System Core C-States - ', 'core_c_')
    col = col.replace('System Pack C-States - ', 'pack_c_')
    col = col.replace(' ', '_').replace('-', '').lower()
    return col

Loaded performance metrics:


Unnamed: 0,TestCaseID,Interference,Date,Throughput,Avg_Latency,P50_Latency,P75_Latency,P90_Latency,P99_Latency,Max_Latency
0,TC-VM-01,baseline,2025-04-29,989.65,4850.0,3650.0,6120.0,9430.0,19200.0,72450.0
1,TC-VM-01,cpu12.5,2025-04-29,995.24,7360.0,5510.0,9610.0,15190.0,28740.0,85180.0
2,TC-VM-01,cpu25,2025-04-29,995.16,12960.0,9250.0,17690.0,28210.0,51260.0,125950.0
3,TC-VM-01,cpu50,2025-04-29,985.14,466440.0,328190.0,633850.0,1050000.0,2080000.0,4530000.0
4,TC-VM-01,cpu100,2025-04-29,685.76,10670000.0,10530000.0,14530000.0,17380000.0,21170000.0,24230000.0


In [12]:
# 3. Load and profile PCM files
pcm_pattern = os.path.join(data_dir, f'pcm_system_metrics_{traffic}_*.csv')
pcm_files = glob.glob(pcm_pattern)
if pcm_files:
    feature_list = []
    for filepath in pcm_files:
        fname = os.path.basename(filepath)
        interference = fname.replace(f'pcm_system_metrics_{traffic}_', '').replace('.csv', '')
        
        df_pcm = pd.read_csv(filepath)
        numeric_cols = df_pcm.select_dtypes(include=[np.number]).columns

        # Clean column names
        numeric_cols = [col for col in numeric_cols if "System Pack C-States" not in col]
        
        # Aggregate: mean & 95th percentile
        feat_mean = df_pcm[numeric_cols].mean().rename(
            index={col: 'mean_' + clean_col(col) for col in numeric_cols}
        )
        feat_p95 = df_pcm[numeric_cols].quantile(0.95).rename(
            index={col: 'p95_' + clean_col(col) for col in numeric_cols}
        )
        
        # Build feature vector
        features = pd.concat([feat_mean, feat_p95]).to_frame().T
        features['Interference'] = interference
        feature_list.append(features)
    
    df_features = pd.concat(feature_list, ignore_index=True)
    print(df_features.shape)
    print("Extracted feature summaries:")
    display(df_features)
else:
    print(f"⚠️ No PCM files found with pattern: {pcm_pattern}")
    df_features = pd.DataFrame()

# Show columns
print("Columns in df_features:")
print(df_features.columns.tolist())

(12, 21)
Extracted feature summaries:


Unnamed: 0,mean_ipc,mean_l3miss,mean_l2miss,mean_read,mean_write,mean_physipc,mean_physipc%,mean_core_c_c0res%,mean_core_c_c1res%,mean_core_c_c6res%,...,p95_l3miss,p95_l2miss,p95_read,p95_write,p95_physipc,p95_physipc%,p95_core_c_c0res%,p95_core_c_c1res%,p95_core_c_c6res%,Interference
0,0.563077,3.065385,7.523077,36.531538,22.444615,1.124615,22.501538,5.66,93.080769,0.0,...,3.346,8.042,38.864,23.926,1.678,33.518,5.988,94.064,0.0,l3_1
1,0.403846,2.436154,5.569231,46.056923,19.949231,0.809231,16.196923,9.075385,90.924615,0.0,...,2.564,6.05,46.262,20.432,0.85,16.954,9.124,91.08,0.0,membw_8
2,0.196923,1.956923,4.430769,33.036154,25.839231,0.395385,7.894615,11.219231,88.780769,0.0,...,2.292,5.438,33.618,26.658,0.492,9.83,11.294,89.022,0.0,l3_4
3,0.782308,4.233846,18.666923,26.198462,4.885385,1.566923,31.343846,8.035385,90.130769,0.0,...,4.726,20.626,28.512,5.344,1.74,34.784,8.742,91.568,0.0,cpu12.5
4,0.532727,2.380909,6.992727,47.489091,18.85,1.063636,21.273636,5.949091,93.803636,0.0,...,2.445,7.225,48.08,19.565,1.11,22.18,6.055,94.005,0.0,membw_2
5,0.688,2.500667,9.185333,45.339333,16.109333,1.374667,27.502667,4.986,92.150667,0.0,...,2.859,10.49,47.144,17.09,2.22,44.44,5.559,94.512,0.0,membw_1
6,1.06,5.318462,21.188462,12.633846,3.451538,2.12,42.388462,20.768462,79.164615,0.0,...,5.654,22.654,13.098,3.524,2.154,43.122,20.954,79.382,0.0,cpu100
7,0.664667,2.422667,10.906,26.469333,4.767333,1.332667,26.611333,3.942,92.227333,0.0,...,2.982,13.026,31.395,5.996,1.463,29.216,4.73,95.63,0.0,baseline
8,1.014615,2.463077,9.247692,16.290769,4.386923,2.026154,40.526923,7.38,92.074615,0.0,...,2.758,10.34,17.692,4.78,2.168,43.364,7.68,92.354,0.0,cpu50
9,0.334,1.878,5.114667,36.386667,24.922667,0.669333,13.398667,5.914667,93.938667,0.0,...,1.972,5.3,36.771,26.164,0.758,15.098,6.08,94.091,0.0,l3_2


Columns in df_features:
['mean_ipc', 'mean_l3miss', 'mean_l2miss', 'mean_read', 'mean_write', 'mean_physipc', 'mean_physipc%', 'mean_core_c_c0res%', 'mean_core_c_c1res%', 'mean_core_c_c6res%', 'p95_ipc', 'p95_l3miss', 'p95_l2miss', 'p95_read', 'p95_write', 'p95_physipc', 'p95_physipc%', 'p95_core_c_c0res%', 'p95_core_c_c1res%', 'p95_core_c_c6res%', 'Interference']


In [6]:
# 4. Label joining
if not df_perf.empty and not df_features.empty:
    df_ml = df_features.merge(df_perf, on='Interference', how='inner')
    print("Combined ML-ready dataset:")
    display(df_ml.head())
else:
    df_ml = pd.DataFrame()
    print("⚠️ Cannot create ML dataset: missing performance or feature data.")

# Show columns of the final dataset
print("Final dataset columns:")
print(df_ml.columns.tolist())

Combined ML-ready dataset:


Unnamed: 0,mean_ipc,mean_l3miss,mean_l2miss,mean_read,mean_write,mean_physipc,mean_physipc%,mean_core_c_c0res%,mean_core_c_c1res%,mean_core_c_c6res%,...,Interference,TestCaseID,Date,Throughput,Avg_Latency,P50_Latency,P75_Latency,P90_Latency,P99_Latency,Max_Latency
0,0.563077,3.065385,7.523077,36.531538,22.444615,1.124615,22.501538,5.66,93.080769,0.0,...,l3_1,TC-VM-01,2025-04-29,968.02,1340000.0,989700.0,1940000.0,3070000.0,4970000.0,7620000.0
1,0.403846,2.436154,5.569231,46.056923,19.949231,0.809231,16.196923,9.075385,90.924615,0.0,...,membw_8,TC-VM-01,2025-04-29,207.47,27450000.0,27820000.0,37030000.0,43380000.0,47840000.0,50070000.0
2,0.196923,1.956923,4.430769,33.036154,25.839231,0.395385,7.894615,11.219231,88.780769,0.0,...,l3_4,TC-VM-01,2025-04-29,186.05,28820000.0,28820000.0,40470000.0,44790000.0,48560000.0,
3,0.782308,4.233846,18.666923,26.198462,4.885385,1.566923,31.343846,8.035385,90.130769,0.0,...,cpu12.5,TC-VM-01,2025-04-29,995.24,7360.0,5510.0,9610.0,15190.0,28740.0,85180.0
4,0.532727,2.380909,6.992727,47.489091,18.85,1.063636,21.273636,5.949091,93.803636,0.0,...,membw_2,TC-VM-01,2025-04-29,771.89,8070000.0,7860000.0,10930000.0,13480000.0,16970000.0,19270000.0


Final dataset columns:
['mean_ipc', 'mean_l3miss', 'mean_l2miss', 'mean_read', 'mean_write', 'mean_physipc', 'mean_physipc%', 'mean_core_c_c0res%', 'mean_core_c_c1res%', 'mean_core_c_c6res%', 'p95_ipc', 'p95_l3miss', 'p95_l2miss', 'p95_read', 'p95_write', 'p95_physipc', 'p95_physipc%', 'p95_core_c_c0res%', 'p95_core_c_c1res%', 'p95_core_c_c6res%', 'Interference', 'TestCaseID', 'Date', 'Throughput', 'Avg_Latency', 'P50_Latency', 'P75_Latency', 'P90_Latency', 'P99_Latency', 'Max_Latency']


#### Each row of df_ml now represents one complete experiment: its observed hardware‐counter signature and the matching tail-latency outcome. This ML-ready table is the basis for training our non‐linear latency predictor.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Step 5: Preprocessing & Pipeline Construction
# 1. Identify feature columns (mean_ and p95_ prefixes) and target
feature_cols = [col for col in df_ml.columns if col.startswith('mean_') or col.startswith('p95_')]
X = df_ml[feature_cols]
y = df_ml['P99_Latency']

# 2. Build the preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

# 3. Fit and transform the features
X_preprocessed = preprocessor.fit_transform(X)

# Display shapes for verification
print(f"Original feature matrix shape: {X.shape}")
print(f"Preprocessed feature matrix shape: {X_preprocessed.shape}")

Original feature matrix shape: (12, 20)
Preprocessed feature matrix shape: (12, 230)


In [8]:
import numpy as np
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# === Step 6: Model Training & Evaluation ===

# 1. Define features and target
feature_cols = [col for col in df_ml.columns if col.startswith('mean_') or col.startswith('p95_')]
X = df_ml[feature_cols].values
y = df_ml['P99_Latency'].values  # adjust if your column name differs

# 2. Build pipelines
linreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression())
])

rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # optional for RF, but keeps interface consistent
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=3))
])


In [9]:
# 3. Evaluate with Leave-One-Out CV
loo = LeaveOneOut()

# Linear Regression evaluation
neg_mae_lin = cross_val_score(linreg_pipeline, X, y, cv=loo, scoring='neg_mean_absolute_error')
neg_mse_lin = cross_val_score(linreg_pipeline, X, y, cv=loo, scoring='neg_mean_squared_error')
print("Linear Regression + Polynomial Features")
print(f"  Mean MAE: {(-neg_mae_lin).mean():.2f} ms")
print(f"  Mean MSE: {(-neg_mse_lin).mean():.2f} (ms^2)")

# Random Forest evaluation
neg_mae_rf = cross_val_score(rf_pipeline, X, y, cv=loo, scoring='neg_mean_absolute_error')
neg_mse_rf = cross_val_score(rf_pipeline, X, y, cv=loo, scoring='neg_mean_squared_error')
print("\nRandom Forest Regressor")
print(f"  Mean MAE: {(-neg_mae_rf).mean():.2f} ms")
print(f"  Mean MSE: {(-neg_mse_rf).mean():.2f} (ms^2)")

neg_mae_knn = cross_val_score(knn_pipeline, X, y, cv=loo, scoring='neg_mean_absolute_error')
neg_mse_knn = cross_val_score(knn_pipeline, X, y, cv=loo, scoring='neg_mean_squared_error')

# 4. Fit final models on all data
linreg_pipeline.fit(X, y)
rf_pipeline.fit(X, y)
knn_pipeline.fit(X, y)
y_pred_knn = knn_pipeline.predict(X)


Linear Regression + Polynomial Features
  Mean MAE: 20533272.90 ms
  Mean MSE: 914363697464197.38 (ms^2)

Random Forest Regressor
  Mean MAE: 11504973.96 ms
  Mean MSE: 165679774021210.62 (ms^2)


In [10]:
print("\nTraining-set Comparison (first 5 samples)")
print("Actual P99:", np.round(y[:5], 2))
print("kNN Pred   :", np.round(y_pred_knn[:5], 2))


Training-set Comparison (first 5 samples)
Actual P99: [4.970e+06 4.784e+07 4.856e+07 2.874e+04 1.697e+07]
kNN Pred   : [ 7347403.33 35760000.   40960000.     709313.33 35760000.  ]


In [11]:
# 5. Quick training-set check (first 5 samples)
y_pred_lin = linreg_pipeline.predict(X)
y_pred_rf = rf_pipeline.predict(X)
print("\nTraining-set Comparison (first 5 samples)")
print("Actual P99:", np.round(y[:5], 2))
print("Linear Pred:", np.round(y_pred_lin[:5], 2))
print("RF Pred     :", np.round(y_pred_rf[:5], 2))


Training-set Comparison (first 5 samples)
Actual P99: [4.970e+06 4.784e+07 4.856e+07 2.874e+04 1.697e+07]
Linear Pred: [4.970e+06 4.784e+07 4.856e+07 2.874e+04 1.697e+07]
RF Pred     : [ 8533673.3 41245787.4 43955800.   2307281.5 19568531.6]
