Exercise 7: KNN_Regressor

In [36]:
import pandas as pd
import numpy as np
from si.io.csv_file import read_csv
from si.data.dataset import Dataset 
from si.model_selection.split import train_test_split
from si.models.knn_regressor import KNNRegressor
from si.metrics.rmse import rmse
from si.statistics.euclidean_distance import euclidean_distance 


In [37]:
# Load the CPU benchmark CSV into an si Dataset.
# NOTE(review): features=True / label=True presumably tell read_csv that the
# file carries a header row and a label column -- confirm in si.io.csv_file.
cpu_dataset = read_csv(
    '../datasets/cpu/cpu.csv',
    features=True,
    label=True,
)
# Sanity check: show the shape of the loaded feature matrix.
print("Loaded SI dataset:", cpu_dataset.X.shape)


Loaded SI dataset: (209, 6)


In [40]:
# Column names that must not be fed to the model as inputs.
# Names listed here that do not occur in cpu_dataset.features have no effect.
features_to_drop = ['Vendor', 'Model', 'PRP']

# Positions of the columns that survive the filter, in their original order.
keep_indices = [
    position
    for position, feature_name in enumerate(cpu_dataset.features)
    if feature_name not in features_to_drop
]

# Restrict the feature matrix to the surviving columns; the target is reused as-is.
X = cpu_dataset.X[:, keep_indices]
y = cpu_dataset.y


In [41]:
# Re-wrap the filtered arrays in a Dataset so the si utilities below can use them.
# The feature names are rebuilt from keep_indices so they stay aligned with
# the columns selected into X.
cpu_dataset_filtered = Dataset(
    X=X,
    y=y,
    features=[cpu_dataset.features[idx] for idx in keep_indices],
    label=cpu_dataset.label,
)


In [43]:
# Split into train and test partitions.
# NOTE(review): test_size=0.3 presumably holds out 30% of the samples and
# random_state=42 presumably fixes the shuffle -- confirm in
# si.model_selection.split.
train_ds, test_ds = train_test_split(
    cpu_dataset_filtered,
    test_size=0.3,
    random_state=42,
)



In [45]:
# Build the regressor with k=5 and the Euclidean distance function, then fit
# it on the training partition.
# NOTE(review): no feature scaling is applied in the cells shown here; a
# distance-based model is sensitive to feature magnitudes, so consider
# standardising the inputs before judging the score below.
knn_regressor = KNNRegressor(
    k=5,
    distance=euclidean_distance,
)
knn_regressor.fit(train_ds)

# NOTE(review): _score is an underscore-prefixed (private-by-convention)
# method; if the model exposes a public score(), prefer it -- verify against
# the si model base class before switching.
rmse_score = knn_regressor._score(test_ds)

# Emit the summary in a single call; sep="\n" yields one line per entry.
print(
    "\n--- KNNRegressor Test Results ---",
    f"k-Neighbors Used: {knn_regressor.k}",
    "Evaluation Metric: RMSE",
    f"RMSE on Test Set: {rmse_score:.4f}",
    sep="\n",
)



--- KNNRegressor Test Results ---
k-Neighbors Used: 5
Evaluation Metric: RMSE
RMSE on Test Set: 151.4152


In [46]:
# Model outputs for the held-out partition.
predictions = knn_regressor.predict(test_ds)

# Pair each true target with its prediction for a side-by-side look.
comparison_df = pd.DataFrame(
    {"Real Value (PRP)": test_ds.y, "Predicted PRP": predictions}
)

print("\nFirst 10 Predictions vs Actual:")
# A bare last expression lets the notebook render the table richly.
comparison_df.head(10)



First 10 Predictions vs Actual:


Unnamed: 0,Real Value (PRP),Predicted PRP
0,274,91.8
1,30,31.4
2,22,34.6
3,915,391.4
4,16,18.4
5,326,261.6
6,72,39.2
7,6,20.8
8,1144,481.4
9,208,195.0
