In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore' )

dataset = pd.read_csv("P_dataset.csv")

In [2]:
dataset.drop('Time', axis=1, inplace=True)

In [3]:
dataset

Unnamed: 0,Wind speed,Generator speed,Blade angle,Wind direction,Ambient temperature,Active power
0,5.1826,11.3582,-0.7699,-340.9205,28.4515,212.3730
1,4.9545,11.1044,-0.7699,-345.9042,28.5000,198.8769
2,4.8302,10.6120,-0.7699,-339.1856,28.5209,173.8923
3,4.4468,10.0738,-0.7698,-332.9802,28.4043,148.1184
4,4.9901,10.9070,-0.7699,-332.9802,28.3688,188.0684
...,...,...,...,...,...,...
51226,3.5208,8.9996,-0.7969,98.4366,27.6533,72.0548
51227,2.8284,8.9764,-0.7969,98.4366,27.6997,34.9057
51228,2.8082,8.9818,-0.7969,98.4366,27.6811,39.1213
51229,2.9400,8.9237,-0.7969,92.6699,27.6045,32.1846


In [4]:
x = dataset.drop("Active power", axis = 1)
y = dataset["Active power"] 

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 2)

In [6]:
#Preprocessing – Scaling the features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)

x_test_scaled = scaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test_scaled)

In [7]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor().fit(x_train, y_train)
y_pred = knn_model.predict(x_test)

In [8]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(x_train, y_train)

In [9]:
y_pred = regressor.predict(x_test)

In [10]:
y_pred

array([483.53968, 168.28464, 615.78048, ..., 233.03844, 572.7385 ,
       179.01622])

In [11]:
predict_df = pd.DataFrame({"y_Test" : y_test, "y_Predicted" : y_pred})
predict_df.head(20)

Unnamed: 0,y_Test,y_Predicted
1924,504.5162,483.53968
27593,165.2616,168.28464
3155,595.7564,615.78048
2835,366.0311,366.1859
33102,3.8625,0.0
32974,639.5363,717.34242
7124,324.7482,304.64948
33685,503.4377,499.62556
30406,1304.28,1356.8134
36030,1299.145,1306.364


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'mae: {mae}')
print(f'mse: {mse}')
print(f'rmse: {rmse}')

mae: 16.886006052503173
mse: 854.9472387415253
rmse: 29.239480822024273


The R2 can be calculated directly with the score() method:

In [13]:
regressor.score(x_test, y_test)

0.9951812053145545

In [14]:
y.describe()

count    51231.000000
mean       524.559063
std        423.491368
min          0.000000
25%        179.596550
50%        392.251900
75%        834.088200
max       1379.842000
Name: Active power, dtype: float64

#### Parameter Tuning

In [15]:
error = []

# Calculating MAE error for K values between 1 and 39
for i in range(1, 40):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(x_train, y_train)
    pred_i = knn.predict(x_test)
    mae = mean_absolute_error(y_test, pred_i)
    error.append(mae)

In [16]:
import numpy as np 

print(min(error))               
print(np.array(error).argmin()) 

15.447241530069356
28


We started counting neighbors on 1, while arrays are 0-based, so the 28th index is 29 neighbors!

This means that we need 29 neighbors to be able to predict a point with the lowest MAE error. 
We can execute the model and metrics again with 29 neighbors to compare results

In [19]:
knn_reg12 = KNeighborsRegressor(n_neighbors=29)
knn_reg12.fit(x_train, y_train)
y_pred12 = knn_reg12.predict(x_test)
r2 = knn_reg12.score(x_test, y_test) 

mae12 = mean_absolute_error(y_test, y_pred12)
mse12 = mean_squared_error(y_test, y_pred12)
rmse12 = mean_squared_error(y_test, y_pred12, squared=False)
print(f'r2: {r2}, \nmae: {mae12} \nmse: {mse12} \nrmse: {rmse12}')

r2: 0.9957239520895609, 
mae: 15.447241530069356 
mse: 758.6534792192446 
rmse: 27.54366495619718
