```markdown
# Supervised ML: k-NN model
```

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


```markdown
## Loading the dataset
```

In [None]:

# Step 1: Load the data
# Read the dataset from 'cleaned_dataset.csv' (path is relative to the
# current working directory).
data = pd.read_csv('cleaned_dataset.csv')

# Quick sanity checks: first rows, column summary, and descriptive statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.describe())

```markdown
## Defining features and target variable
```

In [None]:
# Columns used as model inputs, and the column the model should predict.
feature_columns = [
    'PC1', 'PC2', 'PC3',
    'Elongation', 'Reduction_of_Area', 'Charpy_impact_toughness', 'Hardness',
]
target_column = 'Yield_strength'

X = data[feature_columns]  # feature matrix
y = data[target_column]    # regression target

```markdown
## Training and testing sets preparation
```

In [None]:
# Hold out 20% of the rows for testing (80/20 split); the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training set only, then apply that
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

```markdown
## Hyperparameter tuning
```

In [None]:

# Tune k (the number of neighbors) with an exhaustive grid search:
# every k in 1..30 is scored by 5-fold cross-validation on the training set.
# Scores are negated MSE, so the highest score means the lowest error.
# NOTE(review): X_train_scaled was standardized using the whole training set,
# so each validation fold has influenced the scaling; wrapping the scaler and
# the model in a Pipeline would remove that leakage - confirm whether it matters here.
param_grid = {'n_neighbors': np.arange(1, 31)}
knn = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Report the k with the best mean cross-validated score.
best_k = grid_search.best_params_['n_neighbors']
print(f"Optimal number of neighbors (k): {best_k}")


```markdown
## Running the model
```

In [None]:

# Fit the final k-NN regressor on the scaled training set using the tuned k.
# fit() returns the estimator itself, so construction and training are chained.
knn_best = KNeighborsRegressor(n_neighbors=best_k).fit(X_train_scaled, y_train)

# Predict the target for the held-out (scaled) test set.
y_pred = knn_best.predict(X_test_scaled)

```markdown
## Performance
```

In [None]:
# Test-set metrics.
# Use scikit-learn's mean_squared_error (imported at the top of the notebook)
# instead of a hand-rolled formula, so MSE and R-squared come from the same library.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of MSE: error in the units of the target
r_squared = r2_score(y_test, y_pred)

print(f"Test Set RMSE: {rmse:.2f}")
print(f"Test Set R-squared: {r_squared:.2f}")

# Plotting the cross-validated MSE vs. number of neighbors k.
# The grid search already scored every k with the same estimator, data,
# 5-fold CV and scoring, so reuse its stored results instead of refitting
# every model a second time. Taking k from the results also keeps the x-axis
# in sync with the grid that was actually searched.
cv_results = grid_search.cv_results_
k_range = [int(params['n_neighbors']) for params in cv_results['params']]
# Scores are negated MSE ('neg_mean_squared_error'); flip the sign to get MSE.
mse_values = (-cv_results['mean_test_score']).tolist()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, mse_values, marker='o')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Cross-Validated MSE')
ax.set_title('Cross-Validated MSE vs. Number of Neighbors')
ax.set_xticks(k_range)
ax.grid(True)
plt.show()
