In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
penguins_df = pd.read_csv("https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins-raw.csv")
penguins_df.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,


In [4]:
df = penguins_df[["Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"]]
df.dropna(inplace = True)
df.drop_duplicates(inplace = True)
df.head()

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0


In [5]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [6]:
x = df.drop(columns = ["Body Mass (g)"])
y = df["Body Mass (g)"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [7]:
model = KNeighborsRegressor(algorithm = "auto", n_neighbors = 1, weights = "uniform")
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.6648902762486238

In [8]:
from sklearn.metrics import mean_squared_error

preds = model.predict(x_test)
mean_squared_error(y_test, preds)

210969.20289855072

In [9]:
# HYPERPARAMETER TUNING

from sklearn.model_selection import GridSearchCV

parameters = {
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "n_neighbors": [1, 2, 3, 4, 5, 10, 15, 20]
}

optimizer = GridSearchCV(KNeighborsRegressor(), param_grid = parameters)
optimizer.fit(x_train, y_train)

In [12]:
optimizer.best_params_

{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}

In [11]:
new_model = KNeighborsRegressor(algorithm = "auto", n_neighbors = 10, weights = "distance")
new_model.fit(x_train, y_train)
new_model.score(x_test, y_test)

0.8133992336156797

In [74]:
df.shape

(342, 4)