In [None]:
# Let's start by loading some libraries.
# The convention is to load everything here at the start of the notebook
# so it is easier to read the notebook.
# For this notebook, however, we will load the packages as we need them, 
# so you can learn when we need them.

import pandas as pd
import altair as alt 
import numpy as np
import sklearn
import sklearn.neighbors 
# Switch Altair to the VegaFusion data transformer.
# NOTE(review): presumably enabled so that charts with many rows (e.g. the
# 200 x 200 prediction grid drawn by plot_grid_rl below) can be rendered -- confirm.
alt.data_transformers.enable("vegafusion")

# Raw Palmer penguins data, read directly from the palmerpenguins GitHub repository.
penguins_csv_url = "https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins.csv"
penguins_raw = pd.read_csv(penguins_csv_url)



## DO NOT WORRY ABOUT THIS FUNCTION HERE
## I CREATED IT TO PLOT THE CLASSIFICATION GRID 
## JUST IGNORE IT.
def plot_grid_rl(my_knn: sklearn.neighbors.KNeighborsClassifier, data_df: pd.DataFrame, covariates: list[str], response: str, grid_size: int = 200):
  """
  Create a visualization of KNN classification results on a grid.

  This function generates a scatter plot of data points and overlays
  it with a grid of predicted classifications using a KNN model.

  Parameters:
  -----------
  my_knn : KNN
      A trained K-Nearest Neighbors classifier object. Only its `predict`
      method is used; it is called with a DataFrame whose columns are the
      two covariates.
  data_df : pd.DataFrame
      The dataset containing the original data points.
  covariates : list[str]
      A list of two column names to be used as covariates (features) for the plot.
      The first one goes on the x axis, the second one on the y axis.
  response : str
      The name of the column containing the response variable (target).
  grid_size : int, default 200
      Number of grid points along each axis; `grid_size ** 2` points are
      predicted and drawn.

  Returns:
  --------
  alt.LayerChart
      An Altair chart object containing the scatter plot of original data
      and the grid of KNN predictions.

  Notes:
  ------
  The plotted range of each covariate is widened on both sides: each bound is
  pushed outward by 5% of its own absolute value. Using the absolute value on
  both ends keeps the range covering the data even when a covariate has
  negative values (a bound equal to 0 is left unchanged).

  Dependencies:
  -------------
  - numpy (imported as np)
  - pandas (imported as pd)
  - altair (imported as alt)
  """

  def _padded_domain(column: str) -> tuple[float, float]:
    """Return (low, high) for `column`, each pushed outward by 5% of its absolute value."""
    low = float(data_df[column].min())
    high = float(data_df[column].max())
    return low - 0.05 * abs(low), high + 0.05 * abs(high)

  x_col, y_col = covariates[0], covariates[1]

  # Compute the plotted ranges once; the grid and both chart layers share them,
  # so the axes of the two layers always line up.
  x_domain = _padded_domain(x_col)
  y_domain = _padded_domain(y_col)

  # Create a grid of points for prediction covering the padded ranges
  grid_x, grid_y = np.meshgrid(np.linspace(x_domain[0], x_domain[1], grid_size),
                               np.linspace(y_domain[0], y_domain[1], grid_size))

  # Convert the grid to a DataFrame for easier processing
  grid_df = pd.DataFrame({
      x_col: grid_x.ravel(),
      y_col: grid_y.ravel()
  })

  # Use the KNN model to predict the response for each point in the grid.
  # The prediction is computed before the new column is added, so the model
  # only ever sees the two covariate columns.
  predicted_col = 'predicted_' + response
  grid_df[predicted_col] = my_knn.predict(grid_df)

  # Create the base scatter plot of original data
  base_chart = alt.Chart(data_df).mark_point().encode(
      x=alt.X(x_col).scale(zero=False, domain=x_domain),
      y=alt.Y(y_col).scale(zero=False, domain=y_domain),
      color=response+':N'
  )

  # Create the overlay of predicted classifications (faint filled points);
  # the legend is titled with the response name so both layers share one legend title.
  prediction_layer = alt.Chart(grid_df).mark_point(
      opacity=0.05,
      size=40,
      filled=True
  ).encode(
      x=alt.X(x_col).scale(zero=False, domain=x_domain),
      y=alt.Y(y_col).scale(zero=False, domain=y_domain),
      color=alt.Color(predicted_col+':N').title(response)
  )

  # Combine the base chart and prediction layer
  return (base_chart + prediction_layer).properties(width=800, height=600)

In [None]:
# Cell [1]
# As last time, only a small fraction of the penguins rows have missing values,
# so we simply drop every row that contains at least one NA.

penguins_clean = penguins_raw.dropna(axis=0, how='any')

# Peek at 10 randomly chosen rows of the cleaned data.
penguins_clean.sample(n=10)

In [None]:
# Cell [2]
# The first thing we need to do now is to split between train and test data, and "hide" the test data.
# - test_size=0.3 keeps 30% of the rows aside as test data.
# - stratify=... splits in a stratified fashion, using the species as the class labels.
# - random_state fixes the shuffling, so the split (and every number computed from the
#   training data below) is the same each time the notebook is re-run.
from sklearn.model_selection import train_test_split

penguins_train, penguins_test = train_test_split(
    penguins_clean,
    test_size=0.3,
    stratify=penguins_clean['species'],
    random_state=42,
)

penguins_train.head()

In [None]:
# Cell [2]
# Let's take a look at our (training) data:
# bill depth against body mass, one point per penguin, coloured by species.

(
    alt.Chart(penguins_train)
    .mark_point()
    .encode(
        # zero=False lets each axis start near the data instead of at 0
        x=alt.X('body_mass_g').title("Body Mass (g)").scale(zero=False),
        y=alt.Y("bill_depth_mm").title("Bill Depth").scale(zero=False),
        color='species',
    )
    .properties(width=600, height=400)
)

In [None]:
# Cell [3]
# Once again, we need to standardize our data, otherwise KNN will not work properly.
# Here we only create the standardization step -- it is not fitted yet.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# (transformer, columns it is applied to)
standardize_step = (StandardScaler(), ['bill_depth_mm', 'body_mass_g'])
col_transf = make_column_transformer(standardize_step)

In [None]:
# Cell [4]
# Let's create our KNN classifier.
# We start by loading the library:
from sklearn.neighbors import KNeighborsClassifier as KNN

# Unfortunately, I have no idea how many neighbours to use here!
# What is the best value of neighbors (hyper-parameter)?
# For now we use 5, which is the value sklearn picks when the number of
# neighbours is not specified; it is written out here so the choice is visible.
my_knn = KNN(n_neighbors=5)

In [None]:
# Cell [5] 
# Next, let's create our pipeline: first the column transformer (standardization),
# then the KNN classifier created above (5 neighbours).
# The classifier step of this pipeline is addressed as 'kneighborsclassifier'
# when its parameters are tuned in the grid search further down.
from sklearn.pipeline import make_pipeline

knn_pipeline_5_neighbors = make_pipeline(col_transf, my_knn)

In [None]:
# Cell [6]
# Now, we need to assess our model so we can choose k.
# However, we cannot assess the classifier on the same observations we used to fit it -- that is "cheating".
# We don't want the model to "remember" stuff, we want it to "predict" stuff.
# So, we need cross-validation!
from sklearn.model_selection import cross_validate

# With cross-validation we can compare models. For example, 10-fold
# cross-validation of the 5-neighbour pipeline on the training data:

cross_validate(
    estimator=knn_pipeline_5_neighbors,
    X=penguins_train[['body_mass_g', 'bill_depth_mm']],
    y=penguins_train['species'],
    cv=10,
    scoring=['accuracy', 'precision_macro', 'recall_macro'],
)



## Note: don't worry about the `_macro` suffix on precision and recall -- it is needed
## because this is not a binary (two-class) classification: we have three species in this case.

In [None]:
# Cell [7]
# The output of cross_validate is easier to read and to aggregate
# once it is stored as a pandas.DataFrame:
penguins_cv_info_5_neighbors = pd.DataFrame(
    data=cross_validate(
        estimator=knn_pipeline_5_neighbors,
        X=penguins_train[['body_mass_g', 'bill_depth_mm']],
        y=penguins_train['species'],
        cv=10,
        scoring=['accuracy', 'precision_macro', 'recall_macro'],
    )
)
penguins_cv_info_5_neighbors

In [None]:
# Cell [8]
# As you can see we have one measure for each fold (10 rows, since we used cv=10). 
# We need to aggregate this information to assess the model: 
# `.describe()` computes summary statistics for every column, and
# `.loc['mean']` keeps only the row with the mean over the folds.

penguins_cv_info_5_neighbors.describe().loc['mean']

In [None]:
# Cell [9]

# Using cross-validation to assess a single model is pointless, though. We use cross-validation to compare
# models on "out-of-sample" performance, so we can select the best one. It is a strategy for choosing parameters.

# For example, we can replicate the code above to see how a KNN with 2 neighbours performs and compare it
# with the KNN above with 5 neighbours. Let's focus on accuracy.

knn_pipeline_2_neighbors = make_pipeline(col_transf, KNN(n_neighbors=2))

penguins_cv_info_2_neighbors = pd.DataFrame(
    data=cross_validate(
        estimator=knn_pipeline_2_neighbors,
        X=penguins_train[['body_mass_g', 'bill_depth_mm']],
        y=penguins_train['species'],
        cv=10,
        scoring=['accuracy'],
    )
)
# Mean over the 10 folds
penguins_cv_info_2_neighbors.describe().loc['mean']


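In the run shown here, the mean cross-validated accuracy went from 0.729 (5 neighbours) to 0.75 (2 neighbours). An improvement! (Your exact numbers depend on how the data was split into train and test sets.)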

What if we use other values for neighbors? This can become tiresome! Luckily, `sklearn` has some helper functions to assist us in this search. 

In [None]:
# Cell [10]
from sklearn.model_selection import GridSearchCV

# Candidate values for the number of neighbours: 1, 2, ..., 49.
# The key is '<pipeline step name>__<parameter name>'.
neighbors_grid = {'kneighborsclassifier__n_neighbors': range(1, 50)}

# 10-fold cross-validated accuracy for every candidate value
knn_grid_search = GridSearchCV(
    estimator=knn_pipeline_5_neighbors,
    param_grid=neighbors_grid,
    scoring='accuracy',
    cv=10,
)
knn_grid_search.fit(penguins_train[['body_mass_g', 'bill_depth_mm']], penguins_train['species'])

# Keep the per-candidate results as a DataFrame
cv_results = pd.DataFrame(knn_grid_search.cv_results_)

In [None]:
# Display the grid-search results table built in the previous cell
# (it includes the `mean_test_score` column plotted in the next cell).
cv_results

In [None]:
# Now, we can visualize the accuracy as a function of the number of neighbours.
# Only the two plotted columns are handed to Altair (the full results table also holds
# columns we do not plot, such as the dict-valued `params`), and both encodings are
# declared quantitative (:Q) explicitly instead of relying on dtype inference.
accuracy_columns = ['param_kneighborsclassifier__n_neighbors', 'mean_test_score']

alt.Chart(cv_results[accuracy_columns]).mark_line().encode(
    x=alt.X('param_kneighborsclassifier__n_neighbors:Q').title('Number of Neighbors'),
    y=alt.Y('mean_test_score:Q').title('Accuracy').scale(zero=False)
)
