In [1]:
# Let's start by loading some libraries.
# The convention is to load everything here at the start of the notebook
# so it is easier to read the notebook.
# For this notebook, however, we will load the packages as we need them, 
# so you can learn when we need them.

import pandas as pd
import altair as alt 
import numpy as np
import sklearn
import sklearn.neighbors 
# Disabled option, kept for reference: presumably an alternative way of
# handling large datasets in Altair -- confirm before re-enabling.
#alt.data_transformers.enable("vegafusion")
# Lift Altair's row limit for chart data. The prediction grid built in
# plot_grid_rl (defined below) has 200 x 200 = 40,000 rows.
alt.data_transformers.disable_max_rows()

# Read the penguins CSV directly from GitHub (this needs network access).
penguins_raw = pd.read_csv("https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins.csv")



## DO NOT WORRY ABOUT THIS FUNCTION HERE
## I CREATED IT TO PLOT THE CLASSIFICATION GRID.
## JUST IGNORE IT.
def _padded_bounds(low: float, high: float, pad: float = 0.05) -> tuple[float, float]:
  """Return ``(low, high)`` pushed outward by ``pad`` times each bound's magnitude."""
  # Padding is relative to |value| on BOTH sides so that the interval always
  # grows. Multiplying a negative maximum by 1.05 would instead move the upper
  # bound below the data and clip points.
  return low - pad * np.abs(low), high + pad * np.abs(high)


def plot_grid_rl(my_knn: sklearn.neighbors.KNeighborsClassifier, data_df: pd.DataFrame, covariates: list[str], response: str) -> alt.LayerChart:
  """
  Plot the data together with the classifier's predictions over a grid.

  The function draws a scatter plot of ``data_df`` on the two covariates,
  coloured by ``response``, and overlays a 200 x 200 grid of faint points
  coloured by the class that ``my_knn`` predicts at each grid location.

  Parameters:
  -----------
  my_knn : sklearn.neighbors.KNeighborsClassifier
      A fitted classifier. Only its ``predict`` method is used; it is called
      with a DataFrame whose columns are ``covariates[0]`` and
      ``covariates[1]``, in that order (presumably the same columns the model
      was fitted on -- verify against the caller).
  data_df : pd.DataFrame
      The dataset containing the original data points. It must contain the
      columns named in ``covariates`` and the ``response`` column.
  covariates : list[str]
      Names of the two columns used as the x and y axes. Only the first two
      entries are used for the grid and the axes.
  response : str
      The name of the column containing the response variable (target).
      It is plotted as a nominal (categorical) colour.

  Returns:
  --------
  alt.LayerChart
      An 800 x 600 layered chart: the scatter plot of the original data plus
      the grid of predictions.

  Notes:
  ------
  Each axis runs from ``min - 0.05*|min|`` to ``max + 0.05*|max|`` of the
  corresponding covariate, i.e. the padding is 5% of each bound's magnitude,
  not 5% of the data range.

  Dependencies:
  -------------
  - numpy (imported as np)
  - pandas (imported as pd)
  - altair (imported as alt)
  """
  x_name, y_name = covariates[0], covariates[1]

  # Row 0 holds the per-column minimum, row 1 the per-column maximum.
  min_max_info = data_df[covariates].agg(['min', 'max'])

  # Compute the padded limits once; the grid and both chart layers share them
  # so that the overlay lines up exactly with the scatter plot.
  x_domain = _padded_bounds(min_max_info.iloc[0, 0], min_max_info.iloc[1, 0])
  y_domain = _padded_bounds(min_max_info.iloc[0, 1], min_max_info.iloc[1, 1])

  # Build a 200 x 200 grid of points covering the padded limits.
  grid_x, grid_y = np.meshgrid(
      np.linspace(x_domain[0], x_domain[1], 200),
      np.linspace(y_domain[0], y_domain[1], 200),
  )
  grid_df = pd.DataFrame({
      x_name: grid_x.ravel(),
      y_name: grid_y.ravel(),
  })

  # Predict before adding the new column, so the model only sees the two
  # covariate columns.
  predicted_column = 'predicted_' + response
  grid_df[predicted_column] = my_knn.predict(grid_df)

  # Both layers use the same axis definitions (no forced zero, fixed domain).
  x_axis = alt.X(x_name).scale(zero=False, domain=x_domain)
  y_axis = alt.Y(y_name).scale(zero=False, domain=y_domain)

  # Scatter plot of the original observations.
  base_chart = alt.Chart(data_df).mark_point().encode(
      x=x_axis,
      y=y_axis,
      color=response + ':N'
  )

  # Nearly transparent filled points showing the predicted class per grid cell.
  prediction_layer = alt.Chart(grid_df).mark_point(
      opacity=0.05,
      size=40,
      filled=True
  ).encode(
      x=x_axis,
      y=y_axis,
      # Title the legend with the response name so both layers share a legend.
      color=alt.Color(predicted_column + ':N').title(response)
  )

  # Combine the base chart and prediction layer
  return (base_chart + prediction_layer).properties(width=800, height=600)