In [1]:
# Let's start by loading some libraries.
# The convention is to load everything here at the start of the notebook
# so it is easier to read the notebook.
# For this notebook, however, we will load the packages as we need them, 
# so you can learn when we need them.

import pandas as pd
import altair as alt 
import numpy as np
import sklearn
import sklearn.neighbors 
# Alternative Altair data transformer, left disabled:
#alt.data_transformers.enable("vegafusion")
# Remove Altair's cap on the number of rows a chart may embed
# (the prediction grid plotted further down has 200 x 200 points).
alt.data_transformers.disable_max_rows()

# Raw Palmer penguins data, read straight from GitHub.
PENGUINS_CSV_URL = "https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins.csv"
penguins_raw = pd.read_csv(PENGUINS_CSV_URL)



## DO NOT WORRY ABOUT THIS FUNCTION HERE.
## I CREATED IT TO PLOT THE CLASSIFICATION GRID.
## JUST IGNORE IT.
def _padded_bounds(low, high, pad_frac=0.05):
  """
  Widen the interval [low, high] by pad_frac of each bound's magnitude.

  The lower bound moves down by pad_frac*|low| and the upper bound moves up
  by pad_frac*|high|, so the interval grows on both sides whatever the sign
  of the bounds. A bound that is exactly 0 is left where it is.
  """
  return float(low - pad_frac*np.abs(low)), float(high + pad_frac*np.abs(high))


def plot_grid_rl(my_knn: sklearn.neighbors.KNeighborsClassifier, data_df: pd.DataFrame, covariates: list[str], response: str, grid_points: int = 200):
  """
  Create a visualization of KNN classification results on a grid.

  This function generates a scatter plot of data points and overlays
  it with a grid of predicted classifications using a KNN model.

  Parameters:
  -----------
  my_knn : KNN
      A trained K-Nearest Neighbors classifier object. The only thing this
      function asks of it is a `predict` method that accepts a DataFrame
      with the two covariate columns, so a fitted pipeline works as well.
  data_df : pd.DataFrame
      The dataset containing the original data points.
  covariates : list[str]
      A list of exactly two column names to be used as covariates (features)
      for the plot. The first goes on the x axis, the second on the y axis.
  response : str
      The name of the column containing the response variable (target).
  grid_points : int, default 200
      Number of grid points along each axis; the prediction layer therefore
      contains grid_points * grid_points points.

  Returns:
  --------
  alt.LayerChart
      An Altair chart object containing the scatter plot of original data
      and the grid of KNN predictions.

  Raises:
  -------
  ValueError
      If `covariates` does not contain exactly two column names.

  Dependencies:
  -------------
  - numpy (imported as np)
  - pandas (imported as pd)
  - altair (imported as alt)
  """

  if len(covariates) != 2:
    raise ValueError(f"covariates must contain exactly two column names, got {len(covariates)}")
  x_col, y_col = covariates

  # Calculate min and max values for the covariates
  min_max_info = data_df[[x_col, y_col]].agg(['min', 'max'])

  # Plotting window: the data range pushed outwards by 5% of each bound's magnitude.
  # Using |max| for the upper bound (rather than 1.05*max) keeps the window
  # growing outwards even when the maximum is negative.
  x_domain = _padded_bounds(min_max_info.loc['min', x_col], min_max_info.loc['max', x_col])
  y_domain = _padded_bounds(min_max_info.loc['min', y_col], min_max_info.loc['max', y_col])

  # Create a grid of points for prediction covering the whole plotting window
  grid_x, grid_y = np.meshgrid(np.linspace(*x_domain, grid_points),
                               np.linspace(*y_domain, grid_points))

  # Convert the grid to a DataFrame for easier processing
  grid_df = pd.DataFrame({
      x_col: grid_x.ravel(),
      y_col: grid_y.ravel()
  })

  # Use the KNN model to predict the response for each point in the grid
  # (the right-hand side is evaluated while grid_df still holds only the two covariates)
  grid_df['predicted_'+response] = my_knn.predict(grid_df)

  # Both layers share the same axes so that the grid and the data line up
  x_axis = alt.X(x_col).scale(zero=False, domain=x_domain)
  y_axis = alt.Y(y_col).scale(zero=False, domain=y_domain)

  # Create the base scatter plot of original data
  base_chart = alt.Chart(data_df).mark_point().encode(
      x=x_axis,
      y=y_axis,
      color=response+':N'
  )

  # Create the overlay of predicted classifications
  prediction_layer = alt.Chart(grid_df).mark_point(
      opacity=0.05,
      size=40,
      filled=True
  ).encode(
      x=x_axis,
      y=y_axis,
      color=alt.Color('predicted_'+response+':N').title(response)
  )

  # Combine the base chart and prediction layer
  return (base_chart + prediction_layer).properties(width=800, height=600)

In [None]:
# Cell [1] 
# We will work with the penguins' data set, let's take a look:
# (random_state fixes the random draw, so the same 10 rows show up every time the notebook is re-run)

penguins_raw.sample(10, random_state=42)

In [None]:
# Cell [2]
# Does this data contain NA?
# Keep only the rows in which at least one column is missing.

penguins_raw.loc[penguins_raw.isna().any(axis='columns')]

In [None]:
# Cell [3] 
# How many? 
# Proportion of rows without (False) and with (True) at least one missing value.

(
    penguins_raw
    .isna()
    .any(axis='columns')
    .value_counts(normalize=True)
)

In [5]:
# Cell [4]
# Only a tiny proportion of the rows have NAs, so we simply drop
# every row that has a missing value in any column.

penguins_clean = penguins_raw.dropna(axis='index', how='any')

In [None]:
# Cell [5]
# Let's take a look at our data:
# bill depth against body mass, one point per penguin, coloured by species.

(
    alt.Chart(penguins_clean)
    .mark_point()
    .encode(
        x=alt.X('body_mass_g').title("Body Mass (g)").scale(zero=False),
        # Units added to the title so both axes are labelled consistently
        y=alt.Y("bill_depth_mm").title("Bill Depth (mm)").scale(zero=False),
        color='species'
    )
)

In [7]:
# Cell [6]
# It seems that Body Mass and Bill Depth are able to identify the Gentoo penguin pretty well,
# but they are not useful to distinguish between Chinstrap and Adelie penguins. 
# Nonetheless, let's try to fit our KNN Classifier. 

# The first step is to create our classifier and define the so-called hyperparameters. 

from sklearn.neighbors import KNeighborsClassifier as KNN # this imports the classifier class under the short name KNN

my_knn = KNN(n_neighbors=5) # this line creates the classifier, with the hyperparameter n_neighbors set to 5

In [None]:
# Cell [7]
# Now we are ready to "train" the algorithm:
# the two covariates are the features, the species is the label.

my_knn.fit(
    X=penguins_clean.loc[:, ['body_mass_g', 'bill_depth_mm']],
    y=penguins_clean.loc[:, 'species'],
)

In [None]:
# Cell [8]
# Our classifier is now trained, so let's use it for some predictions.
# Here we predict the species of the same penguins it was trained on.

my_knn.predict(X=penguins_clean.loc[:, ['body_mass_g', 'bill_depth_mm']])

In [None]:
# Cell [9]
# We can just call the predict function.
# NOTE(review): this repeats the prediction already made in Cell [8] (same model, same data);
# one of the two cells looks redundant -- confirm which one is meant to stay.

my_knn.predict(penguins_clean[['body_mass_g', 'bill_depth_mm']])

In [None]:
# Cell [10]
# Let's take a look at the prediction regions
# of the classifier trained on the raw (unscaled) covariates:

plot_grid_rl(
    my_knn,
    penguins_clean,
    covariates=['body_mass_g', 'bill_depth_mm'],
    response='species',
)

In [None]:
# Cell [11]

# Take a close look at the plot above.
# It doesn't make much sense, does it?
# The culprit is the scale of the variables:
# the horizontal distance dominates, so the vertical distance is hardly relevant.

# We need to put the variables on a common scale.
# One way is to standardize them by hand with pandas operations:
# subtract the mean and divide by the (ddof=0) standard deviation.

body_mass = penguins_clean['body_mass_g']
bill_depth = penguins_clean['bill_depth_mm']

mean_body_mass, std_body_mass = body_mass.mean(), body_mass.std(ddof=0)
mean_bill_depth, std_bill_depth = bill_depth.mean(), bill_depth.std(ddof=0)

# assign() returns a new data frame, so penguins_clean itself is left untouched
penguins_clean_std = penguins_clean.assign(
    body_mass_g=body_mass.sub(mean_body_mass).div(std_body_mass),
    bill_depth_mm=bill_depth.sub(mean_bill_depth).div(std_bill_depth),
)

penguins_clean_std.loc[:, ['body_mass_g', 'bill_depth_mm']]

In [None]:
# Cell [12]

# We should not do this by hand.
# scikit-learn has tools that make this more convenient and less error-prone,
# and that help us avoid data leakage (more on this next week).

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(X=penguins_clean.loc[:, ['body_mass_g', 'bill_depth_mm']])

In [None]:
# Cell [13]
# One inconvenience, though: the output above is a numpy array, not a pandas data frame.
# (Numpy arrays are a different data structure.)

# We can change this behaviour with scikit-learn's set_config,
# asking transformers to return pandas output instead.
from sklearn import set_config
set_config(transform_output='pandas')

scaler.fit_transform(X=penguins_clean.loc[:, ['body_mass_g', 'bill_depth_mm']])

In [None]:
# Cell [14]
#
# There you go, much better!
# Let's fit our knn again, this time on the standardized data.
# (This refits the existing my_knn object, replacing its earlier fit on the raw data.)

my_knn.fit(
    X=penguins_clean_std.loc[:, ['body_mass_g', 'bill_depth_mm']],
    y=penguins_clean_std.loc[:, 'species'],
)

plot_grid_rl(
    my_knn,
    penguins_clean_std,
    covariates=['body_mass_g', 'bill_depth_mm'],
    response='species',
).properties(title="Isn't this better?")

In [57]:
# Cell [15]

# Note that here, we had to first scale our data, then we passed our scaled data into our model to fit.
# Then, to predict, we need to scale the new data and then pass the scaled new data to the model for prediction. 
# scikit-learn allows us to create a pipeline to make this more convenient. 

from sklearn.pipeline import Pipeline, make_pipeline

# Each step gets its own fresh estimator. Fitting a pipeline fits the very objects
# it was given, so handing it the `scaler` used in the cells above would silently
# refit that shared object every time the pipeline is fitted.
# Note: this classifier uses 15 neighbours, whereas `my_knn` above uses 5.
my_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn_model', KNN(n_neighbors=15))
])

In [None]:
# Cell [16]

# Scaling and model fitting now happen in a single call, on the raw data:
my_pipeline.fit(
    X=penguins_clean.loc[:, ['body_mass_g', 'bill_depth_mm']],
    y=penguins_clean.loc[:, 'species'],
)

# The pipeline can even stand in for the model when calling functions, look:
plot_grid_rl(
    my_pipeline,
    penguins_clean,
    covariates=['body_mass_g', 'bill_depth_mm'],
    response='species',
).properties(title="Isn't this better?")