In [None]:
# @title 🚀 Setup (Run this cell first!)
# Purpose: when running in Google Colab, download the course data files into
# ./data so the rest of the notebook can read them. Locally, nothing is fetched.
import os
import sys
import urllib.request

# 1. Detect if we are in Google Colab
if 'google.colab' in sys.modules:
    print("Running in Google Colab. Setting up environment...")

    # 2. Define the source of your data files
    BASE_URL = "https://raw.githubusercontent.com/fhfarnoud/intro2ml/main/2026S/data/"

    # 3. List the files you need
    files_to_download = ['NBA_player_data.csv']

    # 4. Download the files
    os.makedirs('data', exist_ok=True)

    for filename in files_to_download:
        target_path = os.path.join('data', filename)

        # A zero-byte file is what a failed download leaves behind, so treat it
        # as missing instead of skipping it forever.
        if os.path.exists(target_path) and os.path.getsize(target_path) > 0:
            continue

        url = f"{BASE_URL}{filename}"
        print(f"Downloading {filename}...")

        # Download under a temporary name and rename only on success, so a
        # failed or interrupted download never looks like a complete file.
        # urlretrieve raises on HTTP/network errors, which stops the cell
        # before the "Setup complete" message is printed.
        partial_path = target_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, target_path)

    print("✅ Setup complete! Data files are ready.")
else:
    print("Running locally. Assuming data is already present.")



**🤖 AI Lab Partner Policy: STRICTLY Opt-In Code Generation**

In this course, we treat AI tools (like ChatGPT, Gemini, Copilot) as **Lab Partners**, not solution generators. You must use the following prompt to ensure the AI acts responsibly.

1. **Copy the text inside the block below.**
2. **Open your AI Assistant (Gemini, ChatGPT, etc.).**
3. **Paste the text to set the rules for the session.**

> "I am a student in an Intro to Machine Learning course. Please act as my **ML Lab Partner**.
> 
> **Your Rules:**
> 
> 1. **Code Generation is STRICTLY Opt-In:** You **MUST NOT** generate any runnable Python code unless my message starts with one of the specific prefixes below (`code:` or `output:`).
>    * *Default Behavior:* If I ask 'How do I...?' or 'Help me with...', explain the strategy in English, provide pseudocode, or use illustrative examples. Do not generate runnable solution code.
> 
> 2. **The 'code:' Trigger (Logic & Calculation):** 
>    * When generating code, prioritize simplicity and human readability. Avoid complex syntax.
>    * **Constraint:** When I use this trigger, provide **only one single line of code**. Do not write full blocks.
> 
> 3. **The 'output:' Trigger (Formatting & Printing):**
>    * Use this ONLY when I request code to print results, format tables, or create plots.
>    * **Exception:** For this trigger only, you **MAY** provide full multi-line code blocks to handle the verbose syntax of formatting or plotting.
> 
> 4. **Wait for Me:** After providing the code, stop immediately. Wait for me to run it and ask for the next step.
> 
> 5. **Explain Briefly:** Add a short comment explaining what the code does.
> 
> 6. **Catch Logic Errors:** If I ask for a step that is methodologically wrong (like testing on training data), stop me and explain the error before proceeding."


# Lecture 3: kNN Implementation & Real Datasets

**ECE 2410 - Introduction to Machine Learning**  
**Spring 2026**

---

## Building on Lecture 2

In L02, you learned:
- Euclidean distance using `np.linalg.norm()`
- 1-Nearest Neighbor on a small toy dataset
- Using for loops to classify test points

**Today we'll extend this to:**
1. Real datasets with hundreds/thousands of samples
2. Proper train/test splitting
3. Feature normalization
4. k-NN (majority vote among k neighbors)
5. MNIST digit classification



In [None]:
import numpy as np
import matplotlib.pyplot as plt

# NEW: pandas is a library for loading and manipulating tabular data (like spreadsheets)
import pandas as pd

# NEW: sklearn has built-in datasets and ML utilities
from sklearn.datasets import fetch_openml


# Seed NumPy's global random number generator so that random results in this
# notebook are repeatable from a fresh kernel.
np.random.seed(42)

---
## Python Concepts You'll Need Today

Before diving into the main content, let's quickly review some Python/NumPy concepts we'll use.

In [None]:
# --- 1. Random Seed for Reproducibility ---
# "Random" numbers on a computer are pseudo-random: the whole sequence is
# determined by a starting value called the seed. Fixing the seed (e.g., 42)
# makes every shuffle and split come out the same on each run, which is what
# you want while debugging.
#
# Note: re-running a cell continues the sequence, so results can change
# unless the seed is reset first.
# IMPORTANT: use "Restart Kernel and Run All" to see what we will see when grading!

np.random.seed(0)
first_draw = np.random.randint(0, 100, size=5)
second_draw = np.random.randint(0, 100, size=5)  # continues the sequence
print("With seed 0:", first_draw)
print("Again:      ", second_draw)  # Different! Seed not reset

np.random.seed(0)  # Back to the start of the same sequence
replayed_draw = np.random.randint(0, 100, size=5)
print("Reset seed: ", replayed_draw)  # Same as first output!

In [None]:
# --- 2. Functions with Multiple Return Values ---
# Returning several comma-separated values packs them into one tuple.
# The caller can unpack that tuple straight into separate variables.

def get_stats(arr):
    """Return the minimum, maximum, and mean of ``arr`` as a 3-tuple."""
    lowest = np.min(arr)
    highest = np.max(arr)
    average = np.mean(arr)
    return lowest, highest, average

data = np.array([1, 5, 3, 9, 2])

# One assignment, three targets: the tuple is unpacked left to right.
(min_val, max_val, mean_val) = get_stats(data)
print(f"Min: {min_val}, Max: {max_val}, Mean: {mean_val}")

In [None]:
# --- 3. The axis Parameter ---
# On a 2D array, axis=0 collapses the rows (one result per column, "down
# columns"); axis=1 collapses the columns (one result per row, "across rows").

X = np.array([[1, 2, 3],
              [4, 5, 6]])

total = X.sum()               # 21 (everything)
column_sums = X.sum(axis=0)   # [5, 7, 9] (down columns)
row_sums = X.sum(axis=1)      # [6, 15] (across rows)

print("X =")
print(X)
print()
print("Sum all:", total)
print("Sum axis=0:", column_sums)
print("Sum axis=1:", row_sums)

In [None]:
# --- 4. Reshape ---
# Reshaping switches between a flat 1D vector and a 2D grid without changing
# the values or their order (crucial for images!)

# Twelve consecutive integers as a 1D vector
vec = np.arange(12)
print("1D vector:", vec)

# Same twelve values laid out as 3 rows by 4 columns
matrix = vec.reshape((3, 4))
print("\nReshaped to 3x4:")
print(matrix)

# Flatten again; -1 asks NumPy to work out the length itself
flat = matrix.reshape(-1)
print("\nFlattened back:", flat)

In [None]:
# --- 5. Views vs Copies (Important!) ---
# Basic slicing creates a VIEW (shares memory with original)
# Fancy indexing creates a COPY (independent)
# Each experiment below writes 999 into position 0 of the derived array and
# then prints `original` to show whether the write reached it.

original = np.array([1, 2, 3, 4, 5])

# VIEW: basic slicing
view = original[:3]
view[0] = 999  # writes through to `original` because the memory is shared
print("After modifying view:", original)  # Original is also changed!

# COPY: fancy indexing (using a list or array of indices)
# Rebuild the array first so this experiment starts from unmodified data.
original = np.array([1, 2, 3, 4, 5])
indices = [0, 1, 2]
copy = original[indices]
copy[0] = 999  # only the independent copy changes
print("After modifying copy:", original)  # Original unchanged!

# Tip: Use .copy() if you want to be explicit
# A slice followed by .copy() owns its own data, so later writes to
# `safe_copy` cannot affect `original`.
safe_copy = original[:3].copy()

---
## Part 1: NBA Player Classification (Full Dataset)

In L02, we used just 4 training players. Now let's use the **full NBA dataset**!

**Task**: Classify NBA players as Guards vs. Forwards/Centers based on height and weight.

### 1.1 Load the NBA Dataset

**NEW: `pd.read_csv()`** - Loads data from a CSV file into a DataFrame (like a spreadsheet).

In [None]:
# Read the NBA player CSV from the data/ folder into a DataFrame
nba_df = pd.read_csv('data/NBA_player_data.csv')
print(f"Dataset has {nba_df.shape[0]} rows")
nba_df.head(5)  # Preview the first 5 rows

In [None]:
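# GOOGLE COLAB SETUP (manual fallback)
# Only needed if the Setup cell at the top of the notebook could not download
# the data. Uncomment ONE option and run it BEFORE the `pd.read_csv` cell above.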
# OPTION 1: Mount Google Drive
# ---------------------------
# from google.colab import drive
# drive.mount('/content/drive')
# %cd "/content/drive/MyDrive/Your/Notebook/Path/Here"
# 
# OPTION 2: Upload Data File
# ---------------------------
# If you prefer not to mount your drive, you can upload the data file manually.
# The code below uploads the file and moves it into the `data/` folder.
# 
# from google.colab import files
# import os
# uploaded = files.upload()
# if not os.path.exists('data'):
#     os.makedirs('data')
# !mv NBA_player_data.csv data/
# 
# Check if data exists
# Use !ls data to check if you can see 'NBA_player_data.csv'

### 1.2 Data Preprocessing

Real data often needs cleaning. Here we:
1. Convert height from "6-8" format to inches
2. Remove players with missing data
3. Simplify to 2 classes: Guard vs Forward/Center

In [None]:
# Function to convert height string "6-8" to total inches
def height_to_inches(h):
    """Convert a height string such as "6-8" (feet-inches) to total inches.

    Exercise scaffold: replace the `...` placeholder with your expression and
    delete the `raise NotImplementedError()` line. Until then the function
    returns the placeholder itself.

    Parameters
    ----------
    h : str
        Height written as "<feet>-<inches>", e.g. "6-8".

    Returns
    -------
    The total height in inches, or 0 when `h` cannot be parsed (it is not a
    string, a part is not an integer, or a part is missing). Rows with 0 are
    treated as invalid by the cleaning step that follows.
    """
    try:
        parts = [int(p) for p in h.split('-')]  # Split "6-8" into [6, 8]
        # YOUR CODE HERE
        ## Calculate total inches from feet and inches
        return ...  # YOUR CODE HERE
        raise NotImplementedError()
    except (AttributeError, ValueError, IndexError):
        # Only malformed input maps to 0:
        #   AttributeError - h has no .split (e.g. None or a float)
        #   ValueError     - a part is not an integer (e.g. "six-eight")
        #   IndexError     - a part is missing (e.g. "6")
        # Any other exception (a typo in your code, NotImplementedError, ...)
        # is allowed to surface instead of silently turning every height into 0.
        return 0

# Quick check of the function above: once it is implemented, '6-8' should
# print as 80 inches.
print(f"Example: '6-8' = {height_to_inches('6-8')} inches")

In [None]:
# Clean the data using a simple loop
# Steps: drop rows with missing height/weight, convert height to inches,
# drop unparseable heights, then label each player Guard / Forward/Center / Other.
#
# .copy() after each filter makes the result an independent DataFrame, so
# adding a column to it afterwards does not trigger pandas'
# SettingWithCopyWarning.
nba_df = nba_df.dropna(subset=['weight', 'height']).copy()  # Remove missing values

# Convert heights using a loop (clear and simple!)
height_inches = []
for h in nba_df['height']:
    height_inches.append(height_to_inches(h))
nba_df['height_inches'] = height_inches

# Remove invalid heights (height_to_inches returns 0 when it cannot parse)
nba_df = nba_df[nba_df['height_inches'] > 0].copy()

# Create binary classes using a loop
classes = []
for pos in nba_df['position']:
    if pos == 'G':
        classes.append('Guard')
    elif pos in ['F', 'C', 'F-C', 'C-F']:
        classes.append('Forward/Center')
    else:
        # Any other position code is marked 'Other' and filtered out below
        classes.append('Other')
nba_df['class'] = classes

# YOUR CODE HERE
## # Filter out 'Other' rows - keep only Guard and Forward/Center
nba_df = nba_df[...]  # YOUR CODE HERE
raise NotImplementedError()

print(f"Final dataset: {len(nba_df)} players")

In [None]:
# Extract features (X) and labels (y) as numpy arrays
# X has one row per player and two columns: [height in inches, weight].
X = np.column_stack([nba_df['height_inches'].values, nba_df['weight'].values])
# The boolean comparison becomes 1 for Forward/Center and 0 otherwise.
y = (nba_df['class'] == 'Forward/Center').astype(int).values  # 0=Guard, 1=Fwd/Ctr

print(f"Features X: {X.shape}  (samples × features)")
print(f"Labels y: {y.shape}")

### 1.3 Train/Test Split

**NEW: `np.random.permutation(N)`** - Returns the integers 0, 1, ..., N-1 in random order (a shuffled array of row indices)

In [None]:
def train_test_split(X, y, test_size=0.2, random_state=42):
    """Split data into training and test sets.

    Exercise scaffold: the `...` placeholders below must be filled in. As
    written, the function returns four placeholder values and the final
    `raise NotImplementedError()` is never reached.

    Parameters
    ----------
    X : features to split (the call site in this notebook passes a NumPy array).
    y : labels; `len(y)` determines the number of samples N.
    test_size : fraction of the N samples assigned to the test set.
        The test count is `int(N * test_size)`, i.e. truncated.
    random_state : seed handed to `np.random.seed`. Note that this reseeds
        NumPy's GLOBAL generator, so it also affects random calls made after
        this function returns.

    Returns
    -------
    Intended order: X_train, X_test, y_train, y_test.
    """
    np.random.seed(random_state)
    
    N = len(y)
    n_test = int(N * test_size)
    
    # NEW: shuffle indices randomly
    indices = np.random.permutation(N)
    
    # YOUR CODE HERE
    ## # Use slicing to split indices into test and train
    test_indices = indices[...]   # first n_test indices
    train_indices = indices[...]  # remaining indices
    ## 
    ## # Return: X_train, X_test, y_train, y_test
    return ..., ..., ..., ...
    raise NotImplementedError()

# Split with the defaults (test_size=0.2, random_state=42) and unpack the
# four returned values in order.
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f"Train: {len(y_train)}, Test: {len(y_test)}")

### 1.4 Feature Normalization

**NEW: `np.std()`** - Computes standard deviation (measures spread of data)

In [None]:
def normalize_zscore(X_train, X_test):
    """Z-score: (x - mean) / std. Use TRAINING stats for both!

    Parameters
    ----------
    X_train : array of training features; per-feature mean and standard
        deviation are computed along axis=0.
    X_test : array of test features with the same feature columns.

    Returns
    -------
    (X_train_norm, X_test_norm): both rescaled with the TRAINING mean and
    std, so no information from the test set is used.

    A feature that is constant in the training data has std == 0. Its divisor
    is replaced by 1 so it normalizes to 0 instead of producing NaN/inf.
    """
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)  # NEW: standard deviation

    # Guard against division by zero for constant features.
    safe_std = np.where(std == 0, 1.0, std)

    X_train_norm = (X_train - mean) / safe_std
    X_test_norm = (X_test - mean) / safe_std  # Use training stats!
    return X_train_norm, X_test_norm

# Normalize both sets with the training statistics, then confirm the training
# features now have (approximately) zero mean and unit standard deviation.
X_train_norm, X_test_norm = normalize_zscore(X_train, X_test)
print(f"Normalized mean: {X_train_norm.mean(axis=0).round(4)}")
print(f"Normalized std:  {X_train_norm.std(axis=0).round(4)}")

### 1.5 k-NN Implementation

Uses `np.linalg.norm()` and `np.argsort()` from L02.

In [None]:
def knn_predict(X_train, y_train, X_test, k=3):
    """k-Nearest Neighbors prediction.

    Exercise scaffold: the `...` placeholders must be filled in and the
    `raise NotImplementedError()` line removed before this function can run.

    Parameters
    ----------
    X_train : 2-D array of training features, one row per sample (distances
        are computed along axis=1).
    y_train : training labels. The vote counters below assume the two classes
        are labeled 0 and 1 - confirm against the caller.
    X_test : test points; each one is compared against every training row.
    k : number of nearest neighbors that vote (default 3).

    Returns
    -------
    NumPy array holding one predicted label (0 or 1) per test point.
    """
    predictions = []
    
    for x_test in X_test:
        # Distance to all training points (from L02!)
        distances = np.linalg.norm(X_train - x_test, axis=1)
        
        # YOUR CODE HERE
        ## # Find k nearest neighbors
        k_nearest_indices = ...   # indices of k smallest distances (hint: np.argsort)
        k_nearest_labels = ...    # labels of those neighbors
        ## 
        ## # Count votes for each class
        count_class_0 = ...  # how many neighbors have label 0?
        count_class_1 = ...  # how many neighbors have label 1?
        ## 
        ## # Predict: which class has more votes?
        # Class 1 wins only with strictly more votes; a tie predicts class 0.
        if count_class_1 > count_class_0:
            prediction = 1
        else:
            prediction = 0
        raise NotImplementedError()
        predictions.append(prediction)
    
    return np.array(predictions)

# Classify the normalized test set with k=5 and report the fraction of
# predictions that match the true labels.
y_pred = knn_predict(X_train_norm, y_train, X_test_norm, k=5)
accuracy = (y_pred == y_test).mean()
print(f"Test Accuracy: {accuracy:.2%}")

### 1.6 Precision and Recall

**Accuracy** can be misleading with imbalanced classes. Let's also compute:
- **Precision**: Of all predicted positives, how many are correct?
- **Recall**: Of all actual positives, how many did we find?

For our NBA data: Positive class = Forward/Center (label 1)

In [None]:
def compute_metrics(y_true, y_pred):
    """Compute accuracy, precision, and recall.

    Exercise scaffold: fill in the three `...` placeholders and remove the
    `raise NotImplementedError()` line; until then calling this function
    raises NotImplementedError.

    Parameters
    ----------
    y_true : true labels, with 1 as the positive class and 0 as the negative.
    y_pred : predicted labels, same length and encoding as `y_true`.
        Both are compared element-wise, so NumPy arrays are expected.

    Returns
    -------
    (accuracy, precision, recall), in that order.
    """
    # True Positives, False Positives, False Negatives, True Negatives
    # Each count is the number of positions where both conditions hold.
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    
    # YOUR CODE HERE
    ## # Compute the metrics using TP, FP, FN, TN
    # NOTE(review): a denominator can be zero when there are no predicted or
    # no actual positives - consider how to handle that case.
    accuracy = ...   
    precision = ...  
    recall = ...     
    raise NotImplementedError()
    
    return accuracy, precision, recall

# compute_metrics returns (accuracy, precision, recall); unpack in that order.
acc, prec, rec = compute_metrics(y_test, y_pred)
# `:.2%` prints a fraction as a percentage with two decimal places.
print(f"Accuracy:  {acc:.2%}")
print(f"Precision: {prec:.2%}  (of predicted Fwd/Ctr, how many correct?)")
print(f"Recall:    {rec:.2%}  (of actual Fwd/Ctr, how many did we find?)")

---
## Part 2: MNIST Digit Classification

**Key Concept**: Images are vectors! A 28×28 image = 784-dimensional vector.

In [None]:
# Load MNIST
# fetch_openml retrieves the 'mnist_784' dataset; as_frame=False asks for
# NumPy arrays rather than a pandas DataFrame.
print("Loading MNIST...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Store pixels as float32 and labels as int32 so they can be used in
# arithmetic and as array indices later on.
X_mnist = mnist.data.astype(np.float32)
y_mnist = mnist.target.astype(np.int32)
print(f"Shape: {X_mnist.shape}  (28×28 = 784 pixels per image)")

### 2.1 Visualize: Vector → Image

**NEW: `arr.reshape(28, 28)`** - Changes shape from (784,) to (28, 28) for display

In [None]:
# Show the first MNIST sample as an image
img_vector = X_mnist[0]               # flat vector of 784 pixel values
img_2d = img_vector.reshape(28, 28)   # NEW: same values as a 28-by-28 grid

# Explicit figure/axes pair instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(img_2d, cmap='gray')
ax.set_title(f'Label: {y_mnist[0]}')
ax.axis('off')  # no ticks or frame around the digit
plt.show()

In [None]:
# Work on a small subset: brute-force k-NN over the full dataset is too slow.
train_rows = slice(0, 5000)       # first 5000 samples for training
test_rows = slice(60000, 61000)   # 1000 samples that do not overlap the training rows

# Dividing by 255.0 rescales the pixel values to [0, 1]
X_train_mnist = X_mnist[train_rows] / 255.0
y_train_mnist = y_mnist[train_rows]
X_test_mnist = X_mnist[test_rows] / 255.0
y_test_mnist = y_mnist[test_rows]

print(f"Train: {X_train_mnist.shape}, Test: {X_test_mnist.shape}")

In [None]:
def knn_multiclass(X_train, y_train, X_test, k=3):
    """k-NN for multi-class (10 digits).

    Exercise scaffold: fill in the `...` placeholders and remove the
    `raise NotImplementedError()` line before running.

    Parameters
    ----------
    X_train : 2-D array of training samples, one row per image (distances are
        computed along axis=1).
    y_train : array of training labels. They index a 10-slot vote array, so
        integer labels 0-9 are expected.
    X_test : test samples, compared one at a time against all training rows.
    k : number of nearest neighbors that vote (default 3).

    Returns
    -------
    NumPy array with one predicted digit per test sample.
    """
    predictions = []
    
    for i, x_test in enumerate(X_test):
        # Progress message every 100 samples; end='\r' moves the cursor back to
        # the start of the line so each message overwrites the previous one.
        if i % 100 == 0:
            print(f"Processing {i}/{len(X_test)}...", end='\r')
        
        # Step 1: Compute distances to all training points
        distances = np.linalg.norm(X_train - x_test, axis=1)
        
        # Step 2: Find the k nearest neighbors' labels
        # argsort orders indices by increasing distance; keep the first k.
        k_nearest_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_nearest_indices]
        
        # Step 3: Count votes for each digit (0-9)
        vote_counts = np.zeros(10, dtype=int)  # 10 possible digits
        # YOUR CODE HERE
        ## # Increment the vote count for each neighbor's label
        for label in k_nearest_labels:
            vote_counts[...] += 1  # which index to increment?
        ## 
        ## # Step 4: Predict the digit with the most votes (hint: np.argmax)
        prediction = ...
        raise NotImplementedError()
        predictions.append(prediction)
    
    # Trailing spaces blank out what is left of the last progress message.
    print("Done!" + " "*20)
    return np.array(predictions)

# Classify the MNIST test subset with k=3, then report the share of correct predictions.
print("Running k-NN (this takes ~1 minute)...")
y_pred_mnist = knn_multiclass(X_train_mnist, y_train_mnist, X_test_mnist, k=3)
mnist_accuracy = (y_pred_mnist == y_test_mnist).mean()
print(f"MNIST Accuracy: {mnist_accuracy:.2%}")

### 2.3 Misclassified Examples

Let's see which digits kNN got wrong. This helps us understand the model's limitations.

In [None]:
# Find misclassified examples
# np.where returns a tuple; [0] is the array of positions where the prediction
# differs from the true label.
misclassified_idx = np.where(y_pred_mnist != y_test_mnist)[0]
print(f"Number of misclassified: {len(misclassified_idx)} out of {len(y_test_mnist)}")

# Display misclassified examples with their closest training image
n_show = 5
# squeeze=False keeps `axes` two-dimensional even if n_show is changed to 1,
# so the axes[row, col] indexing below always works.
fig, axes = plt.subplots(n_show, 2, figsize=(4, 10), squeeze=False)
axes[0, 0].set_title('Test Image', fontsize=10)
axes[0, 1].set_title('Nearest Neighbor', fontsize=10)

for row in range(n_show):
    test_ax, neighbor_ax = axes[row, 0], axes[row, 1]

    if row >= len(misclassified_idx):
        # Fewer than n_show mistakes: leave the unused row completely blank.
        test_ax.axis('off')
        neighbor_ax.axis('off')
        continue

    idx = misclassified_idx[row]

    # Test image (misclassified)
    test_img = X_test_mnist[idx].reshape(28, 28)
    test_ax.imshow(test_img, cmap='gray')
    test_ax.set_ylabel(f'True: {y_test_mnist[idx]}', fontsize=9)

    # Find the closest training image
    distances = np.linalg.norm(X_train_mnist - X_test_mnist[idx], axis=1)
    closest_idx = np.argmin(distances)
    closest_img = X_train_mnist[closest_idx].reshape(28, 28)
    neighbor_ax.imshow(closest_img, cmap='gray')
    neighbor_ax.set_ylabel(f'Label: {y_train_mnist[closest_idx]}', fontsize=9)

    # Hide only the ticks. axis('off') would also hide the y-labels set above,
    # which are the whole point of this figure.
    for ax in (test_ax, neighbor_ax):
        ax.set_xticks([])
        ax.set_yticks([])

plt.suptitle('Misclassified: Test vs Nearest Neighbor', fontsize=12)
plt.tight_layout()
plt.show()