### Load packages and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Folder on GitHub that serves the raw course data files
base_url = "https://raw.githubusercontent.com/chuckgrigsby0/agec-784/main/data/"

# Read the solar dataset straight from its URL (needs network access)
solar_url = base_url + 'solar-data.csv'
solar_data = pd.read_csv(solar_url)

# Confirm the load and report the (rows, columns) shape
print("Data loaded successfully!")
print(f"Number of rows and columns: {solar_data.shape}")

### Data exploration

In [None]:
# Print the column names of the dataset.
# Note: .columns is an attribute of solar_data, so it is accessed without parentheses.
print(solar_data.columns)

In [None]:
# Show the first 5 rows of the dataset.
# Ending the cell with the expression (instead of print) lets Jupyter render it as a formatted table.
solar_data.head()

In [None]:
# Summary statistics, shown to 4 decimal places.
# Note: describe() reports numeric columns only for a mixed-type DataFrame.
solar_data.describe().round(4)

In [None]:
# Count how many households fall in each category of 'Install?' (installed solar or not)
# using the value_counts() method of the column.
solar_data['Install?'].value_counts()

We can create a correlation matrix using the `corr()` method. 

In [None]:
# Keep only the numeric columns, then compute their pairwise correlations
numeric_cols = solar_data.select_dtypes(include=np.number)
cor_mat = numeric_cols.corr()

# Display rounded to 4 decimals; cor_mat itself keeps full precision
cor_mat.round(4)

### Create a binary outcome variable for `Install?` (Yes = 1, No = 0)

To prepare the data for estimation, we create a binary (0/1) variable based on solar installation status. The `insert()` method allows us to place `Install` immediately following `Install?`. 

In [None]:

# Add the 0/1 indicator right after 'Install?' (Yes -> 1, anything else -> 0).
# DataFrame.insert() raises a ValueError if the column already exists, so the guard
# keeps this cell safe to re-run without restarting the kernel.
if 'Install' not in solar_data.columns:
    i = solar_data.columns.get_loc('Install?') + 1  # position immediately after 'Install?'
    solar_data.insert(i, 'Install', np.where(solar_data['Install?'] == 'Yes', 1, 0))

### Estimate a KNN Model with Scaled Data

The KNN classifier is sensitive to the scale of variables, as predictions are based on distances between observations' features. Variables with larger scales will dominate the distance calculation. For example, if one variable ranges from 33-290 (`Income` in thousands of dollars) while another ranges from 1.5-7 (peak-sun-hours, `PSH`), `Income` will disproportionately influence which neighbors are considered "nearest," even if `PSH` is equally important for classification. Therefore, we should standardize predictors to ensure equal contribution to distances. One common approach is Z-score standardization, which transforms all variables to have mean zero and standard deviation one: $z = \frac{x - \bar{x}}{\text{sd}(x)}$

We will compare unscaled and scaled model accuracy at the end of the notebook. First, we estimate a KNN model using the scaled data. 

We will use `Income` and `PSH` as predictor variables.

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-score standardizer: centers each column (with_mean) and scales it to unit SD (with_std).
# set_output(transform='pandas') makes the transform return a DataFrame instead of an array.
scaler = StandardScaler(with_mean=True, with_std=True).set_output(transform='pandas')

# NOTE(review): the scaler is fit on the full sample here, before the train/test split below,
# so test rows contribute to the means and SDs. Consider fitting on the training rows only.
income_and_psh = solar_data[['Income', 'PSH']]
predictor_vars_scaled = scaler.fit_transform(income_and_psh)

In [None]:
# Split the standardized predictors and the 0/1 outcome into 60% training / 40% test rows.
# random_state=0 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    predictor_vars_scaled,
    solar_data['Install'],
    train_size=0.6,
    test_size=0.4,
    random_state=0
)

#### Evaluating on Training vs Test Data

Unlike parametric models, KNN does not learn parameters during fitting. The `fit()` call stores the training data in memory. When predicting, KNN computes distances from each test observation to all training observations, identifies the k-nearest neighbors, and uses majority vote for classification. With `n_neighbors=1`, evaluating on training data yields perfect accuracy since each point is its own nearest neighbor. However, evaluating on test data provides a reliable estimate of generalization error, as predictions use only the stored training data.

In [None]:
# k = 1: classify each observation by its single nearest training point (Euclidean distance)
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=1)
knn_fit = knn.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out test split
train_accuracy = knn_fit.score(X_train, y_train)
test_accuracy = knn_fit.score(X_test, y_test)

print(train_accuracy)
print(test_accuracy)

In [None]:
# k = 3: classify by majority vote among the three nearest training points (Euclidean distance)
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=3)
knn_fit = knn.fit(X_train, y_train)

# Print accuracy for the training split first, then for the test split
for features, labels in [(X_train, y_train), (X_test, y_test)]:
    print(knn_fit.score(features, labels))

### We can obtain the predicted outcomes using the `predict()` method

In [None]:
# Pair each test observation's actual outcome with the model's prediction.
# The frame keeps y_test's index, so rows remain traceable to the original data.
predicted_labels = knn_fit.predict(X_test)
preds = y_test.to_frame(name='Actual').assign(Predicted=predicted_labels)

preds.head()

### Assessing Model Accuracy

#### Accuracy = 1 - Misclassification Rate

The `score()` method measures the model's predictive accuracy. We can also compute accuracy manually by comparing the actual and predicted values.

In [None]:
# All three yield the same test-set accuracy (up to floating-point rounding)

# Accuracy = 1 - misclassification rate.
# The subtraction must sit outside np.mean(): `1 - a != b` parses as `(1 - a) != b`,
# which matches accuracy only by coincidence when the labels are coded 0/1.
print(1 - np.mean(preds['Actual'] != preds['Predicted']))

print(np.mean(preds['Actual'] == preds['Predicted']))  # Share of correct predictions

print(knn_fit.score(X_test, y_test))  # Using score() method

#### Confusion Matrix

A confusion matrix displays the model's predictions versus actual outcomes: true negatives (TN, correctly predicting non-installation), false negatives (FN, incorrectly predicting non-installation), true positives (TP, correctly predicting installation), and false positives (FP, incorrectly predicting installation). The diagonal elements represent correct predictions, while off-diagonal elements represent errors. The model correctly predicted 13 non-installations (TN) and 20 installations (TP), but incorrectly predicted 4 households would not install when they did (FN) and 3 would install when they did not (FP).

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Cross-tabulate actual vs. predicted classes on the test set
# (scikit-learn convention: rows = actual class, columns = predicted class)
cm = confusion_matrix(y_true=preds['Actual'], y_pred=preds['Predicted'])

# Draw the matrix; the trailing semicolon hides the returned object's repr
disp = ConfusionMatrixDisplay(cm)
disp.plot();


#### Decision Boundary

The decision boundary plot shows the regions where the KNN model predicts each class across the two features (`Income` and `PSH`). The colored background regions represent what the model would predict for any hypothetical point in that area, based on the majority vote of the k=3 nearest training neighbors. Test data points are overlaid as scatter points, allowing us to assess model performance. Points falling in the correctly colored region indicate correct predictions, while points in the wrong region indicate misclassifications. As k increases, the decision boundaries become smoother and less sensitive to individual training points.

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Create a KNN classifier with k=3 neighbors using Euclidean distance
knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')

# Fit the model on training data (stores X_train and y_train in memory)
knn_fit = knn.fit(X_train, y_train)

# Name the two plotted features once (Income, PSH) so the axis labels,
# scatter coordinates and axis limits all refer to the same columns
x_col, y_col = X_train.columns[0], X_train.columns[1]

# Create the decision boundary visualization
disp = DecisionBoundaryDisplay.from_estimator(
    estimator=knn_fit,              # The fitted KNN model
    X=X_train,                      # Training data used to determine the plotted grid range
    plot_method='contourf',         # Use filled contours for the background regions
    response_method='predict',      # Use the predict method to determine class regions
    xlabel=x_col,                   # Label x-axis with first feature name (Income)
    ylabel=y_col,                   # Label y-axis with second feature name (PSH)
    alpha=0.5,                      # Set transparency of colored regions to 50%
    cmap='RdYlBu_r',                # Color scheme: reversed red-yellow-blue
)

# Overlay test data points on the decision boundary plot
# c=y_test colors points by their actual class labels
scatter = disp.ax_.scatter(
    X_test[x_col],                  # X-coordinates from test data
    X_test[y_col],                  # Y-coordinates from test data
    c=y_test,                       # Color by actual test labels
    cmap='Paired',                  # Colormap for the scatter points
    s=20,                           # Size of points
    edgecolor="b"                   # Blue edge around each point
)

# Set axis limits to match the range of the test data
disp.ax_.set_xlim(X_test[x_col].min(), X_test[x_col].max())
disp.ax_.set_ylim(X_test[y_col].min(), X_test[y_col].max())

# Add legend to identify the classes
disp.ax_.legend(
    scatter.legend_elements()[0],   # Get the legend handles from scatter plot
    ['No', 'Yes'],                  # Label the classes (0 = No, 1 = Yes)
    loc='best',                     # Automatically choose best legend location
    title='Classes'                 # Title for the legend
)

# Add title to the plot showing the algorithm and k value
# The underscore (_) captures the return value (not needed)
_ = disp.ax_.set_title(
    f"Solar Installation Classification\nKNN (k={knn.n_neighbors}) Decision Boundary"
)

### Compare scaled and unscaled model accuracy

We will compare model accuracy using the scaled and unscaled predictors and draw the two decision boundary plots side by side to visualize model performance.

First, we will prepare the unscaled train and test splits. 

In [None]:
# Raw (unstandardized) predictors, in their original units
predictor_vars = solar_data[['Income', 'PSH']]

# 60/40 split with random_state=0, applied to the unscaled predictors
split_unscaled = train_test_split(
    predictor_vars,
    solar_data['Install'],
    train_size=0.6,
    test_size=0.4,
    random_state=0
)
X_train_unscaled, X_test_unscaled, y_train_unscaled, y_test_unscaled = split_unscaled

We repeat the same steps to create decision boundary figures for both the unscaled and scaled models, but now plot them side-by-side for comparison. The key is using `plt.subplots(1, 2)` to create one figure with two subplot axes, then specifying which axes to draw on using the `ax` parameter in `DecisionBoundaryDisplay.from_estimator()`.

In [None]:
def _plot_knn_panel(ax, X_fit, y_fit, X_eval, y_eval, heading, k=3):
    """Fit a KNN classifier and draw its decision regions plus evaluation points on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes
        Subplot to draw on.
    X_fit, y_fit : DataFrame, Series
        Training predictors (must contain 'Income' and 'PSH' columns) and labels.
    X_eval, y_eval : DataFrame, Series
        Hold-out predictors and labels; overlaid as points and used for accuracy.
    heading : str
        First line of the panel title.
    k : int, default 3
        Number of neighbors.

    Returns
    -------
    tuple
        (fitted classifier, DecisionBoundaryDisplay, scatter artist, hold-out accuracy)
    """
    model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    model.fit(X_fit, y_fit)

    # Background: predicted class for every point of a grid spanning the training data
    display = DecisionBoundaryDisplay.from_estimator(
        estimator=model,
        X=X_fit,
        ax=ax,                          # Specify which subplot to use
        plot_method='contourf',
        response_method='predict',
        xlabel=X_fit.columns[0],
        ylabel=X_fit.columns[1],
        alpha=0.5,
        cmap='RdYlBu_r'
    )

    # Foreground: hold-out observations colored by their actual class
    points = ax.scatter(
        X_eval["Income"],
        X_eval["PSH"],
        c=y_eval,
        cmap='Paired',
        s=20,
        edgecolor="b"
    )

    # Zoom to the range of the hold-out data and annotate the panel
    ax.set_xlim(X_eval["Income"].min(), X_eval["Income"].max())
    ax.set_ylim(X_eval["PSH"].min(), X_eval["PSH"].max())
    ax.legend(points.legend_elements()[0], ['No', 'Yes'], loc='best', title='Classes')
    accuracy = model.score(X_eval, y_eval)
    ax.set_title(f"{heading}\nKNN (k={k}) | Accuracy: {accuracy:.3f}")

    return model, display, points, accuracy


# Create figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: model fit on the unscaled predictors
knn_unscaled, disp1, scatter1, acc_unscaled = _plot_knn_panel(
    ax1, X_train_unscaled, y_train_unscaled, X_test_unscaled, y_test_unscaled,
    "Without Scaling"
)

# Right panel: model fit on the standardized predictors
knn_scaled, disp2, scatter2, acc_scaled = _plot_knn_panel(
    ax2, X_train, y_train, X_test, y_test,
    "With Scaling"
)

# Overall title
fig.suptitle("Effect of Feature Scaling on KNN Decision Boundaries", fontsize=14, y=1.02)

plt.tight_layout()
plt.show()

# Print comparison
print(f"Accuracy without scaling: {acc_unscaled:.3f}")
print(f"Accuracy with scaling: {acc_scaled:.3f}")
print(f"Improvement: {(acc_scaled - acc_unscaled):.3f}")

