## Model Comparison

In this notebook, we compare the predictive accuracy of K-Nearest Neighbors, Decision Trees, and Linear Regression on both classification and regression tasks using the solar installation dataset. For the classification task, Linear Regression is applied as a Linear Probability Model, and Logistic Regression is included as an additional classifier.

Click the badge below to open in Google Colab:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chuckgrigsby0/agec-784/blob/main/notebooks/06_model_comparison_solar_data.ipynb)

## Setup

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Data Loading

In [None]:
# Base URL for raw GitHub content
base_url = "https://raw.githubusercontent.com/chuckgrigsby0/agec-784/main/data/"

# Load the solar data directly from the GitHub URL into a DataFrame
solar_data = pd.read_csv(base_url + 'solar-data.csv')

# Confirm the load and report the DataFrame's shape as (rows, columns)
print("Data loaded successfully!")
print(f"Number of rows and columns: {solar_data.shape}")

In [None]:
# Preview the first rows of the data to check the available columns and values
solar_data.head()

## Data Preparation

In [None]:
# Create binary numeric variable for classification models
# Classification models require numeric (0/1) rather than categorical (Yes/No) target variables
# The membership check keeps this cell idempotent: DataFrame.insert raises
# ValueError if the column already exists, which would otherwise break a re-run.
if 'Install' not in solar_data.columns:
    # Place the new 0/1 column immediately to the right of the original 'Install?' column
    install_col_idx = solar_data.columns.get_loc('Install?') + 1
    solar_data.insert(
        install_col_idx,
        'Install',
        np.where(solar_data['Install?'] == 'Yes', 1, 0)  # 'Yes' -> 1, anything else -> 0
    )

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# Scoring models on the held-out rows assesses predictive accuracy on unseen data.
# A fixed random_state makes the split reproducible from run to run.
train_data, test_data = train_test_split(solar_data, train_size=0.7, test_size=0.3, random_state=731)

## Classification Comparison

In [None]:
def compare_classification_models(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    knn_nn: int = 3,
    tree_min_samples_split: int = 5,
    tree_min_samples_leaf: int = 3,
) -> pd.DataFrame:
    """
    Compare test-set classification accuracy across multiple models.

    Every model is fit on ``train_data`` using 'Income' and 'PSH' as
    predictors of the binary 'Install' column and is then scored on
    ``test_data``. The models are KNN, a Decision Tree, a Linear
    Probability Model (OLS on the 0/1 target) and Logistic Regression.

    Parameters:
    -----------
    train_data : DataFrame
        Training dataset; must contain 'Income', 'PSH' and 'Install' columns
    test_data : DataFrame
        Testing dataset; must contain 'Income', 'PSH' and 'Install' columns
    knn_nn : int, optional (default=3)
        Number of neighbors for KNN
    tree_min_samples_split : int, optional (default=5)
        Minimum samples required to split a node in Decision Tree
    tree_min_samples_leaf : int, optional (default=3)
        Minimum samples required in a leaf node in Decision Tree

    Returns:
    --------
    DataFrame with one row per model and columns 'Model' and 'Accuracy',
    sorted by Accuracy in descending order
    """
    results = []

    # Prepare features and target
    X_train = train_data[['Income', 'PSH']]
    y_train = train_data['Install']
    X_test = test_data[['Income', 'PSH']]
    y_test = test_data['Install']

    # 1. KNN Classification (k = knn_nn)
    # KNN is distance-based, so the features are standardized first.
    # The scaler is fit on the training data only and then applied to the
    # test data, so no test-set information enters the fitted transform.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    knn_clf = KNeighborsClassifier(n_neighbors=knn_nn, metric='euclidean')
    knn_clf.fit(X_train_scaled, y_train)
    knn_preds = knn_clf.predict(X_test_scaled)
    knn_accuracy = accuracy_score(y_test, knn_preds)
    results.append({'Model': f'KNN (k={knn_nn})', 'Accuracy': knn_accuracy})

    # 2. Decision Tree Classification (fit on the unscaled features)
    tree_clf = DecisionTreeClassifier(
        criterion='gini',                          # Measure of split quality for classification
        min_samples_split=tree_min_samples_split,  # Minimum samples required to split a node
        min_samples_leaf=tree_min_samples_leaf,    # Minimum samples required in a leaf node
        random_state=731                           # For reproducibility
    )
    tree_clf.fit(X_train, y_train)
    tree_preds = tree_clf.predict(X_test)
    tree_accuracy = accuracy_score(y_test, tree_preds)
    results.append({'Model': 'Decision Tree', 'Accuracy': tree_accuracy})

    # 3. Linear Probability Model (LPM)
    # OLS on the 0/1 target; fitted values are treated as probabilities and
    # converted to class labels with a 0.5 cutoff.
    lpm_train = smf.ols(formula='Install ~ Income + PSH', data=train_data).fit()
    lpm_pred_probs = lpm_train.predict(test_data)
    lpm_preds = np.where(lpm_pred_probs >= 0.5, 1, 0)
    lpm_accuracy = accuracy_score(y_test, lpm_preds)
    results.append({'Model': 'Linear Probability Model', 'Accuracy': lpm_accuracy})

    # 4. Logistic Regression (same 0.5 cutoff on the predicted probabilities)
    logit_train = smf.logit(formula='Install ~ Income + PSH', data=train_data).fit()
    logit_pred_probs = logit_train.predict(test_data)
    logit_preds = np.where(logit_pred_probs >= 0.5, 1, 0)
    logit_accuracy = accuracy_score(y_test, logit_preds)
    results.append({'Model': 'Logistic Regression', 'Accuracy': logit_accuracy})

    # Create results DataFrame, best-performing model first
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)

    return results_df

In [None]:
# Score every classification model on the held-out test data
classification_results = compare_classification_models(
    train_data,
    test_data,
    knn_nn=3,
    tree_min_samples_split=5,
    tree_min_samples_leaf=3,
)
print("Classification Model Comparison")
print("="*40)
classification_results

## Regression Comparison

In [None]:
def compare_regression_models(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    knn_nn: int = 3,
    tree_min_samples_split: int = 5,
    tree_min_samples_leaf: int = 3,
) -> pd.DataFrame:
    """
    Compare test-set regression performance across multiple models.

    Every model is fit on ``train_data`` using 'Income' and 'PSH' as
    predictors of the 'Profit' column and is then scored on ``test_data``.
    The models are KNN, a Decision Tree and Linear Regression (OLS).

    Parameters:
    -----------
    train_data : DataFrame
        Training dataset; must contain 'Income', 'PSH' and 'Profit' columns
    test_data : DataFrame
        Testing dataset; must contain 'Income', 'PSH' and 'Profit' columns
    knn_nn : int, optional (default=3)
        Number of neighbors for KNN
    tree_min_samples_split : int, optional (default=5)
        Minimum samples required to split a node in Decision Tree
    tree_min_samples_leaf : int, optional (default=3)
        Minimum samples required in a leaf node in Decision Tree

    Returns:
    --------
    DataFrame with one row per model and columns 'Model', 'MSE' and 'R²',
    sorted by R² in descending order
    """
    results = []

    # Prepare features and target
    X_train = train_data[['Income', 'PSH']]
    y_train = train_data['Profit']
    X_test = test_data[['Income', 'PSH']]
    y_test = test_data['Profit']

    # 1. KNN Regression (k = knn_nn)
    # KNN is distance-based, so the features are standardized first.
    # The scaler is fit on the training data only and then applied to the
    # test data, so no test-set information enters the fitted transform.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    knn_reg = KNeighborsRegressor(n_neighbors=knn_nn, metric='euclidean')
    knn_reg.fit(X_train_scaled, y_train)
    knn_preds = knn_reg.predict(X_test_scaled)
    knn_mse = mean_squared_error(y_test, knn_preds)
    knn_r2 = r2_score(y_test, knn_preds)
    results.append({'Model': f'KNN (k={knn_nn})', 'MSE': knn_mse, 'R²': knn_r2})

    # 2. Decision Tree Regression (fit on the unscaled features)
    tree_reg = DecisionTreeRegressor(
        criterion='squared_error',                 # MSE criterion for regression
        min_samples_split=tree_min_samples_split,  # Minimum samples required to split a node
        min_samples_leaf=tree_min_samples_leaf,    # Minimum samples required in a leaf node
        random_state=731                           # For reproducibility
    )
    tree_reg.fit(X_train, y_train)
    tree_preds = tree_reg.predict(X_test)
    tree_mse = mean_squared_error(y_test, tree_preds)
    tree_r2 = r2_score(y_test, tree_preds)
    results.append({'Model': 'Decision Tree', 'MSE': tree_mse, 'R²': tree_r2})

    # 3. Linear Regression (OLS)
    ols_train = smf.ols(formula='Profit ~ Income + PSH', data=train_data).fit()
    ols_preds = ols_train.predict(test_data)
    ols_mse = mean_squared_error(y_test, ols_preds)
    ols_r2 = r2_score(y_test, ols_preds)
    results.append({'Model': 'Linear Regression (OLS)', 'MSE': ols_mse, 'R²': ols_r2})

    # Create results DataFrame, best-performing model (highest R²) first
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('R²', ascending=False).reset_index(drop=True)

    return results_df

In [None]:
# Score every regression model on the held-out test data
regression_results = compare_regression_models(
    train_data,
    test_data,
    knn_nn=3,
    tree_min_samples_split=5,
    tree_min_samples_leaf=3,
)
print("Regression Model Comparison")
print("="*40)
regression_results