# Assignment 1 - Part 2: Overfitting Analysis
## Overfitting (8 points)

This notebook analyzes overfitting using a procedure similar to simulation.ipynb. We use a simple data generating process and study how R-squared measures change with model complexity.

## Load Required Libraries

In [None]:
# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)

# Console formatting: print 6 significant digits and penalise scientific
# notation heavily so numbers are shown in fixed notation
options(digits = 6, scipen = 999)

## Data Generation

Following the simulation.ipynb approach, we generate data from a simple data generating process (DGP) with a convenient slope that is the same in all three languages.

In [None]:
generate_data <- function(n = 1000, seed = 42) {
  #' Simulate the no-intercept linear model y = 2 * x + e.
  #'
  #' Seeds the global RNG, draws x from Uniform(0, 1) and sorts it, then draws
  #' standard-normal noise. The draw order (uniform first, normal second)
  #' determines the values produced for a given seed.
  #'
  #' @param n Sample size (default: 1000)
  #' @param seed Value passed to set.seed() (default: 42)
  #'
  #' @return List containing X (n x 1 matrix of the sorted draws) and
  #'   y (numeric vector of length n)

  set.seed(seed)

  # Regressor: sorted uniform draws, as in simulation.ipynb
  sorted_draws <- sort(runif(n, min = 0, max = 1))

  # Disturbance term
  noise <- rnorm(n, mean = 0, sd = 1)

  # True relationship has slope 2 and no intercept
  slope <- 2.0

  list(
    X = matrix(sorted_draws, nrow = n, ncol = 1),
    y = slope * sorted_draws + noise
  )
}

# Draw the sample once and expose the design matrix and response globally
data <- generate_data(n = 1000, seed = 42)
X <- data[["X"]]
y <- data[["y"]]

# Report sample size, the true model, and the observed ranges
cat(
  sprintf("Generated data with n=%d observations\n", length(y)),
  "True relationship: y = 2*X + e (convenient slope = 2.0)\n",
  sprintf("X range: [%.4f, %.4f]\n", range(X)[1], range(X)[2]),
  sprintf("y range: [%.4f, %.4f]\n", range(y)[1], range(y)[2]),
  sep = ""
)

## Helper Functions

In [None]:
create_polynomial_features <- function(X, n_features) {
  #' Create polynomial features x^1, x^2, ..., x^n_features.
  #'
  #' Only the first column of X is used as the base variable.
  #'
  #' @param X Original feature matrix (n x 1)
  #' @param n_features Number of features to create; a single non-negative
  #'   number. Zero yields a matrix with n rows and no columns.
  #'
  #' @return Numeric matrix (n x n_features) whose i-th column is X[, 1]^i

  stopifnot(
    is.numeric(n_features),
    length(n_features) == 1,
    !is.na(n_features),
    n_features >= 0
  )

  # seq_len() is empty for zero, whereas 1:0 would count down through c(1, 0)
  powers <- seq_len(as.integer(n_features))

  # unname() keeps the result free of dimnames even if X carries row names
  base_column <- unname(X[, 1])

  X_poly <- outer(base_column, powers, "^")
  X_poly
}

calculate_adjusted_r2 <- function(r2, n, k) {
  #' Adjusted R-squared for a model with k regressors.
  #'
  #' Adjusted R² = 1 - [(1 - R²)(n - 1) / (n - k - 1)]
  #'
  #' @param r2 R-squared value
  #' @param n Sample size
  #' @param k Number of features (excluding intercept)
  #'
  #' @return Adjusted R-squared, or NA when n - k - 1 is not positive

  residual_df <- n - k - 1

  if (residual_df > 0) {
    return(1 - ((1 - r2) * (n - 1) / residual_df))
  }

  # No residual degrees of freedom left: the statistic is undefined
  NA
}

r2_score <- function(y_true, y_pred) {
  #' Coefficient of determination: 1 - SS_res / SS_tot.
  #'
  #' @param y_true Observed values
  #' @param y_pred Predicted values, aligned with y_true
  #'
  #' @return R-squared; it is negative when the predictions fit worse than
  #'   the mean of y_true

  prediction_errors <- y_true - y_pred
  mean_deviations <- y_true - mean(y_true)

  1 - (sum(prediction_errors^2) / sum(mean_deviations^2))
}

train_test_split <- function(X, y, test_size = 0.25, random_state = 42) {
  #' Split data into training and testing sets.
  #'
  #' Seeds the global RNG with random_state, shuffles the row indices and
  #' assigns the first round(n * test_size) of them to the test set.
  #'
  #' @param X Feature matrix with one row per observation
  #' @param y Response vector with length equal to nrow(X)
  #' @param test_size Fraction of observations for the test set, in [0, 1]
  #' @param random_state Value passed to set.seed() (default: 42)
  #'
  #' @return List with X_train, X_test (matrices) and y_train, y_test

  stopifnot(
    is.numeric(test_size),
    length(test_size) == 1,
    !is.na(test_size),
    test_size >= 0,
    test_size <= 1,
    nrow(X) == length(y)
  )

  set.seed(random_state)
  n <- length(y)
  n_test <- round(n * test_size)

  # Same permutation as sample(1:n, n), but also well defined for n = 0
  shuffled <- sample.int(n)

  # seq_len() slicing stays empty when a partition has no rows; the previous
  # 1:n_test and (n_test + 1):n ranges counted backwards in those cases
  test_indices <- shuffled[seq_len(n_test)]
  train_indices <- shuffled[n_test + seq_len(n - n_test)]

  list(
    X_train = X[train_indices, , drop = FALSE],
    X_test = X[test_indices, , drop = FALSE],
    y_train = y[train_indices],
    y_test = y[test_indices]
  )
}

# Smoke-test the helpers on the generated data
X_poly_example <- create_polynomial_features(X, 5)
cat(
  sprintf("Original X shape: (%d, %d)\n", dim(X)[1], dim(X)[2]),
  sprintf(
    "Polynomial features (5 features) shape: (%d, %d)\n",
    dim(X_poly_example)[1], dim(X_poly_example)[2]
  ),
  sprintf("Example adjusted R²: %.4f\n", calculate_adjusted_r2(0.8, 1000, 5)),
  sep = ""
)

## Overfitting Analysis

Test models with different numbers of polynomial features: 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000

In [None]:
.fit_min_norm_ols <- function(design, response) {
  #' Minimum-norm least-squares coefficients (no intercept) via the SVD.
  #'
  #' Singular values at or below max(dim) * machine epsilon * largest singular
  #' value are treated as zero, so a rank-deficient or badly conditioned
  #' design still gets a finite solution.
  #'
  #' @param design Numeric design matrix (n x p)
  #' @param response Numeric response of length n
  #'
  #' @return Coefficient matrix (p x 1)

  decomp <- svd(design)
  cutoff <- max(dim(design)) * .Machine$double.eps * decomp$d[1]
  keep <- decomp$d > cutoff

  projected <- crossprod(decomp$u[, keep, drop = FALSE], response)
  decomp$v[, keep, drop = FALSE] %*% (projected / decomp$d[keep])
}

overfitting_analysis <- function(
    n_features_list = c(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000),
    features = X,
    target = y) {
  #' Main function to perform overfitting analysis.
  #'
  #' For each polynomial degree, fits a no-intercept least-squares model on
  #' the full sample (R² and adjusted R²) and on a 75% training split, which
  #' is scored on the remaining 25% (out-of-sample R²). One line per degree
  #' is printed while the analysis runs.
  #'
  #' Coefficients come from an SVD-based minimum-norm solver. High powers of
  #' x in (0, 1) are nearly collinear, so inverting X'X directly with solve()
  #' reports a computationally singular system well before the number of
  #' features reaches the sample size.
  #'
  #' @param n_features_list Polynomial degrees to evaluate
  #' @param features Base feature matrix (n x 1); defaults to the global X
  #' @param target Response vector; defaults to the global y
  #'
  #' @return data.frame with one row per degree and columns n_features,
  #'   r2_full, adj_r2_full, r2_out_of_sample (NA where a fit failed)

  # Resolve the defaults against the calling environment up front
  force(features)
  force(target)

  cat("Analyzing overfitting for different numbers of features...\n")
  cat("Features | R² (full) | Adj R² (full) | R² (out-of-sample)\n")
  cat(paste(rep("-", 60), collapse = ""), "\n")

  evaluate_degree <- function(n_feat) {
    X_poly <- create_polynomial_features(features, n_feat)

    # Split data into train/test (75%/25%); the fixed seed gives the same
    # partition of rows for every degree
    split_data <- train_test_split(
      X_poly, target,
      test_size = 0.25, random_state = 42
    )

    # In-sample fit on all observations
    beta_full <- .fit_min_norm_ols(X_poly, target)
    r2_full <- r2_score(target, X_poly %*% beta_full)
    adj_r2_full <- calculate_adjusted_r2(r2_full, length(target), n_feat)

    # Fit on the training rows only, then score on the held-out rows
    beta_train <- .fit_min_norm_ols(split_data$X_train, split_data$y_train)
    r2_out_of_sample <- r2_score(
      split_data$y_test,
      split_data$X_test %*% beta_train
    )

    data.frame(
      n_features = n_feat,
      r2_full = r2_full,
      adj_r2_full = as.numeric(adj_r2_full),
      r2_out_of_sample = r2_out_of_sample
    )
  }

  # Collect one row per degree and bind once at the end
  rows <- lapply(n_features_list, function(n_feat) {
    tryCatch({
      row <- evaluate_degree(n_feat)
      cat(sprintf(
        "%8d | %9.4f | %12.4f | %17.4f\n",
        n_feat, row$r2_full, row$adj_r2_full, row$r2_out_of_sample
      ))
      row
    }, error = function(e) {
      cat(sprintf("Error with %d features: %s\n", n_feat, conditionMessage(e)))
      # Keep a placeholder row so every requested degree appears in the output
      data.frame(
        n_features = n_feat,
        r2_full = NA_real_,
        adj_r2_full = NA_real_,
        r2_out_of_sample = NA_real_
      )
    })
  })

  if (length(rows) == 0) {
    return(data.frame(
      n_features = numeric(),
      r2_full = numeric(),
      adj_r2_full = numeric(),
      r2_out_of_sample = numeric()
    ))
  }

  do.call(rbind, rows)
}

# Run the analysis with its default settings; results_df holds one row per
# tested number of features and is reused by the plotting, summary and
# export cells below
results_df <- overfitting_analysis()

## Visualization

Create three separate graphs, one for each R-squared measure, as requested.

In [None]:
create_separate_plots <- function(df_results) {
  #' Create three separate plots for R-squared analysis.
  #'
  #' Draws each R-squared measure against the number of features on a log10
  #' x axis, prints the plots in order and returns them. Rows are dropped per
  #' measure, so a row whose adjusted R² is NA still appears in the other two
  #' plots.
  #'
  #' @param df_results data.frame with columns n_features, r2_full,
  #'   adj_r2_full and r2_out_of_sample
  #'
  #' @return List with the ggplot objects p1 (full-sample R²), p2 (adjusted
  #'   R²) and p3 (out-of-sample R²)

  # Builds one line-and-point chart for a single measure column
  build_plot <- function(metric, colour, shape, title, y_label) {
    keep <- !is.na(df_results[[metric]])
    plot_data <- data.frame(
      n_features = df_results[["n_features"]][keep],
      value = df_results[[metric]][keep]
    )

    # `size` is kept for the line width so older ggplot2 releases still work;
    # newer releases prefer `linewidth` and may emit a deprecation warning
    ggplot(plot_data, aes(x = n_features, y = value)) +
      geom_line(size = 1, color = colour) +
      geom_point(size = 3, color = colour, shape = shape) +
      scale_x_log10() +
      labs(
        title = title,
        x = "Number of Features (log scale)",
        y = y_label
      ) +
      theme_bw() +
      theme(
        plot.title = element_text(size = 12, face = "bold"),
        # element_line() has no `alpha` argument; transparency has to be
        # encoded in the colour itself
        panel.grid.minor = element_line(colour = alpha("grey50", 0.3))
      )
  }

  # Plot 1: R-squared (full sample), pinned to the [0, 1] range
  p1 <- build_plot(
    "r2_full", "blue", 19,
    "R-squared on Full Sample vs Number of Features",
    "R-squared"
  ) + ylim(0, 1)
  print(p1)

  # Plot 2: Adjusted R-squared (full sample)
  p2 <- build_plot(
    "adj_r2_full", "green", 15,
    "Adjusted R-squared on Full Sample vs Number of Features",
    "Adjusted R-squared"
  )
  print(p2)

  # Plot 3: Out-of-sample R-squared
  p3 <- build_plot(
    "r2_out_of_sample", "red", 17,
    "Out-of-Sample R-squared vs Number of Features",
    "Out-of-Sample R-squared"
  )
  print(p3)

  list(p1 = p1, p2 = p2, p3 = p3)
}

# Render the three charts and keep the ggplot objects for later use
plots <- create_separate_plots(results_df)

# Short legend describing what each chart is meant to show
cat(
  "\nThree separate plots created showing:\n",
  "1. R² (Full Sample): Shows monotonic increase\n",
  "2. Adjusted R² (Full Sample): Shows peak and decline due to complexity penalty\n",
  "3. R² (Out-of-Sample): Shows the classic overfitting pattern\n",
  sep = ""
)

## Results Summary

In [None]:
# Print the full results table
cat("\n=== COMPLETE RESULTS TABLE ===\n")
print(results_df, row.names = FALSE, digits = 4)

# Pick the best complexity among rows where every measure is available
valid_results <- results_df[complete.cases(results_df), ]
if (nrow(valid_results) > 0) {
  optimal_adj_r2_idx <- which.max(valid_results[["adj_r2_full"]])
  optimal_oos_r2_idx <- which.max(valid_results[["r2_out_of_sample"]])

  cat(
    "\n=== OPTIMAL MODEL COMPLEXITY ===\n",
    sprintf(
      "By Adjusted R²: %d features\n",
      valid_results[["n_features"]][optimal_adj_r2_idx]
    ),
    sprintf(
      "By Out-of-Sample R²: %d features\n",
      valid_results[["n_features"]][optimal_oos_r2_idx]
    ),
    sep = ""
  )
}

# Fixed interpretation notes; they are not derived from the numbers above
cat(
  "\n=== INSIGHTS ===\n",
  "✅ This analysis demonstrates the classic bias-variance tradeoff\n",
  "📈 R² (Full Sample) increases monotonically with model complexity\n",
  "📊 Adjusted R² peaks early and then declines due to complexity penalty\n",
  "📉 Out-of-Sample R² shows the inverted U-shape characteristic of overfitting\n",
  "🎯 True model has only 1 feature (y = 2*X + e), but polynomial terms can help initially\n",
  "⚠️ High-dimensional models (many features) lead to severe overfitting\n",
  sep = ""
)

## Save Results

In [None]:
# Make sure the output folder exists before writing into it
output_dir <- "../output"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# Write the results table as CSV (without row names) and confirm the location
write.csv(
  results_df,
  file.path(output_dir, "overfitting_results_r.csv"),
  row.names = FALSE
)
cat(sprintf("Results saved to %s/overfitting_results_r.csv\n", output_dir))

# Closing recap of the data generating process used in this notebook
cat(
  "\n🎉 R overfitting analysis complete!\n",
  "Data generation follows simulation.ipynb approach with:\n",
  "- X ~ Uniform(0,1), sorted, n=1000\n",
  "- e ~ Normal(0,1)\n",
  "- y = 2*X + e (convenient slope = 2.0)\n",
  "- No intercept (as requested)\n",
  "- Seed = 42 for reproducibility\n",
  sep = ""
)