# Assignment 1 - Part 2: Overfitting Analysis
## Overfitting (8 points)

This notebook analyzes overfitting using a procedure similar to simulation.ipynb. We use a simple data generating process and study how R-squared measures change with model complexity.

## Load Required Packages

In [None]:
using LinearAlgebra
using Random
using Printf
using Plots
using DataFrames
using CSV
using Statistics

# Set plotting backend
gr()

## Data Generation

Following the approach in simulation.ipynb, we generate data from a simple data-generating process (DGP) with a convenient slope of 2, so that the same setup can be reproduced in all three languages.

In [None]:
"""
    generate_data(n=1000; seed=42)

Generate a reproducible sample from the no-intercept model `y = 2x + e`.

The regressor is drawn with `rand` (uniform on [0, 1)) and sorted in ascending
order; the error term is drawn with `randn` (standard normal). The global random
number generator is re-seeded with `seed` on every call, so equal arguments give
equal samples.

# Arguments
- `n`: sample size (default `1000`).

# Keywords
- `seed`: seed passed to `Random.seed!` (default `42`).

# Returns
- `X`: `n x 1` matrix holding the sorted regressor.
- `y`: length-`n` vector of responses.
"""
function generate_data(n=1000; seed=42)
    Random.seed!(seed)

    # Draw the regressor before the noise: the order of the two draws determines
    # which random numbers each one receives, so it must not be swapped.
    x = sort(rand(n))
    X = reshape(x, n, 1)

    # Error term
    e = randn(n)

    # True relationship has slope 2 and no intercept.
    beta_true = 2.0
    y = beta_true * x + e

    return X, y
end

# Draw the sample that the rest of the notebook works with.
X, y = generate_data(1000; seed=42)

@printf("Generated data with n=%d observations\n", length(y))
println("True relationship: y = 2*X + e (convenient slope = 2.0)")
# `extrema` returns (minimum, maximum), which fills both placeholders in order.
@printf("X range: [%.4f, %.4f]\n", extrema(X)...)
@printf("y range: [%.4f, %.4f]\n", extrema(y)...)

## Helper Functions

In [None]:
"""
    create_polynomial_features(X, n_features)

Build a polynomial design matrix from the first column of `X`.

Column `j` of the result holds the first column of `X` raised element-wise to
the power `j`, for `j = 1, ..., n_features`. No constant (power 0) column is
included.

# Arguments
- `X`: feature matrix; only its first column is used.
- `n_features`: number of polynomial columns to create.

# Returns
- A `size(X, 1) x n_features` matrix of `Float64`.
"""
function create_polynomial_features(X, n_features)
    n_samples = size(X, 1)
    base = X[:, 1]  # copy the base column once instead of once per power
    X_poly = zeros(n_samples, n_features)

    for (degree, column) in enumerate(eachcol(X_poly))
        column .= base .^ degree  # x^1, x^2, x^3, ...
    end

    return X_poly
end

"""
    calculate_adjusted_r2(r2, n, k)

Return the adjusted R-squared

    Adjusted R² = 1 - (1 - R²)(n - 1) / (n - k - 1)

# Arguments
- `r2`: R-squared value.
- `n`: sample size.
- `k`: number of features (excluding intercept).

# Returns
- The adjusted R-squared, or `NaN` when `n - k - 1 <= 0`, i.e. when there are
  no residual degrees of freedom left and the formula is undefined.
"""
function calculate_adjusted_r2(r2, n, k)
    residual_dof = n - k - 1
    # Guard against dividing by zero or by a negative degrees-of-freedom count.
    residual_dof <= 0 && return NaN

    return 1 - ((1 - r2) * (n - 1) / residual_dof)
end

"""
    r2_score(y_true, y_pred)

Return the R-squared score `1 - SS_res / SS_tot`, where `SS_res` is the sum of
squared prediction errors and `SS_tot` is the sum of squared deviations of
`y_true` from its mean.

The score can be negative when the predictions fit worse than the mean of
`y_true`. If `y_true` is constant, `SS_tot` is zero and the result is `-Inf`
(or `NaN` when the predictions are also exact).

# Arguments
- `y_true`: observed values.
- `y_pred`: predicted values, with the same length as `y_true`.
"""
function r2_score(y_true, y_pred)
    ss_res = sum(abs2, y_true - y_pred)
    ss_tot = sum(abs2, y_true .- mean(y_true))
    return 1 - (ss_res / ss_tot)
end

"""
    train_test_split(X, y; test_size=0.25, random_state=42)

Randomly split the rows of `X` and the entries of `y` into a training part and
a testing part.

The global random number generator is re-seeded with `random_state`, so the
same arguments always produce the same split.

# Arguments
- `X`: feature matrix with one row per observation.
- `y`: target vector with one entry per observation.

# Keywords
- `test_size`: fraction of observations assigned to the test set, in `[0, 1]`
  (default `0.25`). The test set has `round(Int, n * test_size)` observations.
- `random_state`: seed passed to `Random.seed!` (default `42`).

# Returns
- `(X_train, X_test, y_train, y_test)`.

# Throws
- `DimensionMismatch` if `X` and `y` disagree on the number of observations.
- `ArgumentError` if `test_size` lies outside `[0, 1]`.
"""
function train_test_split(X, y; test_size=0.25, random_state=42)
    n = length(y)
    size(X, 1) == n || throw(
        DimensionMismatch("X has $(size(X, 1)) rows but y has $n elements")
    )
    0 <= test_size <= 1 || throw(
        ArgumentError("test_size must be between 0 and 1, got $test_size")
    )

    Random.seed!(random_state)
    n_test = round(Int, n * test_size)
    indices = randperm(n)

    # The first n_test shuffled positions form the test set; the rest is training data.
    test_indices = indices[1:n_test]
    train_indices = indices[n_test+1:end]

    return X[train_indices, :], X[test_indices, :], y[train_indices], y[test_indices]
end

# Smoke-test the helpers on the generated sample.
X_poly_example = create_polynomial_features(X, 5)
@printf("Original X shape: (%d, %d)\n", size(X, 1), size(X, 2))
@printf("Polynomial features (5 features) shape: (%d, %d)\n",
        size(X_poly_example, 1), size(X_poly_example, 2))
@printf("Example adjusted R²: %.4f\n", calculate_adjusted_r2(0.8, 1000, 5))

## Overfitting Analysis

Test models with different numbers of polynomial features: 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000

In [None]:
# Coefficients of a linear model without intercept, fitted to design matrix `A` and target `b`.
function _fit_no_intercept(A, b; lambda=1e-6)
    n_obs, n_cols = size(A)
    if n_cols >= n_obs
        # At least as many features as observations: add a tiny ridge penalty so
        # the normal equations stay solvable.
        return (A' * A + lambda * I) \ (A' * b)
    end
    # Tall design matrix: let `\` solve the least-squares problem on `A` itself.
    # High-degree powers of values in [0, 1) are almost collinear, and forming
    # A'A first would square an already very large condition number.
    return A \ b
end

"""
    overfitting_analysis(X_data=X, y_data=y; n_features_list=[1, 2, 5, ..., 1000])

Fit no-intercept polynomial regressions of increasing size and record how three
R-squared measures evolve.

For every entry of `n_features_list` the function builds polynomial features
from `X_data`, then

1. fits on the full sample and computes R² and adjusted R²;
2. fits on a 75% training split (seed 42) and computes R² on the remaining 25%.

A progress table is printed while running. If a fit throws, the error is
printed and a row of `NaN`s is stored so the table keeps one row per feature
count.

# Arguments
- `X_data`: feature matrix whose first column is expanded into polynomial
  terms. Defaults to the global `X`.
- `y_data`: target vector. Defaults to the global `y`.

# Keywords
- `n_features_list`: feature counts to evaluate.

# Returns
- A `DataFrame` with columns `n_features`, `r2_full`, `adj_r2_full` and
  `r2_out_of_sample`.
"""
function overfitting_analysis(
    X_data=X,
    y_data=y;
    n_features_list=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
)
    println("=== OVERFITTING ANALYSIS ===\n")

    # Storage for results
    results = DataFrame(
        n_features = Int[],
        r2_full = Float64[],
        adj_r2_full = Float64[],
        r2_out_of_sample = Float64[]
    )

    println("Analyzing overfitting for different numbers of features...")
    println("Features | R² (full) | Adj R² (full) | R² (out-of-sample)")
    println("-" ^ 60)

    for n_feat in n_features_list
        try
            X_poly = create_polynomial_features(X_data, n_feat)

            # Same seed for every model size, so all sizes share one 75%/25% split.
            X_train, X_test, y_train, y_test = train_test_split(
                X_poly, y_data, test_size=0.25, random_state=42
            )

            # In-sample fit on all observations.
            beta_full = _fit_no_intercept(X_poly, y_data)
            r2_full = r2_score(y_data, X_poly * beta_full)
            adj_r2_full = calculate_adjusted_r2(r2_full, length(y_data), n_feat)

            # Out-of-sample fit: estimate on the training part, score on the held-out part.
            beta_train = _fit_no_intercept(X_train, y_train)
            r2_out_of_sample = r2_score(y_test, X_test * beta_train)

            push!(results, (n_feat, r2_full, adj_r2_full, r2_out_of_sample))

            @printf("%8d | %9.4f | %12.4f | %17.4f\n",
                    n_feat, r2_full, adj_r2_full, r2_out_of_sample)

        catch e
            # A user interrupt must stop the run instead of being recorded as a failed fit.
            e isa InterruptException && rethrow()
            println("Error with $n_feat features: $e")
            # Still append to maintain consistency
            push!(results, (n_feat, NaN, NaN, NaN))
        end
    end

    println()
    return results
end

# Run the analysis. `results_df` holds one row per tested feature count and is
# reused by the plotting, summary and saving cells further down.
results_df = overfitting_analysis()

## Visualization

Create three separate graphs, one for each R-squared measure, as requested.

In [None]:
# Line plot of one metric column against `n_features`, skipping rows where that metric is NaN or Inf.
function _plot_r2_metric(df, metric; style...)
    keep = isfinite.(df[!, metric])
    return plot(df.n_features[keep], df[keep, metric];
                linewidth=2, markersize=6, xscale=:log10, grid=true,
                titlefontsize=12, labelfontsize=10, legend=false, style...)
end

"""
    create_separate_plots(df_results)

Create and display three separate plots, one per R-squared measure, against the
number of features (log-scaled x axis).

Rows containing `missing` are dropped for all plots. Non-finite values (`NaN`,
`Inf`) are dropped separately for each measure, so a feature count where only
one measure is undefined still appears in the other two plots.

# Arguments
- `df_results`: results table with columns `n_features`, `r2_full`,
  `adj_r2_full` and `r2_out_of_sample`.

# Returns
- `(p1, p2, p3)`: the plots for full-sample R², adjusted R² and out-of-sample R².
"""
function create_separate_plots(df_results)
    println("Creating plots...")

    # `completecases` only removes rows with `missing`; it does not detect NaN.
    # NaN/Inf filtering happens per metric inside `_plot_r2_metric`.
    df_clean = df_results[completecases(df_results), :]

    # Plot 1: R-squared (full sample)
    p1 = _plot_r2_metric(df_clean, :r2_full;
                         marker=:circle, color=:blue,
                         title="R-squared on Full Sample vs Number of Features",
                         xlabel="Number of Features", ylabel="R-squared",
                         ylims=(0, 1))
    display(p1)

    # Plot 2: Adjusted R-squared (full sample)
    p2 = _plot_r2_metric(df_clean, :adj_r2_full;
                         marker=:square, color=:green,
                         title="Adjusted R-squared on Full Sample vs Number of Features",
                         xlabel="Number of Features", ylabel="Adjusted R-squared")
    display(p2)

    # Plot 3: Out-of-sample R-squared
    p3 = _plot_r2_metric(df_clean, :r2_out_of_sample;
                         marker=:utriangle, color=:red,
                         title="Out-of-Sample R-squared vs Number of Features",
                         xlabel="Number of Features", ylabel="Out-of-Sample R-squared")
    display(p3)

    println("Plots created successfully!")

    return p1, p2, p3
end

# Render the three figures and keep their handles for later use.
p1, p2, p3 = create_separate_plots(results_df)

println("\nThree separate plots created showing:")
# One caption per figure, in display order.
for caption in (
    "1. R² (Full Sample): Shows monotonic increase",
    "2. Adjusted R² (Full Sample): Shows peak and decline due to complexity penalty",
    "3. R² (Out-of-Sample): Shows the classic overfitting pattern",
)
    println(caption)
end

## Results Summary

In [None]:
# Display complete results
println("\n=== COMPLETE RESULTS TABLE ===")
println(results_df)

# Find optimal complexity.
# `completecases` only drops rows containing `missing`, and `argmax` ranks NaN
# above every number. Adjusted R² is NaN whenever n - k - 1 <= 0, so taking
# `argmax` over the raw column would report that undefined row as the optimum.
# Each measure is therefore maximised over its finite entries only.
valid_results = results_df[completecases(results_df), :]
if nrow(valid_results) > 0
    println("\n=== OPTIMAL MODEL COMPLEXITY ===")

    adj_candidates = findall(isfinite, valid_results.adj_r2_full)
    if isempty(adj_candidates)
        println("By Adjusted R²: not available (no finite values)")
    else
        # Map the position within the finite subset back to a row of `valid_results`.
        optimal_adj_r2_idx =
            adj_candidates[argmax(valid_results.adj_r2_full[adj_candidates])]
        @printf("By Adjusted R²: %d features\n", valid_results.n_features[optimal_adj_r2_idx])
    end

    oos_candidates = findall(isfinite, valid_results.r2_out_of_sample)
    if isempty(oos_candidates)
        println("By Out-of-Sample R²: not available (no finite values)")
    else
        optimal_oos_r2_idx =
            oos_candidates[argmax(valid_results.r2_out_of_sample[oos_candidates])]
        @printf("By Out-of-Sample R²: %d features\n", valid_results.n_features[optimal_oos_r2_idx])
    end
end

# NOTE(review): the statements below are fixed text and are not derived from
# `results_df`; check them against the table and plots above.
println("\n=== INSIGHTS ===")
println("✅ This analysis demonstrates the classic bias-variance tradeoff")
println("📈 R² (Full Sample) increases monotonically with model complexity")
println("📊 Adjusted R² peaks early and then declines due to complexity penalty")
println("📉 Out-of-Sample R² shows the inverted U-shape characteristic of overfitting")
println("🎯 True model has only 1 feature (y = 2*X + e), but polynomial terms can help initially")
println("⚠️ High-dimensional models (many features) lead to severe overfitting")

## Save Results

In [None]:
# Make sure the destination folder exists before writing into it.
output_dir = "../output"
mkpath(output_dir)

# Persist the results table as CSV and report where it went.
CSV.write(joinpath(output_dir, "overfitting_results_julia.csv"), results_df)
println("Results saved to $(output_dir)/overfitting_results_julia.csv")

println("\n🎉 Julia overfitting analysis complete!")
println("Data generation follows simulation.ipynb approach with:")
# Recap of the data-generating setup, one bullet per line.
for detail in (
    "- X ~ Uniform(0,1), sorted, n=1000",
    "- e ~ Normal(0,1)",
    "- y = 2*X + e (convenient slope = 2.0)",
    "- No intercept (as requested)",
    "- Seed = 42 for reproducibility",
)
    println(detail)
end