# Assignment 1 - Part 2: Overfitting Analysis
## Overfitting (8 points)

This notebook analyzes overfitting using a procedure similar to simulation.ipynb. We use a simple data generating process and study how R-squared measures change with model complexity.

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
# Suppress all warnings to keep the notebook output uncluttered.
# NOTE(review): this is a blanket filter — it may also hide useful numerical
# warnings raised by the libraries used below; confirm that is intended.
warnings.filterwarnings('ignore')

# Set style for plots: matplotlib's default style with seaborn's "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

## Data Generation

Following the simulation.ipynb approach, we generate data from a simple data generating process (DGP) with a convenient slope, so that the same specification can be reproduced in all three languages.

In [None]:
def generate_data(n=1000, seed=42, beta=2.0, noise_sd=1.0):
    """
    Simulate a univariate linear model with zero intercept: y = beta * X + e.

    X is drawn from Uniform(0, 1) and sorted in ascending order; the error
    term e is drawn from Normal(0, noise_sd). With the default arguments the
    returned arrays are the same as those produced by seeding the global
    NumPy generator with `seed`, but the global generator is left untouched.

    Parameters:
    -----------
    n : int
        Sample size (default: 1000)
    seed : int
        Random seed for reproducibility (default: 42)
    beta : float
        True slope of the relationship (default: 2.0)
    noise_sd : float
        Standard deviation of the error term (default: 1.0)

    Returns:
    --------
    X : numpy.ndarray
        Feature matrix (n x 1), sorted in ascending order
    y : numpy.ndarray
        Target variable (n,)
    """
    # Use a private generator instead of np.random.seed(): same legacy stream
    # for a given seed, but no hidden reseeding of the process-wide RNG.
    rng = np.random.RandomState(seed)

    # Draw the regressor first, then the noise, so the stream order matches
    # the original seed-then-draw sequence.
    x_sorted = rng.uniform(0, 1, n)
    x_sorted.sort()
    X = x_sorted.reshape(-1, 1)

    # Error term
    e = rng.normal(0, noise_sd, n)

    # No intercept: the target is the scaled regressor plus noise
    y = beta * X.ravel() + e

    return X, y

# Draw the simulated sample that the rest of the notebook works with.
X, y = generate_data(n=1000, seed=42)

# Report the sample size, the true model and the observed ranges of X and y.
print(
    f"Generated data with n={len(y)} observations",
    f"True relationship: y = 2*X + e (convenient slope = 2.0)",
    f"X range: [{X.min():.4f}, {X.max():.4f}]",
    f"y range: [{y.min():.4f}, {y.max():.4f}]",
    sep="\n",
)

## Helper Functions

In [None]:
def create_polynomial_features(X, n_features):
    """
    Expand a single regressor into its first `n_features` integer powers.

    Column j (0-based) of the result holds the flattened X raised to the
    power j + 1, i.e. x, x^2, ..., x^n_features.

    Parameters:
    -----------
    X : numpy.ndarray
        Original feature matrix (n x 1)
    n_features : int
        Number of polynomial columns to build

    Returns:
    --------
    X_poly : numpy.ndarray
        Matrix of shape (n, n_features) with the polynomial terms
    """
    base = X.ravel()
    X_poly = np.zeros((X.shape[0], n_features))

    # Fill one column per exponent, starting at the first power.
    for column, exponent in enumerate(range(1, n_features + 1)):
        X_poly[:, column] = base ** exponent

    return X_poly

def calculate_adjusted_r2(r2, n, k):
    """
    Penalise R-squared for the number of regressors.

    Adjusted R² = 1 - [(1 - R²)(n - 1) / (n - k - 1)]

    Parameters:
    -----------
    r2 : float
        R-squared value
    n : int
        Sample size
    k : int
        Number of features (excluding intercept)

    Returns:
    --------
    adj_r2 : float
        Adjusted R-squared, or NaN when n - k - 1 is not positive
    """
    residual_dof = n - k - 1

    # With no residual degrees of freedom the correction is undefined.
    return np.nan if residual_dof <= 0 else 1 - ((1 - r2) * (n - 1) / residual_dof)

# Quick smoke check of both helpers on the generated sample.
X_poly_example = create_polynomial_features(X, 5)
print(
    f"Original X shape: {X.shape}",
    f"Polynomial features (5 features) shape: {X_poly_example.shape}",
    f"Example adjusted R²: {calculate_adjusted_r2(0.8, 1000, 5):.4f}",
    sep="\n",
)

## Overfitting Analysis

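We fit models with an increasing number of polynomial features (x, x², …, xᵏ) for k = 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000. Note that for k = 1000 = n the adjusted R² is undefined (n − k − 1 ≤ 0) and is reported as NaN.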

In [None]:
def overfitting_analysis(features=None, target=None, n_features_list=None,
                         test_size=0.25, random_state=42):
    """
    Fit no-intercept polynomial regressions of increasing size and collect
    three R-squared measures for each one.

    For every entry of `n_features_list` the function builds the polynomial
    design matrix, fits an OLS model without intercept on the full sample
    (R² and adjusted R²) and on the training split (R² evaluated on the
    held-out split). One line per model is printed as the loop runs.

    Parameters:
    -----------
    features : numpy.ndarray, optional
        Original feature matrix (n x 1). Defaults to the notebook-level `X`.
    target : array-like, optional
        Target variable (n,). Defaults to the notebook-level `y`.
    n_features_list : sequence of int, optional
        Numbers of polynomial features to test.
        Defaults to [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000].
    test_size : float
        Share of observations held out for the out-of-sample R² (default: 0.25)
    random_state : int
        Seed of the train/test split (default: 42)

    Returns:
    --------
    pandas.DataFrame
        One row per tested model with columns 'n_features', 'r2_full',
        'adj_r2_full' and 'r2_out_of_sample'. A model whose fit raises an
        exception is kept as a row of NaNs.
    """
    # Fall back to the notebook-level sample so the zero-argument call keeps working.
    if features is None:
        features = X
    if target is None:
        target = y
    if n_features_list is None:
        n_features_list = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]

    target = np.asarray(target)
    n_obs = len(target)

    # The split depends only on the number of rows and the seed, so draw the
    # row indices once (75%/25% by default) and reuse them for every model
    # instead of re-splitting inside the loop.
    train_idx, test_idx = train_test_split(
        np.arange(n_obs), test_size=test_size, random_state=random_state
    )
    y_train, y_test = target[train_idx], target[test_idx]

    # Storage for results
    results = []

    print("Analyzing overfitting for different numbers of features...")
    print("Features | R² (full) | Adj R² (full) | R² (out-of-sample)")
    print("-" * 60)

    for n_feat in n_features_list:
        # Start from an all-NaN row so a failed fit still shows up in the table.
        row = {
            'n_features': n_feat,
            'r2_full': np.nan,
            'adj_r2_full': np.nan,
            'r2_out_of_sample': np.nan
        }

        try:
            # Create polynomial features
            X_poly = create_polynomial_features(features, n_feat)

            # Fit model on full sample (no intercept as requested)
            model_full = LinearRegression(fit_intercept=False)
            model_full.fit(X_poly, target)
            r2_full = r2_score(target, model_full.predict(X_poly))

            # NOTE(review): the n - k - 1 correction assumes an intercept,
            # while the models here are fitted without one — confirm this is
            # the intended convention.
            adj_r2_full = calculate_adjusted_r2(r2_full, n_obs, n_feat)

            # Fit model on training data and predict on test data
            model_train = LinearRegression(fit_intercept=False)
            model_train.fit(X_poly[train_idx], y_train)
            r2_out_of_sample = r2_score(y_test, model_train.predict(X_poly[test_idx]))

            row.update({
                'r2_full': r2_full,
                'adj_r2_full': adj_r2_full,
                'r2_out_of_sample': r2_out_of_sample
            })

            # Column widths match the header printed above.
            print(f"{n_feat:8d} | {r2_full:9.4f} | {adj_r2_full:13.4f} | {r2_out_of_sample:18.4f}")

        except Exception as e:
            # Deliberate best-effort: report the failure and keep the NaN row
            # so later cells still see every requested model size.
            print(f"Error with {n_feat} features: {str(e)}")

        results.append(row)

    return pd.DataFrame(results)

# Run the analysis on the notebook-level sample (X, y); the returned DataFrame
# holds one row of R² measures per tested number of features.
results_df = overfitting_analysis()

## Visualization

Create three separate graphs, one for each R-squared measure, as requested.

In [None]:
def create_separate_plots(df_results):
    """
    Draw one panel per R-squared measure against the number of features.

    The three panels (full-sample R², full-sample adjusted R², out-of-sample
    R²) share a logarithmic x-axis. Missing values are removed separately
    for each measure, so a model whose adjusted R² is undefined (NaN) still
    appears in the other two panels.

    Parameters:
    -----------
    df_results : pandas.DataFrame
        Table with columns 'n_features', 'r2_full', 'adj_r2_full' and
        'r2_out_of_sample'.

    Returns:
    --------
    fig : matplotlib.figure.Figure
        The figure holding the three panels.
    """
    # (column, title, y-label, marker, colour) for each panel, left to right.
    panels = [
        ('r2_full',
         'R-squared on Full Sample vs Number of Features',
         'R-squared', 'o', 'blue'),
        ('adj_r2_full',
         'Adjusted R-squared on Full Sample vs Number of Features',
         'Adjusted R-squared', 's', 'green'),
        ('r2_out_of_sample',
         'Out-of-Sample R-squared vs Number of Features',
         'Out-of-Sample R-squared', '^', 'red'),
    ]

    # Create figure with subplots
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for ax, (column, title, ylabel, marker, color) in zip(axes, panels):
        # Drop NaNs for this measure only; a row-wise dropna() would remove a
        # model from every panel as soon as one of its measures is missing.
        panel_data = df_results.dropna(subset=[column])

        ax.plot(panel_data['n_features'], panel_data[column],
                marker=marker, linewidth=2, markersize=6, color=color)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('Number of Features')
        ax.set_ylabel(ylabel)
        ax.set_xscale('log')
        ax.grid(True, alpha=0.3)

    # Only the plain full-sample R² panel is pinned to the [0, 1] range.
    axes[0].set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

    return fig

# Render the three panels and keep a handle on the figure.
fig = create_separate_plots(results_df)

# Fixed captions for the panels; this text is static and not derived from results_df.
print(
    "\nThree separate plots created showing:",
    "1. R² (Full Sample): Shows monotonic increase",
    "2. Adjusted R² (Full Sample): Shows peak and decline due to complexity penalty",
    "3. R² (Out-of-Sample): Shows the classic overfitting pattern",
    sep="\n",
)

## Results Summary

In [None]:
# Display complete results
print("\n=== COMPLETE RESULTS TABLE ===")
print(results_df.to_string(index=False, float_format='%.4f'))

# Find optimal complexity.
# Keep every model that has at least one of the two selection criteria and
# rank each criterion over the rows where it is defined. A row-wise dropna()
# would exclude a model from the out-of-sample ranking merely because its
# adjusted R² is undefined (NaN).
metric_columns = ['adj_r2_full', 'r2_out_of_sample']
valid_results = results_df.dropna(subset=metric_columns, how='all')
if not valid_results.empty:
    print("\n=== OPTIMAL MODEL COMPLEXITY ===")

    # idxmax() skips NaN entries; the guards avoid calling it on an all-NaN column.
    if valid_results['adj_r2_full'].notna().any():
        optimal_adj_r2_idx = valid_results['adj_r2_full'].idxmax()
        print(f"By Adjusted R²: {valid_results.loc[optimal_adj_r2_idx, 'n_features']} features")

    if valid_results['r2_out_of_sample'].notna().any():
        optimal_oos_r2_idx = valid_results['r2_out_of_sample'].idxmax()
        print(f"By Out-of-Sample R²: {valid_results.loc[optimal_oos_r2_idx, 'n_features']} features")

# Static interpretation text; these lines are not computed from results_df.
print("\n=== INSIGHTS ===")
print("✅ This analysis demonstrates the classic bias-variance tradeoff")
print("📈 R² (Full Sample) increases monotonically with model complexity")
print("📊 Adjusted R² peaks early and then declines due to complexity penalty")
print("📉 Out-of-Sample R² shows the inverted U-shape characteristic of overfitting")
print("🎯 True model has only 1 feature (y = 2*X + e), but polynomial terms can help initially")
print("⚠️ High-dimensional models (many features) lead to severe overfitting")

## Save Results

In [None]:
# Create output directory (no error if it already exists)
output_dir = '../output'
os.makedirs(output_dir, exist_ok=True)

# Save results; the path is built once so the file written and the message
# printed cannot drift apart.
results_path = f'{output_dir}/overfitting_results_python.csv'
results_df.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

# Static recap of the data generating process used in this notebook.
print("\n🎉 Python overfitting analysis complete!")
print("Data generation follows simulation.ipynb approach with:")
print("- X ~ Uniform(0,1), sorted, n=1000")
print("- e ~ Normal(0,1)")
print("- y = 2*X + e (convenient slope = 2.0)")
print("- No intercept (as requested)")
print("- Seed = 42 for reproducibility")