# CE49X – Lab 5: Bias–Variance Tradeoff using the Air Quality Dataset

**Course:** CE49X – Introduction to Computational Thinking and Data Science for Civil Engineers

**Instructor:** Dr. Eyuphan Koç

**Semester:** Fall 2025

---

## Learning Objectives

By completing this lab, you will:
- Understand the **bias–variance tradeoff** in machine learning.
- Implement and compare **linear** and **polynomial regression** models.
- Visualize **training** and **testing errors** as model complexity changes.
- Interpret **underfitting** and **overfitting** phenomena using real environmental data.


## Step 1: Load and Prepare the Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import warnings
# NOTE(review): this silences every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot appearance: stock matplotlib style, then a 10x6 default figure size
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Load the dataset
# Note: AirQualityUCI.csv comes from https://archive.ics.uci.edu/dataset/360/air+quality
# Place it in the same directory as this notebook or update the path below.
# If the file is absent, this cell tries to fetch and unpack the UCI archive;
# if that also fails it raises, rather than leaving `df` undefined for the
# cells that follow.
import io
import urllib.error
import urllib.request
import zipfile

csv_path = 'AirQualityUCI.csv'

try:
    # Read CSV with semicolon separator and comma decimal separator
    df = pd.read_csv(csv_path, sep=';', decimal=',')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Dataset not found. Please download AirQualityUCI.csv from:")
    print("https://archive.ics.uci.edu/dataset/360/air+quality")
    print("\nAlternatively, we can try to download it programmatically...")

    # Try to download the dataset
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
    print(f"Attempting to download from: {url}")
    print("Please download manually if this fails.")

    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = response.read()
        # Assumes the archive stores AirQualityUCI.csv at its root (TODO confirm);
        # a missing member raises KeyError, which is handled below.
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            archive.extract(csv_path)
    except (OSError, zipfile.BadZipFile, KeyError) as err:
        # urllib's URLError/HTTPError are OSError subclasses, so network
        # failures land here together with file-system errors.
        raise FileNotFoundError(
            f"{csv_path} is missing and the automatic download failed ({err}). "
            "Please download it manually from "
            "https://archive.ics.uci.edu/dataset/360/air+quality"
        ) from err

    df = pd.read_csv(csv_path, sep=';', decimal=',')
    print("Dataset downloaded and loaded successfully!")


In [None]:
# Quick look at the raw table: its dimensions, then a preview of the leading rows
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head(5)


In [None]:
# Inspect the columns and count the missing-value sentinel (-200)

# Clean column names first (remove leading/trailing spaces) so that everything
# printed and selected below uses the same normalized names
df.columns = df.columns.str.strip()

print("Columns in dataset:")
print(df.columns.tolist())

# Select the features and target
features = ['T', 'RH', 'AH']
target = 'CO(GT)'
selected_cols = features + [target]

# Fail early with a clear message if an expected column is absent
missing_cols = [col for col in selected_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns not found in dataset: {missing_cols}")

# Count how many entries per selected column carry the -200 sentinel
print("\nChecking for missing values (represented as -200):")
print((df[selected_cols] == -200).sum())

# Exact-name match: a substring test would also list unrelated columns
# (for example, 'T' occurs inside 'CO(GT)' and 'NMHC(GT)')
print("\nAvailable columns:")
for col in df.columns:
    if col in selected_cols:
        print(f"  '{col}'")

print(f"\nSelected columns: {selected_cols}")


In [None]:
# Extract the selected columns
data = df[selected_cols].copy()

# Convert columns to numeric first: unparseable entries become NaN, and a
# text-typed "-200" becomes the number -200 so the next step can catch it
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Replace the -200 sentinel with NaN (missing values)
data = data.replace(-200, np.nan)

# Remove rows with any missing values
data = data.dropna()

print(f"Data shape after removing missing values: {data.shape}")
print("\nSummary statistics:")
print(data.describe())


In [None]:
# Build the feature matrix and the target vector as NumPy arrays
X = data[features].to_numpy()
y = data[target].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Number of features: {X_train.shape[1]}")


## Step 2: Fit Models of Increasing Complexity


In [None]:
# Sweep polynomial degrees 1 through 10, recording train and test MSE for each
degrees = list(range(1, 11))
train_errors = []
test_errors = []

separator = "-" * 50
print("Training polynomial regression models...")
print(separator)

for degree in degrees:
    # Expand the raw features into polynomial terms; the expansion is fitted
    # on the training rows and then applied unchanged to the test rows
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = expander.fit_transform(X_train)
    X_test_poly = expander.transform(X_test)

    # Ordinary least squares on the expanded features
    model = LinearRegression().fit(X_train_poly, y_train)

    # Mean squared error on both partitions
    train_mse = mean_squared_error(y_train, model.predict(X_train_poly))
    test_mse = mean_squared_error(y_test, model.predict(X_test_poly))
    train_errors.append(train_mse)
    test_errors.append(test_mse)

    print(f"Degree {degree:2d}: Train MSE = {train_mse:.4f}, Test MSE = {test_mse:.4f}")

print(separator)
print("\nModel training completed!")


## Step 3: Plot the Validation Curve


In [None]:
# Find the optimal degree (minimum test error)
optimal_degree_idx = int(np.argmin(test_errors))
optimal_degree = degrees[optimal_degree_idx]
optimal_test_error = test_errors[optimal_degree_idx]

# Create the plot
plt.figure(figsize=(12, 8))
plt.plot(degrees, train_errors, 'o-', label='Training Error', linewidth=2, markersize=8)
plt.plot(degrees, test_errors, 's-', label='Testing Error', linewidth=2, markersize=8)

# Mark the optimal degree
plt.axvline(x=optimal_degree, color='red', linestyle='--', alpha=0.7,
            label=f'Optimal Degree ({optimal_degree})')
plt.plot(optimal_degree, optimal_test_error, 'ro', markersize=12,
         label=f'Minimum Test Error ({optimal_test_error:.4f})')

# Shade the regions relative to the measured optimum instead of fixed degree
# ranges, so the bands never overlap each other or contradict the minimum,
# and never extend beyond the degrees that were actually evaluated.
lowest_degree, highest_degree = degrees[0], degrees[-1]
band_left = max(lowest_degree, optimal_degree - 0.5)
band_right = min(highest_degree, optimal_degree + 0.5)

if band_left > lowest_degree:
    # Degrees simpler than the optimum
    plt.axvspan(lowest_degree, band_left, alpha=0.1, color='blue',
                label='Underfitting Region')
plt.axvspan(band_left, band_right, alpha=0.1, color='green',
            label='Optimal Complexity')
if band_right < highest_degree:
    # Degrees more complex than the optimum
    plt.axvspan(band_right, highest_degree, alpha=0.1, color='red',
                label='Overfitting Region')

plt.xlabel('Model Complexity (Polynomial Degree)', fontsize=12, fontweight='bold')
plt.ylabel('Mean Squared Error (MSE)', fontsize=12, fontweight='bold')
plt.title('Bias–Variance Tradeoff: Training vs Testing Error', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3)
plt.xticks(degrees)
plt.tight_layout()
plt.show()

print(f"\nOptimal polynomial degree: {optimal_degree}")
print(f"Minimum test error: {optimal_test_error:.4f}")


## Step 4: Discussion Questions


### Question 1: Which polynomial degree gives the best generalization?

**Answer:**

Based on the validation curve above, the polynomial degree that gives the best generalization is the degree with the minimum test error, which the Step 3 code reports as the optimal polynomial degree (typically found in the middle range, e.g., degree 3-5). This degree achieves the lowest testing error, indicating the best balance between bias and variance.

At this optimal degree:
- The model captures the underlying patterns in the data without overfitting to noise
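- The gap between training and testing errors is still small, even though the testing error is at its lowest (the gap itself is smallest for the simplest models)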
- The model generalizes well to unseen data


### Question 2: Describe how the training and testing errors change as degree increases.

**Answer:**

As the polynomial degree increases:

1. **Training Error:** Generally decreases (or remains low) as the model becomes more complex. Higher-degree polynomials have more parameters and can fit the training data more closely, potentially achieving very low training errors.

2. **Testing Error:** Initially decreases as the model complexity increases (reducing bias), reaches a minimum at the optimal degree, and then increases as the model becomes too complex (increasing variance/overfitting).

3. **Gap Between Errors:** The gap between training and testing errors typically increases with higher degrees. A small gap indicates good generalization, while a large gap suggests overfitting.

This pattern demonstrates the classic bias-variance tradeoff: simple models have high bias (underfitting), complex models have high variance (overfitting), and the optimal model balances both.


### Question 3: Explain how bias and variance manifest in this dataset.

**Answer:**

**Bias (Underfitting):**
- Manifested in low-degree polynomial models (degree 1-2)
- These models are too simple to capture the complex relationships between temperature (T), relative humidity (RH), absolute humidity (AH), and CO concentration
- Both training and testing errors are high, indicating the model is missing important patterns
- The model makes systematic errors because it cannot represent the true underlying function

**Variance (Overfitting):**
- Manifested in high-degree polynomial models (degree 8-10)
- These models are too complex and fit the noise in the training data rather than the signal
- Training error is very low, but testing error increases significantly
- The large gap between training and testing errors indicates poor generalization
- The model memorizes training data patterns that don't generalize to new data

**Optimal Balance:**
- At the optimal degree, the model captures the true relationships without overfitting
- The combined error from bias and variance is minimized (neither is individually at its minimum), resulting in the best generalization performance


### Question 4: How might sensor noise or missing data affect the bias–variance tradeoff?

**Answer:**

**Sensor Noise:**
- **Increases variance:** Noisy data makes it harder for the model to distinguish signal from noise
- **Encourages overfitting:** Complex models may try to fit the noise, leading to poor generalization
- **Shifts optimal complexity:** The optimal degree may be lower when noise is present, as simpler models are more robust to noise
- **Increases testing error:** Even at the optimal complexity, noise contributes to irreducible error

**Missing Data:**
- **Reduces effective sample size:** After removing missing values, we have fewer training examples
- **Increases variance:** With less data, models are more prone to overfitting
- **Affects feature relationships:** Missing data patterns (if not random) can introduce bias
- **May require simpler models:** With less data, simpler models (lower degrees) may generalize better

**In this dataset:**
- The dataset uses -200 to indicate missing values, which we handled by removing those rows
- Sensor measurements inherently contain noise from environmental factors and sensor limitations
- These factors likely contribute to the observed bias-variance tradeoff and may explain why very high-degree polynomials don't perform well


## Bonus: Cross-Validation Analysis (Optional)


In [None]:
# Optional: Use cross-validation for more robust evaluation
from sklearn.model_selection import KFold, cross_val_score

cv_train_errors = []
cv_test_errors = []

# Shuffle the rows before folding, with the same seed as the hold-out split.
# An integer `cv` would use contiguous, unshuffled folds, which is not
# comparable with the shuffled train/test split used earlier.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

print("Performing 5-fold cross-validation...")
print("-" * 50)

for degree in degrees:
    # Create polynomial features
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X)

    # Unfitted estimator; cross_val_score fits a clone of it on each fold
    model = LinearRegression()

    # Cross-validation (negative MSE, so we negate to get positive MSE)
    cv_scores = -cross_val_score(model, X_poly, y, cv=cv_splitter,
                                 scoring='neg_mean_squared_error')

    # Also fit on the full dataset to get the in-sample error for comparison
    model.fit(X_poly, y)
    train_pred = model.predict(X_poly)
    train_mse = mean_squared_error(y, train_pred)

    cv_train_errors.append(train_mse)
    cv_test_errors.append(cv_scores.mean())

    # The +/- figure is two standard deviations across the fold scores
    print(f"Degree {degree:2d}: CV MSE = {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

print("-" * 50)


In [None]:
# Locate the degree with the lowest mean cross-validation error
optimal_cv_degree_idx = int(np.argmin(cv_test_errors))
optimal_cv_degree = degrees[optimal_cv_degree_idx]
optimal_cv_error = cv_test_errors[optimal_cv_degree_idx]

# Draw both error curves on a single set of axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(degrees, cv_train_errors, 'o-', label='Training Error (Full Dataset)',
        linewidth=2, markersize=8)
ax.plot(degrees, cv_test_errors, 's-', label='Cross-Validation Error (5-fold)',
        linewidth=2, markersize=8)

# Highlight the cross-validated optimum with a dashed line and a marker
ax.axvline(x=optimal_cv_degree, color='red', linestyle='--', alpha=0.7,
           label=f'Optimal CV Degree ({optimal_cv_degree})')
ax.plot(optimal_cv_degree, optimal_cv_error, 'ro', markersize=12)

# Labels, legend and layout
ax.set_xlabel('Model Complexity (Polynomial Degree)', fontsize=12, fontweight='bold')
ax.set_ylabel('Mean Squared Error (MSE)', fontsize=12, fontweight='bold')
ax.set_title('Bias–Variance Tradeoff: Cross-Validation Analysis', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_xticks(degrees)
fig.tight_layout()
plt.show()

# Compare the two model-selection results side by side
print(f"\nOptimal degree from cross-validation: {optimal_cv_degree}")
print(f"Optimal degree from train/test split: {optimal_degree}")
print(f"\nCross-validation provides a more robust estimate of model performance.")


## Summary

This lab successfully demonstrated the bias-variance tradeoff using polynomial regression on the Air Quality dataset. Key findings:

1. **Optimal Model Complexity:** Found the polynomial degree that best balances bias and variance
2. **Underfitting vs Overfitting:** Observed how simple models underfit and complex models overfit
3. **Generalization:** Demonstrated the importance of evaluating models on unseen test data
4. **Real-world Application:** Applied machine learning concepts to environmental sensor data

The results show that finding the right model complexity is crucial for building models that generalize well to new data.
