# Lab 3: Simulated Annealing for House Price Prediction

## Task
Load the "house_prices" dataset from OpenML, apply simulated annealing, and evaluate the results.

### Learning Objectives
- Understand and implement the Simulated Annealing optimization algorithm
- Apply metaheuristic optimization to a real-world regression problem
- Evaluate the effectiveness of nature-inspired algorithms for parameter optimization

## 1. Setup and Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Seed both random sources this notebook draws from (the standard-library
# `random` module and NumPy's global generator) so reruns are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print("All libraries imported successfully!")

## 2. Load Data

We use `sklearn.datasets.fetch_openml`, which is more stable than the direct `openml` library (it avoids minio dependency conflicts).

In [None]:
# Fetch the dataset from OpenML as a pandas DataFrame.
print("Loading house_prices dataset from OpenML...")
house_prices = fetch_openml(
    name='house_prices',
    version=1,
    as_frame=True,
    parser='auto',
)
# The full table; the 'SalePrice' column is summarized below.
house_data = house_prices.frame

print(f"Dataset loaded! Shape: {house_data.shape}")
print("\nTarget statistics:")
sale_price_summary = house_data['SalePrice'].describe()
print(sale_price_summary)

# Last expression of the cell: the notebook renders the first rows.
house_data.head()

## 3. Data Preprocessing

Handle missing values, encode categorical features, and scale numerical features.

In [None]:
# Separate the regression target from the predictor columns.
target_column = 'SalePrice'
y = house_data[target_column].astype(float)
X = house_data.drop(columns=[target_column])

# Partition the predictors by dtype: numeric vs. object/category columns.
numerical_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(
    X.select_dtypes(include=['object', 'category']).columns
)

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Numeric columns: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode to a dense array; categories not seen during fit are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# NOTE(review): this fits on every row of X, so the learned medians, scaling
# statistics and categories include rows that a later train/test split
# would hold out.
X_preprocessed = preprocessor.fit_transform(X)
print(f"Preprocessed data shape: {X_preprocessed.shape}")

In [None]:
# Split the raw feature frame first, then fit the preprocessing on the
# training rows only. Fitting the imputers, scaler and one-hot encoder on all
# rows before splitting would let test-set statistics leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Re-fit `preprocessor` on the training rows; the test rows are only
# transformed with what was learned from the training rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
print(f"Training: {X_train.shape[0]} samples, Test: {X_test.shape[0]} samples")

## 4. Objective Function

The objective function calculates the Mean Squared Error (MSE) of a linear model for a given parameter vector (the intercept followed by the feature coefficients).

In [None]:
def objective_function(params, X, y):
    """Return the mean squared error of a linear model on ``(X, y)``.

    Args:
        params: Parameter vector; element 0 is the intercept and the
            remaining elements are the per-feature coefficients.
        X: Feature matrix, used as ``X @ params[1:]``.
        y: Target values, compared element-wise with the predictions.

    Returns:
        The mean of the squared residuals ``(y - prediction) ** 2``.
    """
    intercept, coefficients = params[0], params[1:]
    residuals = y - (X @ coefficients + intercept)
    return np.mean(residuals ** 2)

# Baseline: training MSE of the all-zero parameter vector
# (one intercept plus one coefficient per feature column).
n_features = X_train.shape[1]
zero_params = np.zeros(n_features + 1)
initial_mse = objective_function(zero_params, X_train, y_train.values)
print(f"Initial MSE with zero parameters: {initial_mse:,.2f}")

## 5. Simulated Annealing Implementation

Simulated Annealing is a probabilistic optimization technique inspired by the annealing process in metallurgy:
1. Start with an initial solution and a high temperature
2. Generate a neighbor solution by applying a small random perturbation
3. Always accept a better solution; accept a worse one with probability $\exp(-\Delta / T)$, where $\Delta$ is the increase in cost and $T$ is the current temperature
4. Gradually lower the temperature
5. Repeat for a fixed number of iterations

In [None]:
def simulated_annealing(objective_func, X, y, n_params,
                        initial_temp=1000.0, cooling_rate=0.995,
                        num_iterations=5000, step_size=0.5,
                        *, verbose=True):
    """Minimize ``objective_func(params, X, y)`` with simulated annealing.

    The search starts from a small random vector, proposes Gaussian
    perturbations of the current solution, always accepts improvements and
    accepts a worse candidate with probability
    ``exp(-cost_diff / temperature)``. The temperature is multiplied by
    ``cooling_rate`` after every iteration. Because the acceptance rule
    divides the cost difference by the temperature, ``initial_temp`` should
    be chosen on the scale of typical cost differences.

    Randomness comes from both ``np.random`` (proposals) and the
    standard-library ``random`` module (acceptance draw); seed both for
    reproducible runs.

    Args:
        objective_func: Callable ``(params, X, y) -> cost`` to minimize.
        X: Passed through unchanged to ``objective_func``.
        y: Passed through unchanged to ``objective_func``.
        n_params: Length of the parameter vector.
        initial_temp: Starting temperature; must not be negative.
        cooling_rate: Factor applied to the temperature after each
            iteration; must not be negative.
        num_iterations: Number of candidate solutions to evaluate.
        step_size: Standard deviation of the Gaussian perturbation.
        verbose: When true (the default), print a banner and a progress
            line every 1000 iterations.

    Returns:
        Tuple ``(best_solution, best_cost, history)``. ``history[0]`` is the
        cost of the starting point and each later entry is the best cost
        seen so far after that iteration, so ``len(history)`` equals
        ``num_iterations + 1``.

    Raises:
        ValueError: If ``initial_temp`` or ``cooling_rate`` is negative.
    """
    # A negative temperature would turn the acceptance exponent positive,
    # giving "probabilities" above 1 or an OverflowError from math.exp.
    if initial_temp < 0:
        raise ValueError(f"initial_temp must be non-negative, got {initial_temp}")
    if cooling_rate < 0:
        raise ValueError(f"cooling_rate must be non-negative, got {cooling_rate}")

    current_solution = np.random.randn(n_params) * 0.01
    best_solution = current_solution.copy()
    current_cost = objective_func(current_solution, X, y)
    best_cost = current_cost
    temperature = initial_temp
    history = [current_cost]

    if verbose:
        print("Starting Simulated Annealing...")
        print("=" * 60)

    for i in range(num_iterations):
        neighbor_solution = current_solution + np.random.randn(n_params) * step_size
        neighbor_cost = objective_func(neighbor_solution, X, y)
        cost_diff = neighbor_cost - current_cost

        if cost_diff < 0:
            acceptance_prob = 1.0
        else:
            # The small epsilon keeps the division defined once the
            # temperature has decayed to zero.
            acceptance_prob = math.exp(-cost_diff / (temperature + 1e-10))

        # The uniform draw happens on every iteration, including guaranteed
        # acceptances, which keeps the random stream the same for a seed.
        if random.random() < acceptance_prob:
            current_solution = neighbor_solution
            current_cost = neighbor_cost

        if current_cost < best_cost:
            best_solution = current_solution.copy()
            best_cost = current_cost

        temperature *= cooling_rate
        history.append(best_cost)

        if verbose and (i + 1) % 1000 == 0:
            print(f"Iter {i+1:5d} | Best MSE: {best_cost:,.0f} | Temp: {temperature:.4f}")

    if verbose:
        print("=" * 60)
    return best_solution, best_cost, history

In [None]:
# Parameter vector = one intercept + one coefficient per feature column.
n_params = X_train.shape[1] + 1

# NOTE(review): worse moves are accepted with exp(-cost_diff / temperature)
# and the cost is an MSE in squared target units, so initial_temp=1000 may be
# far below typical cost differences (making the search effectively greedy)
# -- confirm against the printed MSE values.
sa_settings = {
    'initial_temp': 1000.0,
    'cooling_rate': 0.995,
    'num_iterations': 5000,
    'step_size': 0.5,
}
best_params, best_mse, cost_history = simulated_annealing(
    objective_function, X_train, y_train.values, n_params, **sa_settings
)

best_rmse = np.sqrt(best_mse)
print(f"\nBest Training MSE: {best_mse:,.2f}")
print(f"Best Training RMSE: ${best_rmse:,.2f}")

## 6. Visualization

In [None]:
# Plot the recorded cost history twice, side by side: once on a linear
# y-axis and once on a logarithmic y-axis.
plt.figure(figsize=(12, 4))

convergence_panels = (
    ('MSE', 'Simulated Annealing Convergence', None),
    ('MSE (Log Scale)', 'Convergence (Log Scale)', 'log'),
)
for panel_index, (y_label, panel_title, y_scale) in enumerate(convergence_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(cost_history, 'b-', alpha=0.7)
    plt.xlabel('Iteration')
    plt.ylabel(y_label)
    plt.title(panel_title)
    if y_scale is not None:
        plt.yscale(y_scale)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Relative drop from the first recorded cost to the last, in percent.
start_cost, final_cost = cost_history[0], cost_history[-1]
improvement = (start_cost - final_cost) / start_cost * 100
print(f"Improvement: {improvement:.1f}%")

## 7. Evaluation and Comparison

In [None]:
# Evaluate on test set
test_mse = objective_function(best_params, X_test, y_test.values)
test_rmse = np.sqrt(test_mse)

# Compare with sklearn LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_sklearn = lr_model.predict(X_test)
sklearn_mse = mean_squared_error(y_test, y_pred_sklearn)
sklearn_rmse = np.sqrt(sklearn_mse)

print("Results Comparison:")
print("="*55)
print(f"{'Method':<25} {'Test MSE':>14} {'Test RMSE':>14}")
print("-"*55)
print(f"{'Simulated Annealing':<25} {test_mse:>14,.0f} {test_rmse:>14,.2f}")
print(f"{'sklearn LinearRegression':<25} {sklearn_mse:>14,.0f} {sklearn_rmse:>14,.2f}")
print("="*55)

In [None]:
# Plot predictions
y_pred_sa = X_test @ best_params[1:] + best_params[0]

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred_sa, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Simulated Annealing')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred_sklearn, alpha=0.5, color='orange')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('sklearn Linear Regression')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Summary

### Key Findings
- Successfully implemented Simulated Annealing for optimizing linear regression coefficients
- The algorithm demonstrated convergence by reducing MSE over iterations
- Compared the results with sklearn's analytical solution (OLS)

### Observations
- SA is a metaheuristic that may not reach the exact optimum but can approach it
- The strength of SA lies in its ability to escape local minima and handle complex optimization landscapes
- For linear regression specifically, analytical methods like OLS are more efficient

### Potential Improvements
- Tune hyperparameters (temperature, cooling rate, iterations)
- Use more complex models beyond linear regression
- Apply feature selection to reduce dimensionality