In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load the California housing data: feature matrix X and target vector y.
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Candidate models. The linear model lives in a pipeline with StandardScaler,
# so scaling is part of the estimator that cross_val_score refits per fold.
lr = make_pipeline(StandardScaler(), LinearRegression())
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation for each model. The 'neg_mean_squared_error' scorer
# returns negated losses, so negate once more to get ordinary (positive) MSE.
lr_scores = -cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
rf_scores = -cross_val_score(
    rf, X, y, cv=5, scoring='neg_mean_squared_error'
)

# Summarise the per-fold MSE arrays as one row per model.
fold_scores = {
    'Linear Regression': lr_scores,
    'Random Forest Regressor': rf_scores,
}
results = pd.DataFrame({
    'Model': list(fold_scores),
    'Mean MSE': [s.mean() for s in fold_scores.values()],
    'Std Dev': [s.std() for s in fold_scores.values()],
})

print("Cross-Validation Comparison:")
print(results.to_string(index=False))
print("\nDiscussion:")

# Lower mean MSE wins; ties fall through to the Random Forest message.
lr_wins = lr_scores.mean() < rf_scores.mean()
print(
    "Linear Regression has a lower average MSE, but check variability."
    if lr_wins
    else "Random Forest Regressor has a lower average MSE and might generalize better."
)


Cross-Validation Comparison:
                  Model  Mean MSE  Std Dev
      Linear Regression  0.558290 0.065602
Random Forest Regressor  0.425455 0.062343

Discussion:
Random Forest Regressor has a lower average MSE and might generalize better.
