In [None]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pickle

In [None]:
# Load the California housing dataset
# Dataset walkthrough: https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html
# (The course itself used the Ames housing data: https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_ames_housing.html)
california = fetch_california_housing()

# Feature matrix and regression target, as exposed by the returned dataset object
X, y = california.data, california.target

# The data is based on the 1990 US Census, with one row per census block group
# (the smallest geographical unit for which the US Census Bureau publishes sample data).
# The full description shipped with the dataset is printed below.
print(california.DESCR)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Configure a 100-tree random forest with a fixed seed, then fit it on the training split
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Persist the fitted model to disk so it can be reloaded later without retraining.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle files you trust.
with open('regression_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Score the model on the held-out test split using mean squared error
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f"Mean Squared Error: {mse}")

In [None]:
# Build one hand-written sample to try the model on.
# The values are paired with california.feature_names below, so they are listed in that order.
labels = california.feature_names
# A single row with one column per feature, i.e. shape (1, 8), as model.predict expects 2-D input
new_instance = np.array([[3.84, 52, 6.28, 1.08, 565.0, 2.18, 37.85, -122.25]])
# Display each value next to its feature name
{label: value for label, value in zip(labels, new_instance[0])}

In [None]:
# Predict the target for the new instance and print it
pred = model.predict(new_instance)
# Scale the prediction by 100,000 before printing
# (presumably because the target is expressed in units of $100,000 -- see california.DESCR)
predicted_value = pred[0] * 100000
print(f"Prediction for new instance: {predicted_value:,.2f}")