In [1]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pickle

In [2]:
# Load the California housing dataset. It is based on the 1990 US Census, with one row
# per census block group (the smallest geographical unit for which the US Census Bureau
# publishes sample data).
# Reference: https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html
# The course itself used the Ames housing dataset:
# https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_ames_housing.html
california = fetch_california_housing()
X, y = california.data, california.target

# Show the description that ships with the dataset
print(california.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Build a 100-tree random forest with a fixed seed, then fit it on the training split
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [5]:
# Persist the fitted model to disk as a pickle file.
# NOTE: pickle files can execute arbitrary code when loaded, so only load this file
# from a trusted location.
with open('regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [6]:
# Score the model on the held-out test split using mean squared error
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.2553684927247781


In [7]:
# Build one hand-written sample as a single-row 2-D array, with the values listed in
# the same order as california.feature_names
labels = california.feature_names
new_instance = np.array([[3.84, 52, 6.28, 1.08, 565.0, 2.18, 37.85, -122.25]])
# Display the sample with each value paired to its feature name
{label: value for label, value in zip(labels, new_instance[0])}

{'MedInc': 3.84,
 'HouseAge': 52.0,
 'AveRooms': 6.28,
 'AveBedrms': 1.08,
 'Population': 565.0,
 'AveOccup': 2.18,
 'Latitude': 37.85,
 'Longitude': -122.25}

In [8]:
# Predict the target for the new instance. The dataset description states that the
# target is the median house value in units of $100,000, so the prediction is
# multiplied by 100000 to print it in dollars.
pred = model.predict(new_instance)
print(f"Prediction for new instance: {pred[0]*100000:,.2f}") 

Prediction for new instance: 309,276.00
