In [11]:
import pandas as pd

from sklearn.datasets import fetch_california_housing

# Load the housing dataset
housing = fetch_california_housing()

# Create a DataFrame with the data (X) and the target (y)
X = pd.DataFrame(housing.data, columns=housing.feature_names) 
y = pd.Series(housing.target, name='med_house_value')

# Display the first 5 rows of the data
display(X.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
# Print the feature names and check for missing values
display(X.columns.dropna())

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [4]:
# Generate summary statistics
X.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Creating a linear regression model

# Split the raw data (80% training, 20% testing)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model on unscaled data
lin_reg = LinearRegression()
lin_reg.fit(X_train_raw, y_train)

# Make predictions on the test set
y_pred = lin_reg.predict(X_test_raw)
y_pred

array([0.71912284, 1.76401657, 2.70965883, ..., 4.46877017, 1.18751119,
       2.00940251])

In [38]:
# Evaluate model performance
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score

mse_raw = mean_squared_error(y_test, y_pred)
rmse_raw = root_mean_squared_error(y_test, y_pred)
r2_raw = r2_score(y_test, y_pred)

print("Unscaled Data Model:")
print(f"Mean Squared Error: {mse_raw:.2f}")
print(f"Root Mean Squared Error: {rmse_raw:.2f}")
print(f"R² Score: {r2_raw:.2f}")

# View the model's coefficients
print("Model Coefficients (Unscaled):")
print(pd.Series(lin_reg.coef_,
                index=X.columns))
print(f"Model Intercept (Unscaled):")
print(pd.Series(lin_reg.intercept_))

# View the predicted data versus the actual data
y_pred_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred}).head()
y_pred_df

Unscaled Data Model:
Mean Squared Error: 0.56
Root Mean Squared Error: 0.75
R² Score: 0.58
Model Coefficients (Unscaled):
MedInc        0.448675
HouseAge      0.009724
AveRooms     -0.123323
AveBedrms     0.783145
Population   -0.000002
AveOccup     -0.003526
Latitude     -0.419792
Longitude    -0.433708
dtype: float64
Model Intercept (Unscaled):
0   -37.023278
dtype: float64


Unnamed: 0,Actual,Predicted
20046,0.477,0.719123
3024,0.458,1.764017
15663,5.00001,2.709659
20484,2.186,2.838926
9814,2.78,2.604657


The R2 score tells us that 58% of the variance in the target variable is explained by the model. In most cases, this is a rather strong correlation, suggesting that the created linear regression model is good and trustworthy. The variables with the largest positive impact on predictions are AveBedrms and MedInc. If the average number of bedrooms increases by one, then the target variable, median house value, increases by 0.783. The variables with the largest negative impact are Latitude and Longitude. The predicted values match up with the actual values well on some observations but not others.

In [28]:
# Select only three features from the model to create a new, simpler model
simplified_X = X[["HouseAge", "MedInc", "AveBedrms"]]

# Split the simplified data (80% training, 20% testing)
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(simplified_X, y, test_size = 0.2, random_state = 42)

# Train a linear regression model on the simplified data
lin_reg_simple = LinearRegression()
lin_reg_simple.fit(X_train_simple, y_train_simple)

# Make predictions on the test set
y_pred_simple = lin_reg_simple.predict(X_test_simple)
y_pred_simple

array([1.04950913, 1.51492009, 2.31479717, ..., 4.32450877, 1.72116012,
       1.72145295])

In [39]:
# Evaluate the simplified model's performance

mse_simple = mean_squared_error(y_test_simple, y_pred_simple)
rmse_simple = root_mean_squared_error(y_test_simple, y_pred_simple)
r2_simple = r2_score(y_test_simple, y_pred_simple)

print("Unscaled Data Model:")
print(f"Mean Squared Error: {mse_simple:.2f}")
print(f"Root Mean Squared Error: {rmse_simple:.2f}")
print(f"R² Score: {r2_simple:.2f}")

# View the model's coefficients
print("Model Coefficients (Unscaled):")
print(pd.Series(lin_reg_simple.coef_,
                index=simplified_X.columns))
print(f"Model Intercept (Unscaled):")
print(pd.Series(lin_reg_simple.intercept_))

# View the predicted data versus the actual data
y_pred_df = pd.DataFrame({"Actual": y_test_simple, "Predicted": y_pred_simple}).head()
y_pred_df

Unscaled Data Model:
Mean Squared Error: 0.66
Root Mean Squared Error: 0.81
R² Score: 0.49
Model Coefficients (Unscaled):
HouseAge     0.017643
MedInc       0.434319
AveBedrms    0.046612
dtype: float64
Model Intercept (Unscaled):
0   -0.169399
dtype: float64


Unnamed: 0,Actual,Predicted
20046,0.477,1.049509
3024,0.458,1.51492
15663,5.00001,2.314797
20484,2.186,2.670038
9814,2.78,2.096227


The simplified model varies from the full model in multiple ways. I chose the variables Median Income, Average Bedrooms, and House Age because they had the largest positive impact on the Median Housing Value. The simplified model had worse results for the mean squared error, root mean squared error, and the R2 score. Also, the coefficients of the variables chosen for the simplified model were different than in the full model, which is to be expected. The coefficient for MedInc decreased very slightly and the coefficient for HouseAge increased by a more sizeable amount. However, the coefficient for AveBedrms plumetted. It went from a coefficient of 0.783145 to one of 0.046612. Its old coefficient was much larger than that of MedInc and HouseAge, but its coefficient in the simplified model is significantly smaller than the other two. I expected all the coefficients of all three to change, but I find it strange that one changed so dramatically while the other two changes were much smaller. The model intercept also changed a lot from -37 to almost 0. In practice, I would not use the simplified model because the full model had better test results, suggesting that it does a better job of predicting the correct values.

In [41]:
# Scale the data

from sklearn.preprocessing import StandardScaler

Scaler = StandardScaler()

X_scaled = Scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

# Split the scaled data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size = 0.2, random_state = 42)

# Fit the scaled data
lin_reg_scaled = LinearRegression()
lin_reg_scaled.fit(X_train_scaled, y_train_scaled)

# Make predictions
y_pred_scaled = lin_reg_scaled.predict(X_test_scaled)
y_pred_scaled

# Evaluate model performance
mse_scaled = mean_squared_error(y_test_scaled, y_pred_scaled)
rmse_scaled = root_mean_squared_error(y_test_scaled, y_pred_scaled)
r2_scaled = r2_score(y_test_scaled, y_pred_scaled)

print("Unscaled Data Model:")
print(f"Mean Squared Error: {mse_scaled:.2f}")
print(f"Root Squared Error: {rmse_scaled:.2f}")
print(f"R² Score: {r2_scaled:.2f}")

# View coefficients
coef_series_scaled = pd.Series(lin_reg_scaled.coef_, index=X.columns)
intercept = lin_reg_scaled.intercept_

print("Coefficients (Unscaled):")
print(coef_series_scaled)
print(f"Intercept: {intercept}")

Unscaled Data Model:
Mean Squared Error: 0.56
Root Squared Error: 0.75
R² Score: 0.58
Coefficients (Unscaled):
MedInc        0.852382
HouseAge      0.122382
AveRooms     -0.305116
AveBedrms     0.371132
Population   -0.002298
AveOccup     -0.036624
Latitude     -0.896635
Longitude    -0.868927
dtype: float64
Intercept: 2.067862309508389


Latitude, Longitude, and Median Income have the largest impact on the median housing value.