In [1]:
import numpy as np


np.random.seed(42)

In [2]:
from typing import List
from typing import Tuple

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
dataset = fetch_california_housing()

In [4]:
print("Num features: ", dataset.feature_names)

Num features:  ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [5]:
x = dataset["data"]
y = dataset["target"]

In [6]:
print(x.shape)
print(y.shape)

(20640, 8)
(20640,)


### Exercise 1

Test the model performance for a subset of the given dataset where you only use one feature.


In [7]:
scores: List[Tuple[int, float]] = []

for feature_idx in range(len(dataset.feature_names)):
    x_sliced = x[:, [feature_idx]]
    x_train, x_test, y_train, y_test = train_test_split(
        x_sliced, y, test_size=0.3
    )

    regr = LinearRegression()
    regr.fit(x_train, y_train)
    r2 = regr.score(x_test, y_test)
    scores.append((feature_idx, r2))

    print(f"Featue: {feature_idx}\tR2: {r2:.6f}")

Featue: 0	R2: 0.472932
Featue: 1	R2: 0.004318
Featue: 2	R2: 0.011952
Featue: 3	R2: -0.000451
Featue: 4	R2: 0.001144
Featue: 5	R2: 0.000134
Featue: 6	R2: 0.021466
Featue: 7	R2: 0.002054


### Exercise 2

Print the standard deviation, and variance of each feature.
Do these values correlate to the performance differences from exercise 1?


In [8]:
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

In [9]:
for feature_idx, score in sorted_scores:
    x_sliced = x[:, [feature_idx]]
    std = np.std(x_sliced)
    var = np.var(x_sliced)

    print(
        f"Feature: {feature_idx}\tScore: {score:.4f}\t"
        f"Std: {std:.4f}\tVar: {var:.4f}"
    )

Feature: 0	Score: 0.4729	Std: 1.8998	Var: 3.6091
Feature: 6	Score: 0.0215	Std: 2.1359	Var: 4.5621
Feature: 2	Score: 0.0120	Std: 2.4741	Var: 6.1212
Feature: 1	Score: 0.0043	Std: 12.5853	Var: 158.3886
Feature: 7	Score: 0.0021	Std: 2.0035	Var: 4.0139
Feature: 4	Score: 0.0011	Std: 1132.4347	Var: 1282408.3220
Feature: 5	Score: 0.0001	Std: 10.3858	Var: 107.8648
Feature: 3	Score: -0.0005	Std: 0.4739	Var: 0.2246
