In [25]:
import numpy as np
np.random.seed(42)

In [26]:
from typing import List
from typing import Tuple

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [27]:
dataset = load_boston()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [28]:
all_features = df.columns.values.tolist()
num_features_total = len(all_features)
print(all_features)
print("Num features: ", num_features_total)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Num features:  13


In [29]:
x = dataset['data']
y = dataset['target']

In [30]:
print(x.shape)
print(y.shape)

(506, 13)
(506,)


# Exercise 1


<font size="5">
Test the model performance for a subset of the given dataset where you only use one feature (hence not all 13 features ;)
</font> 

In [31]:
scores: List[Tuple[int, float]] = []

for feature_idx in range(num_features_total):
        x_sliced = x[:, [feature_idx]]
        x_train, x_test, y_train, y_test = train_test_split(x_sliced, y, test_size=0.3)

        regr = LinearRegression()
        regr.fit(x_train, y_train)
        r2_score = np.clip(regr.score(x_test, y_test), 0.0, 1.0)
        scores.append((feature_idx, r2_score))

        print(f"Features: {feature_idx:02}\tR2: {r2_score:04.6F}")

Features: 00	R2: 0.138140
Features: 01	R2: 0.156386
Features: 02	R2: 0.115023
Features: 03	R2: 0.015996
Features: 04	R2: 0.214377
Features: 05	R2: 0.479388
Features: 06	R2: 0.247553
Features: 07	R2: 0.000000
Features: 08	R2: 0.081760
Features: 09	R2: 0.294857
Features: 10	R2: 0.241133
Features: 11	R2: 0.112641
Features: 12	R2: 0.538005


# Exercise 2

<font size="5">
Plot the standard deviation, and variance of each feature.
Do these values correlate to the performance differences from exercise 1?
</font> 

In [32]:
sorted_scores = sorted(scores, key= lambda x: x[1], reverse=True)

for feature_idx, score in sorted_scores:
        x_sliced = x[:, [feature_idx]]
        std = np.std(x_sliced)
        var = np.var(x_sliced)

        print(
            f"Features: {feature_idx:02}\tScore: {score:04.6F}\tStd: {std:04.6F}\tVar: {var:04.6F}"
        )

Features: 12	Score: 0.538005	Std: 7.134002	Var: 50.893979
Features: 05	Score: 0.479388	Std: 0.701923	Var: 0.492695
Features: 09	Score: 0.294857	Std: 168.370495	Var: 28348.623600
Features: 06	Score: 0.247553	Std: 28.121033	Var: 790.792473
Features: 10	Score: 0.241133	Std: 2.162805	Var: 4.677726
Features: 04	Score: 0.214377	Std: 0.115763	Var: 0.013401
Features: 01	Score: 0.156386	Std: 23.299396	Var: 542.861840
Features: 00	Score: 0.138140	Std: 8.593041	Var: 73.840360
Features: 02	Score: 0.115023	Std: 6.853571	Var: 46.971430
Features: 11	Score: 0.112641	Std: 91.204607	Var: 8318.280421
Features: 08	Score: 0.081760	Std: 8.698651	Var: 75.666531
Features: 03	Score: 0.015996	Std: 0.253743	Var: 0.064385
Features: 07	Score: 0.000000	Std: 2.103628	Var: 4.425252
