In [52]:
import random

import numpy as np

# Seed both random number generators (Python's and NumPy's global one) with the
# same fixed value so that a fresh "Restart & Run All" yields the same numbers.
np.random.seed(42)
random.seed(42)

In [53]:
from typing import List
from typing import Tuple

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [54]:
# Load the housing dataset and wrap its feature matrix in a DataFrame whose
# columns are labelled with the dataset's feature names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the environment pins a version that still ships it.
dataset = load_boston()
df = pd.DataFrame(data=dataset["data"], columns=dataset["feature_names"])

In [55]:
# Collect the feature (column) names as a plain Python list and report how
# many features the dataset has.
all_features = df.columns.tolist()
num_features_total = df.shape[1]
print(all_features)
print("Num features: ", num_features_total)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Num features:  13


In [56]:
# Full feature matrix and regression target taken from the loaded dataset.
x, y = dataset.data, dataset.target

In [57]:
# Show the shapes of the feature matrix and the target, one per line.
print(x.shape, y.shape, sep="\n")

(506, 13)
(506,)


# Exercise 1


<font size="5">
Test the model performance on subsets of the given dataset that each use only a single feature (i.e., not all 13 features at once).
</font> 

In [58]:
def add_intercept(x: np.ndarray) -> np.ndarray:
    """Return a copy of ``x`` with a leading column of ones.

    Args:
        x: Array whose first axis indexes the samples. A 1-D array is
            treated as a single feature column.

    Returns:
        A 2-D array whose first column is all ones and whose remaining
        columns are the columns of ``x``.
    """
    num_samples = x.shape[0]
    ones_column = np.ones(num_samples)
    return np.column_stack((ones_column, x))

In [59]:
# Fit one linear model per feature (using only that feature) and report the
# R2 score each model reaches on its held-out 30% test split.
for i, feature_name in enumerate(all_features):
    # Keep the single-feature design matrix in a loop-local name so the full
    # feature matrix `x` defined earlier in the notebook is not overwritten.
    x_single = add_intercept(dataset.data[:, i])
    # No random_state is passed, so each feature is evaluated on its own
    # random split; reproducibility relies on the global NumPy seed set at
    # the top of the notebook.
    x_train, x_test, y_train, y_test = train_test_split(x_single, y, test_size=0.3)

    regr = LinearRegression()
    regr.fit(x_train, y_train)

    r2_score = regr.score(x_test, y_test)
    print(f"Feature: {feature_name}")
    print(f"R2 score: {r2_score}")
    print()

Feature: CRIM
R2 score: 0.13814008417132628

Feature: ZN
R2 score: 0.15638621004081643

Feature: INDUS
R2 score: 0.11502275985264965

Feature: CHAS
R2 score: 0.015996439988834488

Feature: NOX
R2 score: 0.21437745037434752

Feature: RM
R2 score: 0.47938829361849133

Feature: AGE
R2 score: 0.2475526536799063

Feature: DIS
R2 score: -0.03743139263865247

Feature: RAD
R2 score: 0.08176044033536023

Feature: TAX
R2 score: 0.2948573932208305

Feature: PTRATIO
R2 score: 0.24113282841490113

Feature: B
R2 score: 0.11264117043013844

Feature: LSTAT
R2 score: 0.5380052327556764



# Exercise 2

<font size="5">
Plot the standard deviation and the variance of each feature.
Do these values correlate with the performance differences from Exercise 1?
</font> 

In [60]:
# Fit one single-feature linear model per column and print the fitted slope
# and intercept, in the same order as `all_features`.
# NOTE(review): the exercise text asks for the standard deviation and variance
# of each feature; this cell reports fitted parameters instead -- confirm
# whether that is intended.
for i in range(len(all_features)):
    # Keep the single-feature design matrix in a loop-local name so the full
    # feature matrix `x` defined earlier in the notebook is not overwritten.
    x_single = add_intercept(dataset.data[:, i])
    # No random_state is passed; the split comes from the global NumPy RNG.
    x_train, x_test, y_train, y_test = train_test_split(x_single, y, test_size=0.3)

    regr = LinearRegression()
    regr.fit(x_train, y_train)

    # Column 0 of the design matrix is the constant column added by
    # add_intercept, so the feature's own slope is coef_[1].
    print(f"Coef: {regr.coef_[1]}")
    print(f"Intercept: {regr.intercept_}")
    print()

Coef: -0.3829134100704193
Intercept: 23.850708741162506

Coef: 0.15897945544928363
Intercept: 20.929409993246775

Coef: -0.6600319111679346
Intercept: 29.95151314531303

Coef: 7.106439393939387
Intercept: 21.656060606060606

Coef: -34.380291267585605
Intercept: 41.16992939969644

Coef: 9.190658165933144
Intercept: -35.40471519554286

Coef: -0.1350984328349001
Intercept: 31.6611239458421

Coef: 1.3153693552180674
Intercept: 17.254203182273685

Coef: -0.42256126362607244
Intercept: 26.27725922784088

Coef: -0.024544684882731694
Intercept: 32.50606596120581

Coef: -2.2886684835028
Intercept: 64.98603946849931

Coef: 0.03574259668579368
Intercept: 9.991161514464572

Coef: -0.9379702842960074
Intercept: 34.30945844294357

