In [15]:
import numpy as np
np.random.seed(42)

In [16]:
from typing import List
from typing import Tuple

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [17]:
dataset = load_boston()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [18]:
all_features = df.columns.values.tolist()
num_features_total = len(all_features)
print(all_features)
print("Num features: ", num_features_total)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Num features:  13


In [19]:
x = dataset['data']
y = dataset['target']

In [20]:
print(x.shape)
print(y.shape)

(506, 13)
(506,)


# Exercise 1


<font size="5">
Test the model performance for a subset of the given dataset where you only use one feature (hence not all 13 features ;)
</font> 

In [21]:
scores: List[Tuple[int, float]] = []

for feature_idx in range(num_features_total):
    x_sliced = x[:, [feature_idx]]
    x_train, x_test, y_train, y_test = train_test_split(x_sliced, y, test_size=0.3)

    regr = LinearRegression()
    regr.fit(x_train, y_train)
    r2 = regr.score(x_test, y_test)
    scores.append((feature_idx, r2))

    print(f"Featue: {feature_idx}\tR2: {r2:.6f}")

Featue: 0	R2: 0.138140
Featue: 1	R2: 0.156386
Featue: 2	R2: 0.115023
Featue: 3	R2: 0.015996
Featue: 4	R2: 0.214377
Featue: 5	R2: 0.479388
Featue: 6	R2: 0.247553
Featue: 7	R2: -0.037431
Featue: 8	R2: 0.081760
Featue: 9	R2: 0.294857
Featue: 10	R2: 0.241133
Featue: 11	R2: 0.112641
Featue: 12	R2: 0.538005


# Exercise 2

Print the standard deviation, and variance of each feature.
Do these values correlate to the performance differences from exercise 1?

In [22]:
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

print(sorted_scores)

[(12, 0.5380052327556762), (5, 0.4793882936184912), (9, 0.2948573932208305), (6, 0.24755265367990642), (10, 0.24113282841490136), (4, 0.21437745037434752), (1, 0.15638621004081643), (0, 0.1381400841713265), (2, 0.11502275985265009), (11, 0.1126411704301381), (8, 0.08176044033536023), (3, 0.015996439988834488), (7, -0.03743139263865247)]


In [27]:
for feature_idx, score in sorted_scores:
    x_sliced = x[:, [feature_idx]]
    std = np.std(x_sliced)
    var = np.var(x_sliced)

    print(f"Feature: {feature_idx}\tScore: {score:.6f}\tStd: {std:.4f}\tVar: {var:.4f}")

Feature: 12	Score: 0.538005	Std: 7.1340	Var: 50.8940
Feature: 5	Score: 0.479388	Std: 0.7019	Var: 0.4927
Feature: 9	Score: 0.294857	Std: 168.3705	Var: 28348.6236
Feature: 6	Score: 0.247553	Std: 28.1210	Var: 790.7925
Feature: 10	Score: 0.241133	Std: 2.1628	Var: 4.6777
Feature: 4	Score: 0.214377	Std: 0.1158	Var: 0.0134
Feature: 1	Score: 0.156386	Std: 23.2994	Var: 542.8618
Feature: 0	Score: 0.138140	Std: 8.5930	Var: 73.8404
Feature: 2	Score: 0.115023	Std: 6.8536	Var: 46.9714
Feature: 11	Score: 0.112641	Std: 91.2046	Var: 8318.2804
Feature: 8	Score: 0.081760	Std: 8.6987	Var: 75.6665
Feature: 3	Score: 0.015996	Std: 0.2537	Var: 0.0644
Feature: 7	Score: -0.037431	Std: 2.1036	Var: 4.4253
