In [35]:
import random
random.seed(42)
import numpy as np
np.random.seed(42)

In [36]:
import itertools
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [37]:
dataset = load_boston()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [38]:
# Welche Features verwenden
all_features = df.columns.values.tolist()
num_features_total = len(all_features)
print(all_features)
print("Num features: ", num_features_total)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Num features:  13


In [39]:
total_feature_combs = 0

for num in range(1, num_features_total):
    current_feature_combs = len([v for v in itertools.combinations(all_features, num)])
    total_feature_combs += current_feature_combs
    print("Combs with ", num, " combs: ", current_feature_combs)
    
print("Total: ", total_feature_combs)

Combs with  1  combs:  13
Combs with  2  combs:  78
Combs with  3  combs:  286
Combs with  4  combs:  715
Combs with  5  combs:  1287
Combs with  6  combs:  1716
Combs with  7  combs:  1716
Combs with  8  combs:  1287
Combs with  9  combs:  715
Combs with  10  combs:  286
Combs with  11  combs:  78
Combs with  12  combs:  13
Total:  8190


In [40]:
best_score = 0.0

for num in range(1, num_features_total + 1):
    for features in itertools.combinations(all_features, num):
        df_features = pd.DataFrame(df, columns=features)

        x = df_features.to_numpy()
        y = dataset['target']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

        regr = LinearRegression()
        regr.fit(x_train, y_train)
        r2_score = regr.score(x_test, y_test)

        if r2_score > best_score:
            best_score = r2_score
            print(f"R2: {r2_score}")
            print(f"Features:\n{features}")

R2: 0.13814008417132628
Features:
('CRIM',)
R2: 0.15638621004081632
Features:
('ZN',)
R2: 0.21437745037434774
Features:
('NOX',)
R2: 0.4793882936184912
Features:
('RM',)
R2: 0.5380052327556761
Features:
('LSTAT',)
R2: 0.5582507486695105
Features:
('CRIM', 'LSTAT')
R2: 0.5834097092123274
Features:
('INDUS', 'RM')
R2: 0.589243676493741
Features:
('NOX', 'RM')
R2: 0.6555129928066802
Features:
('RM', 'RAD')
R2: 0.6898293831298401
Features:
('ZN', 'RM', 'B')
R2: 0.7209685926887803
Features:
('NOX', 'RM', 'LSTAT')
R2: 0.7391173951136699
Features:
('ZN', 'RM', 'PTRATIO', 'LSTAT')
R2: 0.7424359641956617
Features:
('INDUS', 'RM', 'PTRATIO', 'LSTAT')
R2: 0.7747205244084439
Features:
('CRIM', 'ZN', 'RM', 'PTRATIO', 'LSTAT')
R2: 0.7764740425306117
Features:
('ZN', 'INDUS', 'RM', 'PTRATIO', 'B', 'LSTAT')
R2: 0.7822209304168458
Features:
('INDUS', 'RM', 'DIS', 'PTRATIO', 'B', 'LSTAT')
R2: 0.7939789972823279
Features:
('ZN', 'INDUS', 'RM', 'DIS', 'TAX', 'PTRATIO', 'LSTAT')
R2: 0.8114669483526987
Feat

In [41]:
# Best Settings
features = ['CRIM', 'ZN', 'INDUS', 'RM', 'AGE', 'PTRATIO', 'B', 'LSTAT']
df_features = pd.DataFrame(df, columns=features)

x = df_features.to_numpy()
y = dataset['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

regr = LinearRegression()
regr.fit(x_train, y_train)
r2_score = regr.score(x_test, y_test)

print(f"R2-Score: {r2_score}")

R2-Score: 0.7467355524490357
