## Importe

### Importieren benötigter Module und Plot-Einstellungen setzen.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings; warnings.simplefilter('ignore')
sns.set()

## Datensatz laden

### Datensatz "Boston house prices" laden und analysieren.

In [None]:
# NOTE(review): `load_boston` appears to be deprecated and later removed in
# newer scikit-learn releases — confirm the pinned scikit-learn version still
# provides it, otherwise this cell (and the import above) will fail.
boston = load_boston()

In [None]:
type(boston)

In [None]:
boston.keys()

In [None]:
print(boston.DESCR)

## Datensatz splitten

### Datensatz aufteilen in Trainingsdaten und Testdaten.

In [None]:
# Wrap the raw arrays in labelled pandas objects: the features become a
# DataFrame using the dataset's own feature names as columns, the target
# becomes a Series named 'MEDV'.
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.Series(boston.target, name='MEDV')

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Hold out 20 % of the rows as test data. random_state is pinned to 0 so the
# shuffle (and therefore every number further down) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Visualisieren

### Zufällig ausgewählte Features plotten.

In [None]:
def compute_line(data, model):
    """Evaluate a fitted one-feature linear model on an integer grid from 0.

    Parameters
    ----------
    data : array-like exposing ``max()``
        Feature values. Only the maximum is used; it sets the upper end of
        the grid.
    model : object exposing ``intercept_`` and ``coef_``
        Fitted linear model whose ``coef_`` holds a single coefficient.

    Returns
    -------
    numpy.ndarray
        ``intercept_ + coef_ * x`` for ``x = 0, 1, 2, ...`` covering every
        integer below ``data.max() + 1``. Element ``k`` is the model value at
        ``x == k``.
    """
    x_max = data.max()
    # The grid intentionally starts at 0 with unit steps: the result is meant
    # to be plotted against its own index, so position k must equal x value k.
    grid = np.arange(0, x_max + 1)
    return model.intercept_ + (model.coef_ * grid)

In [None]:
def plot_reg(X_train, X_test, y_train, y_test, predictions, reg_line):
    """Plot training points, test-set predictions and the regression line.

    Axis limits come from the combined range of the train and test data:
    the x axis is padded on both sides by 10 % of the largest x value, the
    y axis by a fixed margin of 5.

    Parameters
    ----------
    X_train, X_test : array-like exposing ``min()`` / ``max()``
        Feature values of the training and test samples.
    y_train, y_test : array-like exposing ``min()`` / ``max()``
        Target values of the training and test samples.
    predictions : array-like
        Model output for ``X_test``; drawn as black points.
    reg_line : array-like
        Regression line values; drawn in red against their element index.
    """
    x_lo, x_hi = min(X_train.min(), X_test.min()), max(X_train.max(), X_test.max())
    y_lo, y_hi = min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())

    x_pad = 0.1 * x_hi
    plt.xlim(x_lo - x_pad, x_hi + x_pad)
    plt.ylim(y_lo - 5, y_hi + 5)

    # Line goes to the back (zorder=0), predictions to the front (zorder=10).
    plt.plot(reg_line, linewidth=5, color='r', zorder=0)
    plt.scatter(X_train, y_train)
    plt.scatter(X_test, predictions, color='k', zorder=10)
    plt.show()

In [None]:
fig, axs = plt.subplots(ncols=3, figsize=(16, 4))

# One scatter panel per feature, each plotted against the target.
# x and y are passed by keyword: newer seaborn releases reject data vectors
# given positionally, while the keyword form works on old and new versions.
for i, var in enumerate(['CRIM', 'RM', 'ZN']):
    lm = sns.scatterplot(x=X[var], y=y, ax=axs[i])
    # Anchor the y axis at 0; the upper limit stays automatic.
    lm.set(ylim=(0, None))

## Lineare Regression mit einem Feature

### Feature setzen

In [None]:
feature = 'ZN'

### Trainings- und Testdaten auf das Feature reduzieren

In [None]:
# Copy the selected column out of each split as a NumPy array and give it an
# explicit second axis, i.e. shape (n_samples, 1).
X_train_one_feature = np.reshape(np.array(X_train[feature].copy()), (-1, 1))
X_test_one_feature = np.reshape(np.array(X_test[feature].copy()), (-1, 1))

### Modell erzeugen

In [None]:
model = LinearRegression()

### Modell trainieren

In [None]:
model.fit(X_train_one_feature, y_train)

### Preise für Testdaten vorhersagen

In [None]:
predictions = model.predict(X_test_one_feature)

In [None]:
predictions[:5]

In [None]:
reg_line = compute_line(X_train[feature], model)

In [None]:
reg_line

In [None]:
model.coef_

### Regressionslinie und Vorhersagen plotten

In [None]:
plot_reg(X_train[feature], X_test[feature], y_train, y_test, predictions, reg_line)

In [None]:
# Seaborn's own regression fit over the full data set, with the model's
# test-set predictions overlaid in black for comparison.
# x and y are passed by keyword: newer seaborn releases reject data vectors
# given positionally, while the keyword form works on old and new versions.
sns.regplot(x=X[feature], y=y)
plt.scatter(X_test[feature], predictions, color='k')

## Fehler berechnen

In [None]:
# Root-mean-squared error of the single-feature model on the test set,
# rounded to two decimals.
mse = mean_squared_error(y_test, predictions)
rmse = np.round(np.sqrt(mse), 2)
rmse

## Lineare Regression mit mehr als einem Feature

In [None]:
features = ['ZN', 'PTRATIO']

In [None]:
X_train_multiple_features = X_train[features].copy()
X_test_multiple_features = X_test[features].copy()

In [None]:
# Fit a fresh model on the selected feature columns and predict the test set.
# Note: this rebinds `model` and `predictions`, replacing the single-feature
# objects created earlier in the notebook — re-running earlier plotting cells
# after this one would use the multi-feature results.
model = LinearRegression()
model.fit(X_train_multiple_features, y_train)
predictions = model.predict(X_test_multiple_features)

In [None]:
predictions[:5]

In [None]:
# Root-mean-squared error of the multi-feature model on the test set,
# rounded to two decimals.
mse = mean_squared_error(y_test, predictions)
rmse = np.round(np.sqrt(mse), 2)
rmse

---

## Aufgaben

### Wie groß ist der Fehler, wenn nur das Feature **RM** berücksichtigt wird?

### Wie groß ist der Fehler, wenn nur die Features **RM** und **LSTAT** berücksichtigt werden?