In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the Boston house-prices bundle and wrap its feature matrix in a
# DataFrame whose columns are labelled with the bundle's feature names.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell presumably only runs on an older scikit-learn — confirm
# the pinned version.
# NOTE(review): `df` is not referenced again in the visible cells.
dataset = load_boston()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [3]:
# Show the human-readable description text that ships with the dataset bundle.
print(dataset["DESCR"])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

#### Einfache Lineare Regression

#### Bedeutung
  
- $R^2$: Wie viel Streuung kann von dem Regressionsmodell erklärt werden
- coef: Steigung der Geraden
- intercept: y-Achsenabschnitt
  
#### Symbole

- $\bar{x}$: Mittelwert von $x$
- $\bar{y}$: Mittelwert von $y$
- $\hat{y}$: Vorhersage des Modells

#### Datensatz

- $m$: Anzahl an Samples
- $n$: Anzahl an Features
- $x$: Input-Daten (Features)
- $y$: Output-Daten (Targets)

#### Variablen

- $x \in \mathbb{R}^{m,n}$
- $y \in \mathbb{R}^{m}$
- coef, $\bar{x} \in \mathbb{R}^{n}$
- intercept, $\bar{y} \in \mathbb{R}$

#### Formeln

- coef = $\frac{\sum_{i=1}^{m}(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^{m}(x_i-\bar{x})^2}$
    
- intercept = $\bar{y} - coef \cdot \bar{x}$
    
- $R^2 = 1 - \frac{\sum_{i=1}^{m}(y_i-\hat{y}_i)^2}{\sum_{i=1}^{m}(y_i-\bar{y})^2}$
  
- $\hat{y} = coef^T \cdot x + intercept$

In [4]:
class SimpleLinearRegression:
    """Least-squares straight-line fit ``y = coef_ * x + intercept_`` for ONE feature.

    Both attributes are ``None`` until :meth:`fit` has been called:

    * ``coef_`` -- slope of the fitted line.
    * ``intercept_`` -- y-axis intercept of the fitted line.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    @staticmethod
    def _as_vector(values, name: str) -> np.ndarray:
        """Return ``values`` as a 1-D float array.

        A single-column ``(m, 1)`` input is flattened to ``(m,)``; any other
        non-1-D shape is rejected, because the closed-form slope used by this
        class is only valid for a single feature.

        Raises:
            ValueError: if ``values`` is neither 1-D nor a single column.
        """
        arr = np.asarray(values, dtype=float)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError(
                f"{name} must be 1-D (or a single column), got shape {arr.shape}"
            )
        return arr

    def _check_is_fitted(self) -> None:
        """Raise ``RuntimeError`` when the model parameters have not been set yet."""
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError(
                "SimpleLinearRegression is not fitted yet; call fit() first"
            )

    def _compute_coef(self, x: np.ndarray, y: np.ndarray, x_mean: float, y_mean: float):
        """Set ``coef_`` to ``sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean)**2)``.

        ``x`` and ``y`` are expected to be 1-D and of equal length.

        Raises:
            ValueError: if all ``x`` values are identical (zero denominator).
        """
        x_dev = np.asarray(x, dtype=float) - x_mean
        y_dev = np.asarray(y, dtype=float) - y_mean
        x_variation = np.sum(x_dev ** 2)
        if x_variation == 0.0:
            # A vertical cloud of points has no finite least-squares slope.
            raise ValueError("x has zero variance; the slope is undefined")
        self.coef_ = np.sum(x_dev * y_dev) / x_variation

    def _compute_intercept(self, x_mean: float, y_mean: float):
        """Set ``intercept_`` to ``y_mean - coef_ * x_mean``."""
        self.intercept_ = y_mean - self.coef_ * x_mean

    def fit(self, x: np.ndarray, y: np.ndarray):
        """Estimate slope and intercept from training data.

        Args:
            x: feature values, shape ``(m,)`` or ``(m, 1)``.
            y: target values, shape ``(m,)`` or ``(m, 1)``.

        Returns:
            The fitted estimator itself, so calls can be chained.

        Raises:
            ValueError: on empty input, mismatching lengths, more than one
                feature column, or constant ``x``.
        """
        x = self._as_vector(x, "x")
        y = self._as_vector(y, "y")
        if x.size == 0:
            raise ValueError("cannot fit on empty data")
        if x.shape != y.shape:
            raise ValueError(
                f"x and y must have the same length, got {x.shape[0]} and {y.shape[0]}"
            )
        x_mean = np.mean(x, axis=0)
        y_mean = np.mean(y, axis=0)
        self._compute_coef(x, y, x_mean, y_mean)
        self._compute_intercept(x_mean, y_mean)
        return self

    def predict(self, x: np.ndarray):
        """Evaluate the fitted line at every value in ``x``.

        Args:
            x: feature values, shape ``(m,)`` or ``(m, 1)``.

        Returns:
            1-D array of ``m`` predictions.

        Raises:
            RuntimeError: if called before :meth:`fit`.
            ValueError: if ``x`` has more than one feature column.
        """
        self._check_is_fitted()
        x = self._as_vector(x, "x")
        return self.coef_ * x + self.intercept_

    def score(self, x: np.ndarray, y: np.ndarray):
        """Return the coefficient of determination R^2 of the predictions on ``x``.

        Computed as ``1 - sum((y - y_pred)**2) / sum((y - mean(y))**2)``.
        The value is not defined when ``y`` is constant, because the
        denominator is then zero.

        Raises:
            RuntimeError: if called before :meth:`fit`.
            ValueError: if ``x`` and ``y`` differ in length or are not
                single-feature vectors.
        """
        y_pred = self.predict(x)
        y = self._as_vector(y, "y")
        if y.shape != y_pred.shape:
            raise ValueError(
                f"x and y must have the same length, got {y_pred.shape[0]} and {y.shape[0]}"
            )
        residual_ss = np.sum((y - y_pred) ** 2)
        total_ss = np.sum((y - np.mean(y, axis=0)) ** 2)
        r2_score = 1.0 - residual_ss / total_ss
        return r2_score

In [5]:
# Seed NumPy's global random number generator before splitting.
# NOTE(review): no random_state is passed to train_test_split below, so the
# repeatability of the split presumably relies on this global seed — confirm.
np.random.seed(42)

# Use a single feature: column 5 is RM (average number of rooms per dwelling)
# in the attribute order listed by the dataset description printed earlier.
x = dataset.data[:, 5]
y = dataset.target

# Hold out 30 % of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [6]:
# Fit the hand-written regression on the training split, then evaluate R^2 on
# the held-out test split.
regr = SimpleLinearRegression()
regr.fit(x_train, y_train)
r2_score = regr.score(x_test, y_test)

# Report the fitted slope, intercept and the test-set R^2.
print(f"Coef: {regr.coef_}")
print(f"Intercept: {regr.intercept_}")
print(f"R2-Score: {r2_score}")

Coef: 9.118102197303783
Intercept: -34.66230743840676
R2-Score: 0.4584649934303069
