In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the Boston housing data and wrap the feature matrix in a labelled table.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably needs an older release — confirm the environment.
dataset = load_boston()
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [2]:
# Show the dataset's bundled description text.
print(dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
# Seed NumPy's global RNG before splitting so the run is repeatable.
np.random.seed(42)

# Single-feature problem: column 5 is RM (average number of rooms per dwelling),
# the target is the median home value.
x, y = dataset.data[:, 5], dataset.target

# Hold out 30 % of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [4]:
class SimpleLinearRegression:
    """Least-squares straight-line fit ``y = coef_ * x + intercept_`` for one feature.

    ``x`` and ``y`` are expected to be 1-D sequences of equal length; a single
    ``(n, 1)`` column is accepted as well and flattened for fitting/scoring.
    """

    def __init__(self):
        self.coef_ = None       # slope of the fitted line
        self.intercept_ = None  # y-axis intercept of the fitted line

    @staticmethod
    def _as_1d(values, name: str) -> np.ndarray:
        """Return ``values`` as a 1-D array; accepts shape ``(n,)`` or ``(n, 1)``.

        Raises:
            ValueError: if ``values`` has any other shape.
        """
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError(
                f"{name} must be 1-D or a single (n, 1) column, got shape {arr.shape}"
            )
        return arr

    def _compute_coef(self, x: np.ndarray, y: np.ndarray, x_mean: np.ndarray, y_mean: np.ndarray):
        """Store the least-squares slope ``sum(dx * dy) / sum(dx ** 2)`` in ``coef_``.

        ``dx`` and ``dy`` are the deviations of ``x`` and ``y`` from the given means.

        Raises:
            ValueError: if all ``x`` values are identical (slope undefined).
        """
        x_dev = self._as_1d(x, "x") - x_mean
        y_dev = self._as_1d(y, "y") - y_mean
        numerator = np.sum(x_dev * y_dev)
        denominator = np.sum(x_dev ** 2)
        if denominator == 0:
            # Dividing would silently store NaN/inf as the slope.
            raise ValueError("x has zero variance; the slope is undefined")
        self.coef_ = numerator / denominator

    def _compute_intercept(self, x_mean: np.ndarray, y_mean: np.ndarray):
        """Store the intercept so that the line passes through ``(x_mean, y_mean)``."""
        self.intercept_ = y_mean - self.coef_ * x_mean

    def fit(self, x: np.ndarray, y: np.ndarray):
        """Estimate ``coef_`` and ``intercept_`` from training data.

        Args:
            x: feature values, shape ``(n,)`` or ``(n, 1)``.
            y: target values, shape ``(n,)`` or ``(n, 1)``.

        Raises:
            ValueError: on empty input, mismatched lengths, unsupported shapes,
                or constant ``x``.
        """
        x = self._as_1d(x, "x")
        y = self._as_1d(y, "y")
        if x.size == 0:
            raise ValueError("cannot fit on empty data")
        if x.shape != y.shape:
            raise ValueError(
                f"x and y must have the same length, got {x.shape[0]} and {y.shape[0]}"
            )
        x_mean = np.mean(x, axis=0)  # axis selects the dimension that is averaged
        y_mean = np.mean(y, axis=0)
        self._compute_coef(x, y, x_mean, y_mean)
        self._compute_intercept(x_mean, y_mean)

    def predict(self, x: np.ndarray):
        """Evaluate the fitted line at ``x``; the result has the same shape as ``x``.

        Raises:
            AttributeError: if called before ``fit`` (same exception type the
                unfitted model raised before, now with an explicit message).
        """
        if self.coef_ is None or self.intercept_ is None:
            raise AttributeError(
                "SimpleLinearRegression is not fitted yet; call fit() first"
            )
        return self.coef_ * np.asarray(x) + self.intercept_

    def score(self, x: np.ndarray, y: np.ndarray):
        """Return R^2 = 1 - SS_res / SS_tot of the predictions for ``x`` against ``y``.

        The value is at most 1 and can be negative when the line fits worse than
        the mean of ``y``. A constant ``y`` makes SS_tot zero, so the result is
        NaN or infinite in that case.

        Raises:
            ValueError: if predictions and ``y`` differ in length.
        """
        y_true = self._as_1d(y, "y")
        y_pred = self._as_1d(self.predict(x), "x")
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"x and y must have the same length, got {y_pred.shape[0]} and {y_true.shape[0]}"
            )
        ss_res = np.sum((y_true - y_pred) ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true, axis=0)) ** 2)

        r2_score = 1.0 - ss_res / ss_tot
        return r2_score

In [5]:
# Fit on the training split, then evaluate R^2 on the held-out split.
regr = SimpleLinearRegression()
regr.fit(x_train, y_train)
r2_score = regr.score(x_test, y_test)

# R^2 is at most 1 (it can drop below 0 for a poor fit); the closer to 1, the better.
print(
    f"Coef: {regr.coef_}",
    f"Intercept: {regr.intercept_}",
    f"R2Score: {r2_score}",
    sep="\n",
)

Coef: 9.118102197303783
Intercept: -34.66230743840676
R2Score: 0.4584649934303069


In [6]:
# Aside: what the `axis` argument of np.mean does
a = np.array([[1, 2, 3],
              [4, 5, 6]])
print(a)
# Printed in order: mean over the whole array, column means (axis=0), row means (axis=1).
print(
    np.mean(a),
    np.mean(a, axis=0),
    np.mean(a, axis=1),
    sep="\n",
)

[[1 2 3]
 [4 5 6]]
3.5
[2.5 3.5 4.5]
[2. 5.]
