In [1]:
import pandas as pd
from sklearn.datasets import load_boston

# Load the Boston housing dataset object via scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions -- confirm the
# pinned version before a Restart & Run All.
dataset = load_boston()
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names.
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [2]:
# Print the description text stored under the dataset's 'DESCR' key.
print(dataset['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

#### Einfache Lineare Regression

#### Formeln:

  - $m = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^n(x_i-\bar{x})^2}$ 
    
  - $b = \bar{y} - m \cdot \bar{x}$
    
  - $R^2 = 1 - \frac{\sum_{i=1}^n(y_i-\hat{y}_i)^2}{\sum_{i=1}^n(y_i-\bar{y})^2}$
  
  - $y = m \cdot x + b$
  
#### Bedeutung:
  
  - $R^2 :=$ Anteil der Streuung von $y$, der durch das Regressionsmodell erklärt wird
  - $m :=$ Steigung der Geraden
  - $b :=$ y-Achsenabschnitt
  
#### Symbole:
  - $\bar{x} :=$ Mittelwert von $x$
  - $\hat{y} :=$ Vom Modell geschätzter $y$-Wert zum Wert $x$

In [3]:
# Einfache Lineare Regression Programmieren

# Compute Slope (Param m)
def compute_slope(x, y, x_mean, y_mean):
    """Return the least-squares slope m of the line y = m * x + b.

    Computes sum((x_i - x_mean) * (y_i - y_mean)) / sum((x_i - x_mean) ** 2).

    Values are paired by position through iteration instead of being looked
    up with ``x[i]``. Integer indexing on a pandas Series is label-based, so
    the indexed form fails for a Series whose index does not start at 0.

    Args:
        x: Sized iterable of predictor values.
        y: Sized iterable of response values, same length as ``x``.
        x_mean: Mean of ``x`` (supplied by the caller, not recomputed here).
        y_mean: Mean of ``y`` (supplied by the caller, not recomputed here).

    Returns:
        The slope, i.e. the covariance sum divided by the variance sum.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
        ZeroDivisionError: If the variance sum is zero (``x`` is empty or all
            values equal ``x_mean``) and plain Python numbers are used; NumPy
            scalars may yield nan/inf with a warning instead.
    """
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (len(x), len(y))
        )
    # Numerator: co-deviation of x and y around their means.
    numerator = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    # Denominator: squared deviation of x around its mean.
    denominator = sum((xi - x_mean) ** 2 for xi in x)
    return numerator / denominator

In [4]:
# Compute intercept (Param b)
def compute_intercept(x_mean, y_mean, slope):
    """Return the y-intercept b = y_mean - slope * x_mean of the regression line.

    Args:
        x_mean: Mean of the predictor values.
        y_mean: Mean of the response values.
        slope: Slope m of the regression line.

    Returns:
        The intercept b.
    """
    return y_mean - slope * x_mean

In [5]:
# Compute Regression Line
def compute_regression(x, slope, intercept):
    """Evaluate the line slope * x + intercept at every value in ``x``.

    Values are obtained by iterating over ``x`` instead of indexing with
    ``x[i]``. Integer indexing on a pandas Series is label-based, so the
    indexed form fails for a Series whose index does not start at 0.

    Args:
        x: Iterable of predictor values.
        slope: Slope m of the regression line.
        intercept: y-intercept b of the regression line.

    Returns:
        A list with one predicted value per element of ``x``, in iteration
        order (empty if ``x`` is empty).
    """
    return [slope * xi + intercept for xi in x]

In [6]:
# R-Squared
def compute_r2(y, y_mean, regression_line):
    """Return the coefficient of determination R^2 of a fitted line.

    Computes 1 - sum((y_i - y_hat_i) ** 2) / sum((y_i - y_mean) ** 2), where
    y_hat_i is the i-th entry of ``regression_line``.

    Observed and predicted values are paired by position through iteration
    instead of ``y[i]`` lookups. Integer indexing on a pandas Series is
    label-based, so the indexed form fails for a Series whose index does not
    start at 0.

    Args:
        y: Sized iterable of observed response values.
        y_mean: Mean of ``y`` (supplied by the caller, not recomputed here).
        regression_line: Sized iterable of predicted values, same length and
            order as ``y``.

    Returns:
        R^2 as 1 minus the ratio of residual to total sum of squares.

    Raises:
        ValueError: If ``y`` and ``regression_line`` differ in length.
        ZeroDivisionError: If the total sum of squares is zero (``y`` is empty
            or constant) and plain Python numbers are used; NumPy scalars may
            yield nan/inf with a warning instead.
    """
    if len(y) != len(regression_line):
        raise ValueError(
            "y and regression_line must have the same length, got %d and %d"
            % (len(y), len(regression_line))
        )
    # Residual sum of squares: deviation of observations from the fitted line.
    ss_residual = sum((yi - y_hat) ** 2 for yi, y_hat in zip(y, regression_line))
    # Total sum of squares: deviation of observations from their mean.
    ss_total = sum((yi - y_mean) ** 2 for yi in y)
    return 1 - ss_residual / ss_total