In [2]:
import pandas as pd
import numpy as np

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
cols = [ "CRIM" , "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "TGT"]
boston = pd.read_csv(url, sep = ' ', skipinitialspace = True, header = None, names = cols, index_col = False )
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TGT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
for col in cols:
    stddev = boston[col].std()
    #center data -> mean = 0
    boston[col] -= boston[col].mean()
    #norm to variance = 1
    boston[col] /= stddev
    
    print(col + " Varianz: " + str(boston[col].var()))
    print(col + " Mittelwert: " + str(boston[col].mean()) + "\n")
  
designmatr = boston.to_numpy()
designmatr = designmatr.T
n,m = designmatr.shape
(n,m)

CRIM Varianz: 1.0000000000000007
CRIM Mittelwert: 8.326672684688674e-17

ZN Varianz: 1.0000000000000164
ZN Mittelwert: 3.4667043061417934e-16

INDUS Varianz: 0.9999999999999974
INDUS Mittelwert: -3.0169652459193794e-15

CHAS Varianz: 0.9999999999999893
CHAS Mittelwert: 3.999874651959803e-16

NOX Varianz: 0.9999999999999966
NOX Mittelwert: 3.5635745562153124e-15

RM Varianz: 0.999999999999999
RM Mittelwert: -1.1498816830640888e-14

AGE Varianz: 0.9999999999999996
AGE Mittelwert: -1.1582741792482611e-15

DIS Varianz: 1.0000000000000007
DIS Mittelwert: 7.308602559340704e-16

RAD Varianz: 0.9999999999999948
RAD Mittelwert: -1.0685348082854768e-15

TAX Varianz: 0.9999999999999997
TAX Mittelwert: 6.534079382082443e-16

PTRATIO Varianz: 0.9999999999999999
PTRATIO Mittelwert: -1.0844202128275441e-14

B Varianz: 1.000000000000002
B Mittelwert: 8.117353956330493e-15

LSTAT Varianz: 0.9999999999999996
LSTAT Mittelwert: -6.49458528239222e-16

TGT Varianz: 0.9999999999999989
TGT Mittelwert: -2.8295

(14, 506)

In [9]:
# Columns of V represent the PCs
U,D,V = np.linalg.svd(designmatr)
sigma = np.zeros((n, m))
#write singular values on diagonal of M x N - Matrix
for i in range(D.shape[0]):
    sigma[i,i] = D[i]
    
# projections onto the PCs are the columns of this matrix
UD = U.dot(sigma)
UD.shape

(14, 506)

### Warum müssen für die Ermittlung der Eigenwerte der Kovarianzmatrix die Diagonalelemente quadriert und durch n-1 geteilt werden?
Durch die Singulärwertzerlegung der Kovarianzmatrix erhält man folgenden Zusammenhang: 

**Eigenwerte = Singulärwerte² / n-1**

Durch die Singulärwertzerlegung der Designmatrix sind deren Singulärwerte bekannt und es können daraus die Eigenwerte der Kovarianzmatrix berechnet werden

In [5]:
eigenvalues = np.square(D) / (n-1)
eigenvalues

array([254.28651825,  64.07797018,  52.39980705,  34.43866415,
        33.05417053,  25.63888004,  20.79865033,  15.65797493,
        10.77062383,   9.7992313 ,   8.26608277,   7.10836055,
         5.20576129,   2.34345865])

In [24]:
variances = []
for i in range(m):
    variances.append(UD[:,i].var())
    
summe = np.sum(variances)
explained_var = np.trim_zeros((variances/summe)*100)
cumulated_var = np.cumsum(explained_var)

dataset = pd.DataFrame({'Eigenvalues': eigenvalues, 'Explained Variance': explained_var, 'Cumulative Variance': cumulated_var})
dataset

Unnamed: 0,Eigenvalues,Explained Variance,Cumulative Variance
0,254.286518,46.504227,46.504227
1,64.07797,12.129906,58.634133
2,52.399807,9.337612,67.971745
3,34.438664,6.831658,74.803403
4,33.054171,5.64012,80.443523
5,25.63888,4.304302,84.747825
6,20.79865,4.174911,88.922737
7,15.657975,2.589613,91.512349
8,10.770624,2.135454,93.647803
9,9.799231,1.968121,95.615925


### Wieviele Dimensionen können weggelassen, wenn x % Fehler zulässig sind?
x=10: 6  
x=5:  4  
x=1:  2  