## PCA : use SVD algorithm to calculate the principle components:


### Data Generation

In [1]:
import numpy as np
import random
from scipy.linalg import svd

In [2]:
x, y, z = [], [], []
for _ in range(100):
    x.append(random.randint(1, 10))
    y.append(random.randint(100, 500))
    z.append(random.randint(1000, 3000))

In [3]:
A = np.array([x, y, z])

In [4]:
A.shape

(3, 100)

### Step 1: regularize each feature

In [5]:
def normalize(A):
    mean = sum(A) / len(A)
    deviation = [(element - mean)**2 for element in A]
    std = np.sqrt((sum(deviation) / len(A)))
    normalized = [(element - mean)/std for element in A]
    return normalized

In [6]:
len(A)

3

In [7]:
normalized=[normalize(A[i]) for i in range(len(A))]

In [10]:
len(normalized)

3

### Step 2: get the covariance matrix

In [11]:
x=normalized

In [12]:
c=np.dot(np.transpose(x),x)/(len(A)-1)

In [13]:
len(A)

3

### Step 3: Singular Vector Decomposition

In [14]:
U, s, VT = svd(c)

In [15]:
s.shape

(100,)

In [16]:
U

array([[-0.17095808,  0.02758707, -0.02147219, ...,  0.08561065,
         0.09562424,  0.03999048],
       [ 0.01900714, -0.11093357, -0.06171569, ...,  0.16075557,
        -0.00884492, -0.03320724],
       [ 0.03010879,  0.03202759, -0.14408734, ..., -0.34467196,
         0.10340751, -0.01671631],
       ...,
       [ 0.12638042,  0.07235378, -0.09044809, ..., -0.04164119,
        -0.02232417, -0.15387309],
       [ 0.13052351,  0.0495525 ,  0.05239628, ...,  0.03755079,
         0.0965796 ,  0.09099108],
       [ 0.20820333, -0.13113306, -0.02395092, ...,  0.15214228,
        -0.11848385, -0.06318674]])

In [17]:
VT

array([[-1.70958076e-01,  1.90071414e-02,  3.01087881e-02, ...,
         1.26380418e-01,  1.30523514e-01,  2.08203331e-01],
       [ 2.75870680e-02, -1.10933572e-01,  3.20275865e-02, ...,
         7.23537838e-02,  4.95525038e-02, -1.31133055e-01],
       [-2.14721880e-02, -6.17156942e-02, -1.44087336e-01, ...,
        -9.04480940e-02,  5.23962843e-02, -2.39509186e-02],
       ...,
       [ 0.00000000e+00, -2.46394789e-02, -1.42458327e-02, ...,
         8.23892072e-02, -2.91534403e-02, -1.49877665e-01],
       [ 5.17061783e-01, -7.61204824e-02,  6.57030814e-03, ...,
         4.23768516e-02, -1.14236508e-02, -6.73730019e-02],
       [ 7.94477068e-01, -1.35392773e-01,  1.59035705e-02, ...,
        -4.81450669e-04,  4.27596602e-02,  1.06738878e-01]])

### Step 4: Select importance components and transform the feature matrix

In [60]:
total = sum(s)
temp = 0
K = 0
for i, a in enumerate(s):
    temp += a
    if temp / total > 0.9:
         K = i + 1
         break
K

3

### Step 5: Recover the original matrix

In [96]:
X_rec = np.zeros((Z.shape[0],U.shape[0]))
U_recude = U[:,0:K]
X_rec = np.dot(Z,np.transpose(U_recude))  # 还原数据（近似）
print(X_rec.shape)