In [118]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [119]:
df = pd.DataFrame([[1.,1.,1.,0,0],
[3.,3.,3.,0,0],
[4.,4.,4.,0,0],
[5.,5.,5.,0,0],
[0,2.,0,4.,4.],
[0,0,0,5.,5.],
[0,1.,0,2.,2.]], columns=['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie'])

df

Unnamed: 0,Matrix,Alien,Serenity,Casablanca,Amelie
0,1.0,1.0,1.0,0.0,0.0
1,3.0,3.0,3.0,0.0,0.0
2,4.0,4.0,4.0,0.0,0.0
3,5.0,5.0,5.0,0.0,0.0
4,0.0,2.0,0.0,4.0,4.0
5,0.0,0.0,0.0,5.0,5.0
6,0.0,1.0,0.0,2.0,2.0


In [120]:
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)


## PCA

In [199]:
# Number of components to keep.
# n_components == min(n_samples, n_features)
pca = PCA(n_components=3)

In [200]:
pca.fit(scaled_data)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [201]:
x_pca = pca.transform(scaled_data)

In [202]:
pca.components_

array([[-0.46541708, -0.43804254, -0.46541708,  0.4329507 ,  0.4329507 ],
       [-0.30297998, -0.45075325, -0.30297998, -0.55372727, -0.55372727],
       [-0.43771003,  0.77777904, -0.43771003, -0.07707014, -0.07707014]])

In [203]:
pca.explained_variance_

array([4.98181251, 0.75758588, 0.09393494])

In [204]:
pca.singular_values_

array([5.4672548 , 2.13202141, 0.75073938])

## sklearn.decomposition.TruncatedSVD

In [198]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

svd = TruncatedSVD(n_components=3)
svd.fit(scaled_data)

print(svd.explained_variance_) 
print("----------------------")
print(svd.singular_values_)  


[4.27012501 0.64935933 0.08051566]
----------------------
[5.4672548  2.13202141 0.75073938]


## scipy.sparse.linalg.svds

In [180]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs

In [205]:
# k:Number of singular values and vectors to compute. Must be 1 <= k < min(A.shape)
# u: Unitary matrix having left singular vectors as columns.
# s: The singular values.
# vt: Unitary matrix having right singular vectors as rows.
u, s, vt = svds(scaled_data, k=3)

In [211]:
print(vt)
print("----------------------")
print("reversed_arr s:", s[::-1])

[[-0.43771003  0.77777904 -0.43771003 -0.07707014 -0.07707014]
 [-0.30297998 -0.45075325 -0.30297998 -0.55372727 -0.55372727]
 [ 0.46541708  0.43804254  0.46541708 -0.4329507  -0.4329507 ]]
----------------------
reversed_arr s: [5.4672548  2.13202141 0.75073938]


## scipy.linalg.svd

In [76]:
from scipy import linalg

In [217]:
# k:Number of singular values and vectors to compute. Must be 1 <= k < min(A.shape)
# U: Unitary matrix having left singular vectors as columns.
# s: The singular values, sorted in non-increasing order.
# Vh: Unitary matrix having right singular vectors as rows.
U_, s_, Vh_ = linalg.svd(scaled_data)

In [218]:
print(s_)

[5.46725480e+00 2.13202141e+00 7.50739383e-01 5.69485276e-16
 2.55602307e-32]


## numpy.linalg.svd

In [219]:
# u: Unitary array(s)
# s: Vector(s) with the singular values, within each vector sorted in descending order.
# vh: Unitary array(s).
u__, s__, vh__ = np.linalg.svd(scaled_data)

In [220]:
print(s__)

[5.46725480e+00 2.13202141e+00 7.50739383e-01 5.69485276e-16
 2.55602307e-32]


## Other

In [244]:
np.cov(scaled_data)

array([[ 0.03600116,  0.10408222,  0.13812275,  0.17216328, -0.18119989,
        -0.1795499 , -0.08961963],
       [ 0.10408222,  0.52801126,  0.73997578,  0.95194031, -0.77723847,
        -1.10421072, -0.44256038],
       [ 0.13812275,  0.73997578,  1.0409023 ,  1.34182882, -1.07525776,
        -1.56654113, -0.61903076],
       [ 0.17216328,  0.95194031,  1.34182882,  1.73171733, -1.37327705,
        -2.02887154, -0.79550114],
       [-0.18119989, -0.77723847, -1.07525776, -1.37327705,  1.19469821,
         1.55651615,  0.65575881],
       [-0.1795499 , -1.10421072, -1.56654113, -2.02887154,  1.55651615,
         2.40300882,  0.91964833],
       [-0.08961963, -0.44256038, -0.61903076, -0.79550114,  0.65575881,
         0.91964833,  0.37130478]])

In [242]:
# Covariance
def cov(x, y):
    xbar, ybar = x.mean(), y.mean()
    return np.sum((x - xbar)*(y - ybar))/(len(x) - 1)

# Covariance matrix
def cov_mat(X):
    return np.array([[cov(X[0], X[0]), cov(X[0], X[1]), cov(X[0], X[2]), cov(X[0], X[3]), cov(X[0], X[4]), cov(X[0], X[5]), cov(X[0], X[6])], \
                     [cov(X[1], X[0]), cov(X[1], X[1]), cov(X[1], X[2]), cov(X[1], X[3]), cov(X[1], X[4]), cov(X[1], X[5]), cov(X[1], X[6])]                    
                    ])

In [245]:
# Calculate covariance matrix 
cov_mat(scaled_data)

array([[0.03600116, 0.10408222, 0.13812275],
       [0.10408222, 0.52801126, 0.73997578]])