In [1]:
import numpy as np
from sklearn import decomposition, datasets
from sklearn.preprocessing import StandardScaler
# Load the breast-cancer dataset and standardize its feature matrix with
# StandardScaler before running PCA on it.
data = datasets.load_breast_cancer()
cancer = StandardScaler().fit_transform(data.data)
cancer.shape

(569, 30)

In [2]:
# Shape of the standardized feature matrix: 569 rows by 30 columns.
cancer.shape

(569, 30)

In [3]:
# Keep a handle on the raw (unscaled) feature matrix and show the first
# column of the first ten rows, for comparison with the standardized values.
before_transformation = data.data
before_transformation[:10, :1]

array([[17.99],
       [20.57],
       [19.69],
       [11.42],
       [20.29],
       [12.45],
       [18.25],
       [13.71],
       [13.  ],
       [12.46]])

In [4]:
# Same slice (first ten rows, first column) after standardization.
cancer[:10, :1]

array([[ 1.09706398],
       [ 1.82982061],
       [ 1.57988811],
       [-0.76890929],
       [ 1.75029663],
       [-0.47637467],
       [ 1.17090767],
       [-0.11851678],
       [-0.32016686],
       [-0.47353452]])

In [5]:
# Covariance matrix of the features. np.cov treats each ROW as a variable by
# default, so the (samples, features) matrix is transposed before the call.
covariance_matrix = np.cov(cancer.T)

In [6]:
# One row/column per feature: a 30 x 30 matrix.
covariance_matrix.shape

(30, 30)

In [7]:
# For contrast: with the default rowvar=True, np.cov treats every row (sample)
# as a variable and returns a 569 x 569 matrix, which is not what PCA needs.
np.cov(cancer).shape

(569, 569)

In [8]:
# Eigen-decomposition of the covariance matrix: eig_val_cov holds the
# eigenvalues, and column i of eig_vec_cov is the eigenvector belonging to
# eig_val_cov[i]. np.linalg.eig does not return the eigenvalues sorted.
# NOTE(review): the covariance matrix is symmetric, so np.linalg.eigh looks
# like the better fit (real eigenvalues in ascending order) — confirm before
# swapping, since eigenvector signs, and thus the projected values, may flip.
eig_val_cov, eig_vec_cov = np.linalg.eig(covariance_matrix)

In [9]:
# Inspect the eigenvalues; note that they are not in descending order
# (the smallest values are shuffled), so they are sorted explicitly later.
eig_val_cov

array([1.33049908e+01, 5.70137460e+00, 2.82291016e+00, 1.98412752e+00,
       1.65163324e+00, 1.20948224e+00, 6.76408882e-01, 4.77456255e-01,
       4.17628782e-01, 3.51310875e-01, 2.94433153e-01, 2.61621161e-01,
       2.41782421e-01, 1.57286149e-01, 9.43006956e-02, 8.00034045e-02,
       5.95036135e-02, 5.27114222e-02, 4.95647002e-02, 1.33279057e-04,
       7.50121413e-04, 1.59213600e-03, 6.91261258e-03, 8.19203712e-03,
       1.55085271e-02, 1.80867940e-02, 2.43836914e-02, 2.74877113e-02,
       3.12142606e-02, 3.00256631e-02])

In [10]:
# One eigenvalue per feature: 30 values.
eig_val_cov.shape

(30,)

In [11]:
# Eigenvectors are stored as the columns of a 30 x 30 matrix.
eig_vec_cov.shape

(30, 30)

In [12]:
# Pair each eigenvalue's magnitude with its eigenvector. Eigenvectors are the
# columns of eig_vec_cov, so iterating over the transpose yields them in order.
eig_pairs = [
    (np.abs(eigenvalue), eigenvector)
    for eigenvalue, eigenvector in zip(eig_val_cov, eig_vec_cov.T)
]

In [13]:
# Peek at the first three (eigenvalue, eigenvector) pairs.
eig_pairs[:3]

[(13.304990794374561,
  array([0.21890244, 0.10372458, 0.22753729, 0.22099499, 0.14258969,
         0.23928535, 0.25840048, 0.26085376, 0.13816696, 0.06436335,
         0.20597878, 0.01742803, 0.21132592, 0.20286964, 0.01453145,
         0.17039345, 0.15358979, 0.1834174 , 0.04249842, 0.10256832,
         0.22799663, 0.10446933, 0.23663968, 0.22487053, 0.12795256,
         0.21009588, 0.22876753, 0.25088597, 0.12290456, 0.13178394])),
 (5.701374603726145,
  array([-0.23385713, -0.05970609, -0.21518136, -0.23107671,  0.18611302,
          0.15189161,  0.06016536, -0.0347675 ,  0.19034877,  0.36657547,
         -0.10555215,  0.08997968, -0.08945723, -0.15229263,  0.20443045,
          0.2327159 ,  0.19720728,  0.13032156,  0.183848  ,  0.28009203,
         -0.21986638, -0.0454673 , -0.19987843, -0.21935186,  0.17230435,
          0.14359317,  0.09796411, -0.00825724,  0.14188335,  0.27533947])),
 (2.822910155006231,
  array([-0.00853124,  0.0645499 , -0.00931422,  0.02869953, -0.1042919 

In [14]:
# Rank the (eigenvalue, eigenvector) pairs from largest to smallest eigenvalue,
# then print the eigenvalues to confirm the ordering.
sorted_pairs = sorted(eig_pairs, key=lambda pair: pair[0], reverse=True)
for eigenvalue, _eigenvector in sorted_pairs:
    print(eigenvalue)

13.304990794374561
5.701374603726145
2.822910155006231
1.9841275177301982
1.651633242330119
1.209482239802972
0.6764088817009056
0.47745625468950803
0.41762878210781734
0.3513108748817329
0.29443315349116506
0.261621161366121
0.24178242132831373
0.15728614921759285
0.09430069560105592
0.08000340447737676
0.059503613530431945
0.052711422210148115
0.04956470021298144
0.03121426055306657
0.0300256630904285
0.027487711338904305
0.024383691354590953
0.018086793984305315
0.015508527134418915
0.008192037117606788
0.006912612579184366
0.0015921360011975747
0.000750121412719186
0.00013327905666397818


In [15]:
# Number of principal components to keep; change this to project onto more
# (or fewer) dimensions.
N_COMPONENTS = 2

# Projection matrix W: the eigenvectors of the N_COMPONENTS largest eigenvalues
# stacked as columns, giving shape (n_features, N_COMPONENTS). column_stack
# turns each 1-D eigenvector into a column, so no hard-coded feature count
# (previously reshape(30, 1)) is needed.
matrix_w = np.column_stack([pair[1] for pair in sorted_pairs[:N_COMPONENTS]])

In [16]:
# Transposed projection matrix: one row per kept component, one column per feature.
matrix_w.T.shape

(2, 30)

In [17]:
# Each eigenvector is a flat 1-D array with one entry per feature.
sorted_pairs[0][1].shape

(30,)

In [18]:
# Project the standardized data onto the selected components:
# (components, features) @ (features, samples) -> (components, samples).
transformed = matrix_w.T @ cancer.T

In [19]:
# Components along the rows, samples along the columns.
transformed.shape

(2, 569)

In [20]:
# Transposed view: one row per sample, one column per component.
transformed.T.shape

(569, 2)

In [21]:
# Coordinates of the first sample in the 2-D principal-component space.
transformed.T[0]

array([9.19283683, 1.94858307])

In [22]:
# Validate the manual PCA result against scikit-learn's PCA implementation

In [23]:
# Reference implementation: scikit-learn's PCA keeping two components.
pca = decomposition.PCA(n_components=2)
# Standardize the RAW features here. `cancer` is already standardized, so
# re-scaling it would be a redundant second pass; starting from data.data keeps
# this check independent of the manual pipeline's preprocessing.
x_std = StandardScaler().fit_transform(data.data)


In [24]:
# First sample's coordinates according to scikit-learn; this should agree with
# transformed.T[0] from the manual computation. Eigenvector signs are
# arbitrary, so a per-component sign flip would still count as a match.
pca.fit_transform(x_std)[0]

array([9.19283683, 1.94858307])