实现基于协方差矩阵特征值分解的 PCA（主成分分析）

数据集使用从 R 语言导出的 `Hitters.csv`

In [31]:
import numpy as np
import pandas as pd

In [15]:
# Allow up to 25 columns in rendered DataFrames so no column of the table
# is elided when it is displayed.
pd.set_option('display.max_columns', 25)
# Load the Hitters data set from the working directory.
ht = pd.read_csv('Hitters.csv')
# Preview the first rows (this trailing expression is what the cell renders).
ht.head()

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [16]:
# Report the table dimensions as (rows, columns).
print(ht.shape)
# Count the missing values in each column. DataFrame.sum() reduces column by
# column; np.sum(DataFrame) instead goes through the axis=None path, which
# recent pandas deprecates and which is slated to return one overall scalar.
print(ht.isna().sum())

(322, 21)
Unnamed: 0     0
AtBat          0
Hits           0
HmRun          0
Runs           0
RBI            0
Walks          0
Years          0
CAtBat         0
CHits          0
CHmRun         0
CRuns          0
CRBI           0
CWalks         0
League         0
Division       0
PutOuts        0
Assists        0
Errors         0
Salary        59
NewLeague      0
dtype: int64


In [17]:
# Data cleaning: discard the player-name column and the string-typed
# categorical columns, then remove every row that still has a missing value.
ht = ht.drop(columns=['Unnamed: 0', 'League', 'Division', 'NewLeague']).dropna()

In [18]:
# Check the result of the cleaning step: print the (rows, columns) shape
# of the remaining table ...
print(ht.shape)
# ... and preview its first rows (rendered as the cell output).
ht.head()

(263, 17)


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0


In [50]:
# PCA via eigendecomposition of the covariance matrix.

# Number of principal components to keep (slicing below tolerates fewer features).
n_components = 10

# Convert the DataFrame into an ndarray of shape (n_samples, n_features).
nht = np.array(ht)
n_samples, n_features = nht.shape

# Center every feature on zero. An (n_samples, n_features) array broadcasts
# against the (n_features,) vector of column means, e.g. (263, 17) with (17,);
# the transposed layout (17, 263) would not broadcast against (17,).
nht = nht - np.mean(nht, axis=0)

# Covariance matrix of the centered data, using the 1/n normalisation
# (this only rescales the eigenvalues, not the principal directions).
cov_matrix = np.dot(nht.T, nht) / n_samples

# The covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and orthonormal eigenvectors, which removes the need for an abs() workaround
# against spurious complex or sign noise from the general-purpose eig.
eig_val, eig_vec = np.linalg.eigh(cov_matrix)

# Pair each eigenvalue with its eigenvector (the columns of eig_vec, each of
# shape (n_features,)) and order the pairs from largest to smallest eigenvalue.
eig_pairs = sorted(zip(eig_val, eig_vec.T), key=lambda pair: pair[0], reverse=True)

# Stack the leading eigenvectors as rows: shape (n_components, n_features).
feature = np.array([vector for _, vector in eig_pairs[:n_components]])

# Project the centered data onto the selected components; the result has
# shape (n_components, n_samples). Eigenvector signs are arbitrary, so a
# component may come out negated relative to another solver's result.
data = np.dot(feature, nht.T)
data

array([[ 7.81852512e+02, -1.06379373e+03,  3.14213662e+03, ...,
        -1.05055311e+03,  6.27319995e+02,  2.40721921e+03],
       [-2.90332933e+01,  3.00268362e+02, -3.44805847e+02, ...,
        -1.30283993e+02,  7.50712813e+02,  2.57600706e+02],
       [-3.61435034e+02, -5.32034748e+02, -4.64737975e+01, ...,
         1.98185742e+02, -7.94595174e+02, -1.92732478e+01],
       ...,
       [-1.47387615e+02,  1.92896870e+01,  4.59593441e+01, ...,
        -5.16353969e+01, -1.92987279e+01,  2.62315903e+01],
       [-5.61475193e+01, -3.73025166e+01,  8.79221094e+01, ...,
        -2.00852119e+00,  4.70608871e+01,  9.13974221e+01],
       [ 6.97990245e+00, -6.69258183e+00, -2.23238017e+00, ...,
         1.13823951e+01,  1.53232100e+01,  3.10258677e+01]])