# 12.1.2 PCA主成分分析代码实现

1.二维空间降维Python代码实现

In [1]:
import numpy as np

# Three sample points in 2-D; each row is one sample, each column one feature.
points = [[1, 1], [2, 2], [3, 3]]
X = np.array(points)
X

array([[1, 1],
       [2, 2],
       [3, 3]])

In [2]:
import pandas as pd

# The same data can also be built with pandas; the result is equivalent.
X = pd.DataFrame(data=[[1, 1], [2, 2], [3, 3]])
X

Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3


In [3]:
from sklearn.decomposition import PCA

# Dimensionality reduction: project the 2-D data down to 1-D.
pca = PCA(n_components=1).fit(X)  # fit() returns the fitted estimator itself
X_transformed = pca.transform(X)  # apply the fitted projection and keep the result

X_transformed  # display the reduced data

array([[-1.41421356],
       [ 0.        ],
       [ 1.41421356]])

In [4]:
# Check the shape of the reduced data (rows = samples, columns = remaining dimensions).
X_transformed.shape

(3, 1)

In [5]:
# Inspect the coefficients of the fitted projection:
# one row per principal component, one column per original feature.
pca.components_

array([[0.70710678, 0.70710678]])

In [6]:
# Show the principal component as a linear combination of the two original features.
a, b = pca.components_[0]
# print() applies str() to each argument and joins them with single spaces.
print(a, '* X +', b, '* Y')

0.7071067811865476 * X + 0.7071067811865475 * Y


这一结果与12.1.1节中得到的线性组合一致。

2.三维空间降维Python代码实现

In [7]:
import pandas as pd

# Five samples with three features: age (years), debt ratio, monthly income (yuan).
X = pd.DataFrame(
    data=[
        [45, 0.8, 9120],
        [40, 0.12, 2600],
        [38, 0.09, 3042],
        [30, 0.04, 3300],
        [39, 0.21, 3500],
    ],
    columns=['年龄(岁)', '负债比率', '月收入(元)'],
)
X

Unnamed: 0,年龄(岁),负债比率,月收入(元)
0,45,0.8,9120
1,40,0.12,2600
2,38,0.09,3042
3,30,0.04,3300
4,39,0.21,3500


In [8]:
# The three features differ greatly in magnitude, so standardize them first
# (StandardScaler rescales each column to zero mean and unit variance).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_new = scaler.fit_transform(X)

X_new  # display the standardized data

array([[ 1.36321743,  1.96044639,  1.98450514],
       [ 0.33047695, -0.47222431, -0.70685302],
       [-0.08261924, -0.57954802, -0.52440206],
       [-1.73500401, -0.75842087, -0.41790353],
       [ 0.12392886, -0.15025319, -0.33534653]])

In [9]:
from sklearn.decomposition import PCA

# Reduce the standardized 3-D data to 2-D.
pca = PCA(n_components=2).fit(X_new)  # fit() returns the fitted estimator itself
X_transformed = pca.transform(X_new)  # apply the fitted projection and keep the result

X_transformed  # display the reduced data

array([[ 3.08724247,  0.32991205],
       [-0.52888635, -0.74272137],
       [-0.70651782, -0.33057258],
       [-1.62877292,  1.05218639],
       [-0.22306538, -0.30880449]])

In [10]:
# Inspect the coefficients of the fitted projection:
# one row per principal component, one column per original feature.
pca.components_

array([[ 0.52952108,  0.61328179,  0.58608264],
       [-0.82760701,  0.22182579,  0.51561609]])

In [11]:
# Print each principal component as a linear combination of the original features.
# The feature names come from X's columns instead of being retyped by hand, so the
# labels cannot drift out of sync with the data that was standardized and fed to PCA.
dim = list(X.columns)
for component in pca.components_:
    # Pair every coefficient with its feature name, e.g. "0.52 * <feature>".
    terms = [str(weight) + ' * ' + name for weight, name in zip(component, dim)]
    print(" + ".join(terms))

0.5295210839165538 * 年龄(岁) + 0.6132817922410683 * 负债比率 + 0.5860826434841948 * 月收入(元)
-0.8276070105929828 * 年龄(岁) + 0.2218257919336094 * 负债比率 + 0.5156160917294705 * 月收入(元)


In [12]:
# To avoid showing the actual feature names, label the features generically instead.
dim = ['X', 'Y', 'Z']
for component in pca.components_:
    # Pair every coefficient with its placeholder label and join the terms with " + ".
    print(" + ".join(str(weight) + ' * ' + label for weight, label in zip(component, dim)))

0.5295210839165538 * X + 0.6132817922410683 * Y + 0.5860826434841948 * Z
-0.8276070105929828 * X + 0.2218257919336094 * Y + 0.5156160917294705 * Z
