In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
## Load the Iris dataset via scikit-learn's load_iris helper
from sklearn.datasets import load_iris
data = load_iris()  # dataset object; the cells below index it by key ("data", "feature_names", "target")

In [None]:
# Displaying the whole dataset object prints every array it holds; list its
# keys instead to see what is available without flooding the output.
data.keys()

In [None]:
# Preview only the first rows of the feature matrix rather than dumping the
# entire array (same preview size as the PCA projection shown further down).
data["data"][:10]

In [None]:
# Show the feature names stored in the loaded dataset object.
data["feature_names"]

In [None]:
# Wrap the raw (not yet scaled) measurements in a DataFrame with snake_case
# column names: sepal_length, sepal_width, petal_length, petal_width.
iris_df = pd.DataFrame(
    data=data["data"],
    columns=[
        f"{part}_{dimension}"
        for part in ("sepal", "petal")
        for dimension in ("length", "width")
    ],
)

In [None]:
from sklearn import preprocessing

# Standardize the features: learn the scaling parameters from the raw
# measurements, then apply them to that same data.
std_scale = preprocessing.StandardScaler()
std_scale.fit(data['data'])
X_std = std_scale.transform(data['data'])

In [None]:
# Rebuild iris_df from the standardized feature matrix, keeping the same
# four snake_case column names.
iris_df = pd.DataFrame(
    data=X_std,
    columns="sepal_length sepal_width petal_length petal_width".split(),
)

In [None]:
from sklearn.decomposition import PCA
from matplotlib.ticker import MaxNLocator

# Fit a PCA that keeps every component so the full variance spectrum can be
# inspected. Only the fitted model is needed here, so call fit() instead of
# fit_transform(), whose projected output was being thrown away.
pca = PCA(random_state=1004)
pca.fit(iris_df)

## fraction of variance explained by each principal component
print(pca.explained_variance_ratio_)
# [0.92461872 0.05306648 0.01710261 0.00521218] w/o Scaler
# [0.72962445 0.22850762 0.03668922 0.00517871] w/  StandardScaler

## iris_df holds the standardized features at this point, so principal
## components 1 & 2 explain about 95.8% of the variance
## (the 97.8% figure applies to the unscaled data).
plt.rcParams['figure.figsize'] = (7, 7)
ax = plt.figure().gca()
# One bar per component: force integer ticks on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# explained_variance_ratio_ holds fractions in [0, 1]; scale by 100 so the
# bar heights agree with the "%" in the y-axis label.
ax.bar(range(1, iris_df.shape[1] + 1), pca.explained_variance_ratio_ * 100)
ax.set_xlabel("number of Principal Components", fontsize=12)
ax.set_ylabel("% of Variance Explained", fontsize=12)
plt.show()

In [None]:
# Build the covariance matrix of the standardized features and compute its
# eigenvalues and eigenvectors.
import numpy as np

# np.cov treats each row as a variable, hence the transpose of the
# (samples x features) matrix.
cov_mat = np.cov(X_std.T)
eigenvalues, eigenvectors = np.linalg.eig(cov_mat)

# np.linalg.eig does not return eigenvalues in any guaranteed order, and
# np.sort returns a sorted copy (the original call discarded it). Reorder the
# eigenvalues in descending order and permute the eigenvector columns with the
# same index so that eigenvectors[:, i] still belongs to eigenvalues[i].
descending = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending]
eigenvectors = eigenvectors[:, descending]

print("val: ", eigenvalues)
print("vec: ", eigenvectors)

In [None]:
# Share of the total variance carried by each eigenvalue:
# every eigenvalue divided by the sum of all eigenvalues.
sums = eigenvalues.sum()
explained_variances = list(eigenvalues / sums)

print(explained_variances)

In [None]:
## Reduce the data to two principal components
pca = PCA(
    n_components=2,
    random_state=42,
)
iris_pca = pca.fit_transform(iris_df)
iris_pca[:10]  # preview the first ten projected samples

In [None]:
# Integer target code -> species name.
species_map_dict = dict(enumerate(['setosa', 'versicolor', 'virginica']))

# One row per sample: its two principal-component coordinates plus the species
# name looked up element by element from the integer target code.
iris_pca_df = pd.DataFrame(
    dict(
        pc_1=iris_pca[:, 0],
        pc_2=iris_pca[:, 1],
        species=[species_map_dict.get(code) for code in data['target']],
    )
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter the samples in the plane of the first two principal components;
# species is encoded twice, by colour (hue) and by marker shape (style).
plt.rcParams['figure.figsize'] = (7, 7)
scatter_ax = sns.scatterplot(
    data=iris_pca_df,
    x='pc_1',
    y='pc_2',
    hue='species',
    style='species',
    s=100,
)

# Label the figure through the Axes returned by seaborn.
scatter_ax.set_title('PCA result of IRIS dataset')
scatter_ax.set_xlabel('Principal Component 1', fontsize=14)
scatter_ax.set_ylabel('Principal Component 2', fontsize=14)
plt.show()