In [None]:
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the iris dataset: measurements live in iris.data, class ids in iris.target.
iris = load_iris()

# Put the four measurement columns into a DataFrame under snake_case names and
# append the class id as a 'target' column.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
irisDF = pd.DataFrame(data=iris.data, columns=columns)
irisDF = irisDF.assign(target=iris.target)
irisDF.head(3)

In [None]:
# Scatter sepal length against sepal width, one marker shape per class id (0, 1, 2).
markers = ['^', 's', 'o']

for class_id, marker in enumerate(markers):
    # Select the rows of this class once and read both axes from that selection.
    class_rows = irisDF.loc[irisDF['target'] == class_id]
    plt.scatter(
        class_rows['sepal_length'],
        class_rows['sepal_width'],
        marker=marker,
        label=iris.target_names[class_id],
    )

plt.legend()
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column except the trailing 'target' before running PCA:
# the features must share one scale for the PCA computation.
iris_features = irisDF.iloc[:, :-1]
iris_scaled = StandardScaler().fit_transform(iris_features)

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the standardized features, then project them onto it.
pca = PCA(n_components=2).fit(iris_scaled)
iris_pca = pca.transform(iris_scaled)

print(iris_pca.shape)

In [None]:
# Wrap the projected array in a DataFrame and attach the class ids.
# The data has been reduced from 4 dimensions to 2.
pca_columns = ['pca_component_1', 'pca_component_2']
irisDF_pca = pd.DataFrame(data=iris_pca, columns=pca_columns).assign(target=iris.target)
irisDF_pca.head(3)

In [None]:
# Scatter the two PCA components against each other, one marker shape per class id.
markers = ['^', 's', 'o']

for class_id, marker in enumerate(markers):
    # Select the rows of this class once and read both components from that selection.
    class_rows = irisDF_pca.loc[irisDF_pca['target'] == class_id]
    plt.scatter(
        class_rows['pca_component_1'],
        class_rows['pca_component_2'],
        marker=marker,
        label=iris.target_names[class_id],
    )

plt.legend()
plt.xlabel('pca_component_1')
plt.ylabel('pca_component_2')
plt.show()

In [None]:
print(pca.explained_variance_ratio_) # share of the total variance explained by each principal component

**원본 데이터와 PCA 변환된 데이터 기반에서 예측 성능 비교**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Baseline: 3-fold cross-validated accuracy of a random forest on the original
# (untransformed) iris measurements.
rcf = RandomForestClassifier(random_state=156)
scores = cross_val_score(estimator=rcf, X=iris.data, y=iris.target, scoring='accuracy', cv=3)
print(scores)
print(scores.mean())

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Score the forest on PCA-reduced data. Scaling and PCA are placed inside a
# pipeline so that cross_val_score refits them on the training part of every
# fold; fitting them once on all rows beforehand would let each held-out fold
# influence its own preprocessing.
pca_rcf = make_pipeline(StandardScaler(), PCA(n_components=2), rcf)
scores_pca = cross_val_score(pca_rcf, iris.data, iris.target, scoring='accuracy', cv=3)
print(scores_pca)
print(np.mean(scores_pca))
# Author's note: PCA represents the data more clearly when there are many features.

# **PCA for credit card dataset**

In [None]:
# Change the kernel's working directory so that the relative file name passed to
# pd.read_excel in the following cell resolves.
# NOTE(review): the path looks Colab-specific (/content/drive) and presumably needs
# Google Drive to be mounted first — confirm, or make the data directory configurable.
%cd /content/drive/MyDrive/Colab\ Notebooks/머신러닝/6장

In [None]:
import pandas as pd

# Read the 'Data' sheet of the credit-card workbook; header=1 takes the column
# names from the second row of the sheet.
credit_card_file = 'pca_credit_card.xls'
df = pd.read_excel(io=credit_card_file, header=1, sheet_name='Data')
print(df.shape)
df.head(3)

In [None]:
# Rename 'PAY_0' to 'PAY_1' and shorten the label column's name to 'default'.
column_renames = {'PAY_0': 'PAY_1', 'default payment next month': 'default'}
df.rename(columns=column_renames, inplace=True)

# Features are every column except 'ID' and the label; the label is the target.
X_features = df.drop(columns=['ID', 'default'])
Y_target = df['default']

In [None]:
# Show how many rows carry each value of the 'default' label.
Y_target.value_counts()

In [None]:
# Print the feature frame's summary (columns, non-null counts, dtypes).
X_features.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Pairwise correlation between the features, drawn as an annotated heatmap.
# Highly correlated features are where dimensionality reduction is most efficient.
corr = X_features.corr()
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr, annot=True, fmt='.1g', ax=ax)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# The six BILL_AMT columns, which are highly correlated with one another.
cols_bill = ['BILL_AMT{}'.format(month) for month in range(1, 7)]
print('대상 속성명:', cols_bill)

# Standardize those columns, then only fit (no transform) a 2-component PCA
# in order to inspect the explained variance.
scaler = StandardScaler()
df_cols_scaled = scaler.fit_transform(X_features[cols_bill])
pca = PCA(n_components=2).fit(df_cols_scaled)

# Variance retained when the 6 features are reduced to 2 components.
print('PCA Component별 변동성:', pca.explained_variance_ratio_)
# Open question: can the PCA components be used as model features?

**원본 데이터로 분류**

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline on all original features: 300-tree random forest, 3-fold CV accuracy.
rcf = RandomForestClassifier(n_estimators=300, random_state=156)
scores = cross_val_score(estimator=rcf, X=X_features, y=Y_target, scoring='accuracy', cv=3)

print('CV=3 인 경우의 개별 Fold세트별 정확도:', scores)
print('평균 정확도:{0:.4f}'.format(scores.mean()))

**PCA 데이터로 분류**

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Score the forest on a 6-component PCA of all standardized features.
# The scaler and PCA are chained with the classifier in a pipeline so that
# cross_val_score refits them on the training part of every fold; fitting them
# once on all rows beforehand would let each held-out fold influence its own
# preprocessing. cross_val_score works on clones, so `scaler` and `pca` remain
# unfitted after this cell.
scaler = StandardScaler()
pca = PCA(n_components=6)
pca_rcf = make_pipeline(scaler, pca, rcf)
scores_pca = cross_val_score(pca_rcf, X_features, Y_target, scoring='accuracy', cv=3)

print('CV=3 인 경우의 PCA 변환된 개별 Fold세트별 정확도:',scores_pca)
print('PCA 변환 데이터 셋 평균 정확도:{0:.4f}'.format(np.mean(scores_pca)))
# Author's note: classification can work well with only 6 components, which is
# useful when many features are highly correlated — compare with the printed scores.

# **Practice**

In [None]:
# Change the kernel's working directory so that the relative file name passed to
# pd.read_excel in the following cell resolves.
# NOTE(review): the path looks Colab-specific (/content/drive) and presumably needs
# Google Drive to be mounted first — confirm, or make the data directory configurable.
%cd /content/drive/MyDrive/Colab\ Notebooks/머신러닝/6장

In [None]:
import pandas as pd

# Read the 'Data' sheet of the credit-card workbook; header=1 takes the column
# names from the second row of the sheet.
credit_card_file = 'pca_credit_card.xls'
df = pd.read_excel(io=credit_card_file, header=1, sheet_name='Data')
print(df.shape)
df.head(3)

In [None]:
# Rename 'PAY_0' to 'PAY_1' and shorten the label column's name to 'default'.
column_renames = {'PAY_0': 'PAY_1', 'default payment next month': 'default'}
df.rename(columns=column_renames, inplace=True)

# Features are every column except 'ID' and the label; the label is the target.
X_features = df.drop(columns=['ID', 'default'])
Y_target = df['default']

In [None]:
# Show how many rows carry each value of the 'default' label.
Y_target.value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Pairwise correlation between the features, drawn as an annotated heatmap.
# Highly correlated features are where dimensionality reduction is most efficient.
corr = X_features.corr()
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr, annot=True, fmt='.1g', ax=ax)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# The six BILL_AMT columns, which are highly correlated with one another.
cols_bill = ['BILL_AMT'+str(i) for i in range(1, 7)]
print('대상 속성명:', cols_bill)

# Standardize the selected columns before PCA.
scaler = StandardScaler()
df_cols_scaled = scaler.fit_transform(X_features[cols_bill])

# Keep the fitted estimator under its own name so that explained_variance_ratio_
# and components_ stay available; previously the name `pca` was overwritten by
# the transform output and the estimator was lost.
pca_model = PCA(n_components=2)
pca_model.fit(df_cols_scaled)
# fit() alone only learns the components; transform() produces the projected data.
# `pca` holds that projected array, as the following cells expect.
pca = pca_model.transform(df_cols_scaled)

In [None]:
# Rebind `pca` from the projected array to a DataFrame with named component columns.
# The six BILL_AMT dimensions are now reduced to 2.
pca_columns = ['pca_component_{}'.format(k) for k in (1, 2)]
pca = pd.DataFrame(data=pca, columns=pca_columns)
pca.head(3)

In [None]:
# Feature frame with the six BILL_AMT columns removed.
X_train = X_features.drop(columns=cols_bill)
X_train

In [None]:
# Combine the non-BILL_AMT features with the two PCA component columns.
# The frame is rebuilt from X_features rather than from X_train itself, so
# re-running this cell does not append the PCA columns a second time.
X_train = pd.concat([X_features.drop(columns=cols_bill), pca], axis=1)
X_train

In [None]:
# Pairwise correlation matrix of the columns of X_train.
corr2 = X_train.corr()

In [None]:
# Draw the correlation matrix of X_train as an annotated heatmap.
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr2, annot=True, fmt='.1g', ax=ax)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 3-fold CV accuracy of a 300-tree random forest on X_train.
# random_state is fixed (same seed as the other forests in this notebook) so the
# scores are reproducible across runs.
# NOTE(review): the PCA columns in X_train were fitted on all rows before CV, so
# the fold scores may be slightly optimistic — consider fitting PCA inside a pipeline.
rcf = RandomForestClassifier(n_estimators=300, random_state=156)
scores = cross_val_score(rcf, X_train, Y_target, scoring='accuracy', cv=3)

print('CV=3 인 경우의 개별 Fold세트별 정확도:',scores)
print('평균 정확도:{0:.4f}'.format(np.mean(scores)))