### 由課本的技巧來進行，以 Bank 為範例進行資料前處理，整理後的資料，以 CSV 的格式輸出
 

In [129]:
import pandas as pd
import numpy as np
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# The bank file is semicolon-delimited; the header row is inferred
# (pandas' default behaviour for read_csv).
data = pd.read_csv("../data/bank/bank.csv", sep=";")
print("資料集大小:", data.shape, "\n")
print("欄位:", list(data.columns), "\n")
print(data.head(), "\n")

# Treat the literal string 'unknown' as a missing value, then report only
# the columns that contain at least one missing entry.
data = data.replace("unknown", np.nan)
null_counts = data.isna().sum()
print("Null Data:\n", null_counts.loc[null_counts > 0])

資料集大小: (4521, 17) 

欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'] 

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unkno

#### job, education, contact, poutcome 具有 unknown 值需額外處理

In [130]:
## Encode the binary yes/no columns (including the target y) as 1/0.
binary_map = {'yes': 1, 'no': 0}
for binary_col in ('y', 'default', 'housing', 'loan'):
    data[binary_col] = data[binary_col].map(binary_map)

data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,,3,jun,199,4,-1,0,,0
4,59,blue-collar,married,secondary,0,0,1,0,,5,may,226,1,-1,0,,0


In [131]:
# Handle the columns whose 'unknown' entries were turned into NaN earlier.
# job, education and contact are imputed with their most frequent value (mode).
# poutcome has so many unknown rows that 'unknown' is kept as its own category.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy="most_frequent")

# Columns to fill with the mode
nan_columns = ['job', 'education', 'contact']
data[nan_columns] = imp.fit_transform(data[nan_columns])

# Restore an explicit 'unknown' label for poutcome so it really is a category
# of its own; otherwise the column stays a mix of strings and NaN and the
# later encoding step has to cope with missing values implicitly.
data['poutcome'] = data['poutcome'].fillna('unknown')

# Split features (X) and target (y)
X = data.drop(columns=['y'])
y = data.y

In [132]:
# Integer-encode every categorical (object-dtype) feature column.
# Note: LabelEncoder assigns one integer code per category (label encoding);
# it does not create one-hot indicator columns.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

# The same encoder object is refit on each column in turn.
for col in categorical_columns:
    encoded = le.fit_transform(X[col])
    X[col] = encoded

# Remove the time-related columns
X = X.drop(columns=["duration", "month", "day"])
print(list(X.columns))
X.head()

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'campaign', 'pdays', 'previous', 'poutcome']


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,pdays,previous,poutcome
0,30,10,1,0,0,1787,0,0,0,1,-1,0,3
1,33,7,1,1,0,4789,1,1,0,1,339,4,0
2,35,4,2,2,0,1350,1,0,0,1,330,1,0
3,30,4,1,2,0,1476,1,1,0,4,-1,0,3
4,59,1,1,1,0,0,1,0,0,1,-1,0,3


In [133]:
from sklearn.preprocessing import StandardScaler

# Standardize all feature columns with a default StandardScaler:
# fit the scaler on X, then apply it to the same X.
ss = StandardScaler()
scaled_data = ss.fit(X).transform(X)

# Preview the first five scaled rows
scaled_data[:5]

array([[-1.05626965,  1.7657156 , -0.24642938, -1.75887388, -0.1307588 ,
         0.12107186, -1.14205138, -0.42475611, -0.26707118, -0.57682947,
        -0.4072183 , -0.32041282,  0.44441328],
       [-0.77258281,  0.82777644, -0.24642938, -0.22760702, -0.1307588 ,
         1.1186443 ,  0.87561735,  2.3542922 , -0.26707118, -0.57682947,
         2.98904408,  2.04173372, -2.57996071],
       [-0.58345826, -0.11016273,  1.42139579,  1.30365984, -0.1307588 ,
        -0.02414438,  0.87561735, -0.42475611, -0.26707118, -0.57682947,
         2.89914302,  0.27012381, -2.57996071],
       [-1.05626965, -0.11016273, -0.24642938,  1.30365984, -0.1307588 ,
         0.01772575,  0.87561735,  2.3542922 , -0.26707118,  0.38796743,
        -0.4072183 , -0.32041282,  0.44441328],
       [ 1.68603644, -1.04810189, -0.24642938, -0.22760702, -0.1307588 ,
        -0.47275291,  0.87561735, -0.42475611, -0.26707118, -0.57682947,
        -0.4072183 , -0.32041282,  0.44441328]])

In [134]:
from sklearn.decomposition import PCA
# Fit PCA on the standardized feature matrix and keep 11 components;
# X_pca holds the transformed data returned by fit_transform.
pca = PCA(n_components=11)
X_pca = pca.fit_transform(scaled_data)