### 依課本介紹的技巧，以 Bank 資料集為範例進行資料前處理，並將整理後的資料以 CSV 格式輸出
 

In [154]:
import pandas as pd
import warnings

# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Read the semicolon-separated bank dataset; the header row is inferred.
data = pd.read_csv("../data/bank/bank.csv", sep=";", header="infer")

# Quick overview: shape, column names, first rows, and null count per column.
print("資料集大小:", data.shape, "\n")
print("欄位:", list(data.columns), "\n")
print(data.head(), "\n")
print("Null Data:\n", data.isna().sum())

資料集大小: (4521, 17) 

欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'] 

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unkno

In [155]:
# Count how many "unknown" placeholder values each column contains.
unknown_counts = {}
for col in data.columns:
    # Only string (object-dtype) columns are checked for "unknown".
    if data[col].dtype == 'object':
        count = int((data[col] == 'unknown').sum())
        if count > 0:
            # Store a plain int; a trailing comma here would store a
            # one-element tuple and print as "(np.int64(38),)".
            unknown_counts[col] = count

# Report the result.
if unknown_counts:
    print("含有 'unknown' 值的欄位:")
    for col, count in unknown_counts.items():
        print(f"{col}: {count} 筆")
else:
    print("沒有欄位含 'unknown' 值")


含有 'unknown' 值的欄位:
job: (np.int64(38),) 筆
education: (np.int64(187),) 筆
contact: (np.int64(1324),) 筆
poutcome: (np.int64(3705),) 筆


#### job、education、contact、poutcome 含有 'unknown' 值，需額外處理

In [156]:
# Encode the binary yes/no columns as 1/0.
yes_no_to_int = {'yes': 1, 'no': 0}
for binary_col in ("y", "default", "housing", "loan"):
    data[binary_col] = data[binary_col].map(yes_no_to_int)

data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0


In [157]:
# Handle the 'unknown' placeholder values.
# job, education, contact: replace 'unknown' with the column's mode.
# poutcome: 'unknown' covers most rows (3705 of 4521), so it is kept as its
# own category instead of being imputed.
for col in ["job", "education", "contact"]:
    # Assign the result back to the column. Calling
    # data[col].replace(..., inplace=True) is a chained in-place update on a
    # column selection, which pandas deprecates and which does not modify
    # `data` once Copy-on-Write is enabled.
    data[col] = data[col].replace("unknown", data[col].mode()[0])

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

nominal_cols = ["job", "marital", "education", "contact", "poutcome"]

# Replace each nominal column with integer codes; the encoder is refit per column.
# NOTE(review): integer codes give nominal categories an arbitrary numeric
# order; consider one-hot encoding if the downstream steps are sensitive to it.
for col in nominal_cols:
    data[col] = le.fit_transform(data[col])

# Drop the columns treated as time-related.
data = data.drop(columns=["duration", "month", "day"])
print(data.columns.tolist())
data.head()

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,1,-1,0,3,0
1,33,7,1,1,0,4789,1,1,0,1,339,4,0,0
2,35,4,2,2,0,1350,1,0,0,1,330,1,0,0
3,30,4,1,2,0,1476,1,1,0,4,-1,0,3,0
4,59,1,1,1,0,0,1,0,0,1,-1,0,3,0


In [158]:
from sklearn.preprocessing import StandardScaler

# Input features: every column except the target 'y'.
X = data.drop(columns='y')

# Fit the scaler on the features and transform them in one step.
ss = StandardScaler()
scaled_data = ss.fit_transform(X)

# Show the first scaled row.
scaled_data[0]

array([-1.05626965,  1.7657156 , -0.24642938, -1.75887388, -0.1307588 ,
        0.12107186, -1.14205138, -0.42475611, -0.26707118, -0.57682947,
       -0.4072183 , -0.32041282,  0.44441328])

In [159]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps 11 components on the scaled features and project them.
n_pca_components = 11
pca = PCA(n_components=n_pca_components)
X_pca = pca.fit_transform(scaled_data)

In [160]:
# Total fraction of variance explained by the retained components.
pca.explained_variance_ratio_.sum()


np.float64(0.953649948664985)