### 題目
    由課本的技巧來進行，以 Bank 為範例進行資料前處理，整理後的資料，以 CSV 的格式輸出，處理之後的資料集，以 linear regression （至少包含線性以及非線性的兩種機制）進行預測。
 

In [160]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# 讀取資料集
data = pd.read_csv("../../data/bank/bank.csv", sep=";", header="infer")
print("資料集大小:", data.shape, "\n")
print("欄位:", data.columns.tolist(), "\n")
print(data.head(), "\n")

data.isnull().sum()

資料集大小: (4521, 17) 

欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'] 

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unkno

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### 處理二元類別資料

In [161]:
## 處理二元類別資料
data.y = data.y.map({"yes": 1, "no": 0})
data.default = data.default.map({"yes": 1, "no": 0})
data.housing = data.housing.map({"yes": 1, "no": 0})
data.loan = data.loan.map({"yes": 1, "no": 0})

### 處理 Unknown 欄位

In [162]:
# 檢查各欄位有多少 "unknown" 值
unknown_counts = {}
for col in data.columns:
    # 字串欄位才需要檢查 "unknown"
    if data[col].dtype == "object":
        count = (data[col] == "unknown").sum()
        if count > 0:
            unknown_counts[col] = count

# 顯示結果
if unknown_counts:
    print("含有 'unknown' 值的欄位:")
    for col, count in unknown_counts.items():
        print(f"{col}: {count} 筆")
else:
    print("沒有欄位含 'unknown' 值")

# 處理 'unknown' 值欄位
# job, education 以眾數、平均數取代 'unknown' 值, poutcome 因 unknown 筆數很多，將 unknown 視為一類別
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values="unknown", strategy="most_frequent")

# job 的 unknown 僅占約 0.8%，透過眾數填補
nan_columns = ["job"]
data[nan_columns] = imp.fit_transform(X[nan_columns])

含有 'unknown' 值的欄位:
job: 38 筆
education: 187 筆
contact: 1324 筆
poutcome: 3705 筆


KeyError: "None of [Index(['job'], dtype='object')] are in the [columns]"

In [None]:
# drop 掉具有時序的欄位
data.drop(["day", "month"], axis=1, inplace=True)

# pdays 比起上次聯繫這個資訊，銀行可能更重視是否曾經聯繫過客戶
# 因此將 pdays 轉成二元欄位 contacted
data["contacted"] = np.where(data["pdays"] > 0, 1, 0)
data.drop("pdays", axis=1, inplace=True)

data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,previous,poutcome,contacted
0,30,unemployed,married,primary,0,1787,0,0,cellular,79,1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,220,1,4,failure,1
2,35,management,single,tertiary,0,1350,1,0,cellular,185,1,1,failure,1
3,30,management,married,tertiary,0,1476,1,1,unknown,199,4,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,226,1,0,unknown,0


In [None]:
categorical_columns = data.select_dtypes(include=["object"]).columns
print("類別欄位", categorical_columns)

# 所有的數值欄位的名稱
numeric_cols_index = data.select_dtypes(include=["int64", "float64"]).columns

numeric_cols = numeric_cols_index[data[numeric_cols_index].nunique() > 2].tolist()
binary_cols = numeric_cols_index[data[numeric_cols_index].nunique() == 2].tolist()

print("多值數值欄位 (nunique > 2):", numeric_cols)
print("二元欄位 (nunique = 2):", binary_cols)

類別欄位 Index(['job', 'marital', 'education', 'contact', 'poutcome'], dtype='object')
多值數值欄位 (nunique > 2): ['age', 'balance', 'duration', 'campaign', 'previous']
二元欄位 (nunique = 2): ['default', 'housing', 'loan', 'contacted']


### One-Hot Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# education、poutcome 具有順序性，使用 Ordinal Encoding
education_order = [
    "unknown",  # 0
    "primary",  # 1
    "secondary",  # 2
    "tertiary",  # 3
]

oe = OrdinalEncoder(categories=[education_order])
data["education"] = oe.fit_transform(data[["education"]])

poutcome_order = [
    "unknown",  # 0
    "failure",  # 1
    "other",  # 2
    "success",  # 3
]

oe = OrdinalEncoder(categories=[poutcome_order])
data["poutcome"] = oe.fit_transform(data[["poutcome"]])

# 再次確認類別欄位
categorical_columns = data.select_dtypes(include=["object"]).columns
print(categorical_columns)

# 剩餘的類別欄位使用 One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(
    sparse_output=False,  # 輸出 NumPy 陣列
    handle_unknown="ignore",  # 忽略未見過的類別
    drop="first",  # 避免完全共線性 (用於線性模型)
)
data_encoded = ohe.fit_transform(data[categorical_columns])
data_encoded = pd.DataFrame(
    data_encoded, columns=ohe.get_feature_names_out(categorical_columns)
)
data = pd.concat([data, data_encoded], axis=1)
data.drop(categorical_columns, axis=1, inplace=True)

data.head()

Index(['job', 'marital', 'contact'], dtype='object')


Unnamed: 0,age,education,default,balance,housing,loan,duration,campaign,previous,poutcome,...,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_married,marital_single,contact_telephone,contact_unknown
0,30,1.0,0,1787,0,0,79,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,33,2.0,0,4789,1,1,220,1,4,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,35,3.0,0,1350,1,0,185,1,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,30,3.0,0,1476,1,1,199,4,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,59,2.0,0,0,1,0,226,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
