### 題目
    由課本的技巧來進行，以 Bank 為範例進行資料前處理，整理後的資料，以 CSV 的格式輸出，處理之後的資料集，以 linear regression （至少包含線性以及非線性的兩種機制）進行預測。
 

In [67]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# 讀取資料集
data = pd.read_csv("../../data/bank/bank.csv", sep=";", header="infer")
print("資料集大小:", data.shape, "\n")
print("欄位:", data.columns.tolist(), "\n")
print(data.head(), "\n")

data.isnull().sum()

資料集大小: (4521, 17) 

欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'] 

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unkno

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### 處理二元類別資料

In [68]:
## 處理二元類別資料
data.y =  data.y.map({'yes':1, 'no':0})
data.default = data.default.map({'yes':1, 'no':0})
data.housing = data.housing.map({'yes':1, 'no':0})
data.loan = data.loan.map({'yes':1, 'no':0})

# 分離 X 跟 y
X = data.drop(columns=['y'])
y = data.y

X.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown


### 處理 Unknown 欄位

In [69]:
# 檢查各欄位有多少 "unknown" 值
unknown_counts = {}
for col in data.columns:
    # 字串欄位才需要檢查 "unknown"
    if data[col].dtype == 'object':
        count = (data[col] == 'unknown').sum()
        if count > 0:
            unknown_counts[col] = count, 

# 顯示結果
if unknown_counts:
    print("含有 'unknown' 值的欄位:")
    for col, count in unknown_counts.items():
        print(f"{col}: {count} 筆")
else:
    print("沒有欄位含 'unknown' 值")

# 處理 'unknown' 值欄位
# job, education 以眾數、平均數取代 'unknown' 值, poutcome 因 unknown 筆數很多，將 uknown 視為一類別
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values='unknown', strategy="most_frequent")

# job 的 unknown 僅占約 0.8%，透過眾數填補
nan_columns = ['job']
X[nan_columns] = imp.fit_transform(X[nan_columns])

含有 'unknown' 值的欄位:
job: (np.int64(38),) 筆
education: (np.int64(187),) 筆
contact: (np.int64(1324),) 筆
poutcome: (np.int64(3705),) 筆


In [70]:
# drop 掉具有時序的欄位
X.drop(["day", "month"], axis=1, inplace=True)
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,0,1787,0,0,cellular,79,1,-1,0,unknown
1,33,services,married,secondary,0,4789,1,1,cellular,220,1,339,4,failure
2,35,management,single,tertiary,0,1350,1,0,cellular,185,1,330,1,failure
3,30,management,married,tertiary,0,1476,1,1,unknown,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,0,0,1,0,unknown,226,1,-1,0,unknown


In [72]:
categorical_columns = X.select_dtypes(include=["object"]).columns
print("類別欄位", categorical_columns)

# 所有的數值欄位的名稱
numeric_cols_index = X.select_dtypes(include=["int64", "float64"]).columns

numeric_cols = numeric_cols_index[X[numeric_cols_index].nunique() > 2].tolist()
binary_cols = numeric_cols_index[X[numeric_cols_index].nunique() == 2].tolist()

print("多值數值欄位 (nunique > 2):", numeric_cols)
print("二元欄位 (nunique = 2):", binary_cols)

類別欄位 Index(['job', 'marital', 'education', 'contact', 'poutcome'], dtype='object')
多值數值欄位 (nunique > 2): ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
二元欄位 (nunique = 2): ['default', 'housing', 'loan']


In [None]:
# job、education