### 題目
    由課本的技巧來進行，以 Bank 為範例進行資料前處理，整理後的資料，以 CSV 的格式輸出，處理之後的資料集，以 linear regression （至少包含線性以及非線性的兩種機制）進行預測。
 

In [18]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

data = pd.read_csv("../../data/bank/bank.csv", sep=";", header="infer")
print("資料集大小:", data.shape, "\n")
print("欄位:", data.columns.tolist(), "\n")
print(data.head(), "\n")

# 處理 'unknown' 的值
data.replace("unknown", np.nan, inplace=True)
null_counts = data.isnull().sum()
print("Null Data:\n", null_counts[null_counts > 0])

資料集大小: (4521, 17) 

欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'] 

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unkno

### 處理二元類別資料

In [19]:
## 處理二元類別資料
data.y =  data.y.map({'yes':1, 'no':0})
data.default = data.default.map({'yes':1, 'no':0})
data.housing = data.housing.map({'yes':1, 'no':0})
data.loan = data.loan.map({'yes':1, 'no':0})

# 分離 X 跟 y
X = data.drop(columns=['y'])
y = data.y

X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,0,1476,1,1,,3,jun,199,4,-1,0,
4,59,blue-collar,married,secondary,0,0,1,0,,5,may,226,1,-1,0,


### 處理 Unknown 欄位

In [20]:
# 處理 'unknown' 值欄位
# job, education 以眾數、平均數取代 'unknown' 值, poutcome 因 unknown 筆數很多，將 uknown 視為一類別
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy="most_frequent")

# 要用眾數填補的欄位
nan_columns = ['job', 'education', 'contact']
data[nan_columns] = imp.fit_transform(data[nan_columns])

### 排除具有時序性的欄位

In [21]:
X.drop(columns=["duration", "month", "day"], axis=1, inplace=True)
print(X.columns.tolist())
X.head()

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'campaign', 'pdays', 'previous', 'poutcome']


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,0,1787,0,0,cellular,1,-1,0,
1,33,services,married,secondary,0,4789,1,1,cellular,1,339,4,failure
2,35,management,single,tertiary,0,1350,1,0,cellular,1,330,1,failure
3,30,management,married,tertiary,0,1476,1,1,,4,-1,0,
4,59,blue-collar,married,secondary,0,0,1,0,,1,-1,0,


### One-Hot Encoding

In [22]:
# 做 OneHotEncoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

for col in categorical_columns:
    X[col] = le.fit_transform(X[col])

X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,campaign,pdays,previous,poutcome
0,30,10,1,0,0,1787,0,0,0,1,-1,0,3
1,33,7,1,1,0,4789,1,1,0,1,339,4,0
2,35,4,2,2,0,1350,1,0,0,1,330,1,0
3,30,4,1,2,0,1476,1,1,2,4,-1,0,3
4,59,1,1,1,0,0,1,0,2,1,-1,0,3


### 標準化資料

In [23]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

scaled_data = ss.fit_transform(X)
scaled_data[0]

array([-1.05626965,  1.71680374, -0.24642938, -1.64475535, -0.1307588 ,
        0.12107186, -1.14205138, -0.42475611, -0.72364152, -0.57682947,
       -0.4072183 , -0.32041282,  0.44441328])

### Feature Selection

In [24]:
from sklearn.decomposition import PCA
pca = PCA(n_components=11)
X_pca = pca.fit_transform(scaled_data)
X_pca[0:5]

array([[ 5.09196084e-01,  6.32608942e-01, -8.22569715e-01,
         3.05912815e-01, -4.25458900e-01, -5.22778307e-01,
        -8.80624880e-01, -4.55599371e-01, -1.87082386e+00,
        -1.15890887e+00, -1.38474803e+00],
       [-4.46827762e+00, -3.29767218e-01,  7.06175131e-01,
         7.68805312e-01,  1.87649274e-01, -6.43016524e-01,
         1.38480004e+00,  2.00757939e+00, -1.09101760e+00,
        -5.01550099e-01, -5.29630152e-01],
       [-3.73750763e+00,  1.19680031e+00,  8.43680598e-01,
        -3.85069552e-01, -3.39475885e-01,  2.89707831e-02,
         3.25949051e-02, -2.82233542e-03,  5.35417440e-01,
        -3.29011130e-02,  1.08510154e+00],
       [ 1.03231937e+00,  2.80369531e-01,  1.53754177e+00,
         6.34703586e-01,  3.56703892e-01,  4.12597972e-02,
         1.08229477e+00,  2.29274184e+00,  4.13041082e-01,
         7.55809975e-01, -4.38581240e-01],
       [ 9.89048734e-01, -1.80434930e+00,  6.23255778e-01,
        -7.99945233e-01, -7.54451158e-01,  1.77296382e-01,
  

### 輸出

In [25]:
# 將處理好的資料儲存
processed_data = pd.DataFrame(X_pca)
processed_data['y'] = y.values # 將目標變數加回來
processed_data.to_csv('../../output/week7_hw_data_processing_output.csv', index=False)
print("\n 已將處理完成的資料儲存至 week7_hw_data_processing_output.csv")


 已將處理完成的資料儲存至 week7_hw_data_processing_output.csv
