In [13]:
# pandas v1.5 or above (p.2-28)
import pandas as pd

# Build the sample table, naming the columns at construction time.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
# One-hot encode only the 'color' column; the new columns are named
# '<prefix><prefix_sep><value>', i.e. is_blue / is_green / is_red.
pd.get_dummies(df, prefix='is', prefix_sep='_', columns=['color'])

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red
0,M,10.1,class1,False,True,False
1,L,13.5,class2,False,False,True
2,XL,15.3,class1,True,False,False


In [14]:
# Encode 'color' into is_* indicator columns ...
df2 = pd.get_dummies(df, prefix='is', prefix_sep='_', columns=['color'])
# ... then collapse those indicator columns back into a single categorical
# column; the text before the separator ('is') becomes the column name.
pd.from_dummies(df2.loc[:, ['is_blue', 'is_green', 'is_red']], sep='_')

Unnamed: 0,is
0,green
1,red
2,blue


In [15]:
#-- §2-6 p.2-29 --#
from sklearn.preprocessing import OneHotEncoder

# Toy samples with two categorical columns each
X = [
    ['Male', 1],
    ['Female', 3],
    ['Female', 2],
]
# Learn the categories first, then encode the same samples.
# handle_unknown='ignore' makes categories unseen during fit not raise at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X)
X_new = encoder.transform(X)
# The encoder returns a sparse matrix; convert to a dense array for display
X_new.toarray()

array([[0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [16]:
# Show the categories the encoder learned for each input column
encoder.categories_


[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [17]:
# inverse_transform maps the encoded matrix back to the original values
X_org = encoder.inverse_transform(X_new)
X_org

array([['Male', 1],
       ['Female', 3],
       ['Female', 2]], dtype=object)

In [18]:
# One-hot encode the 'color' column
encoder = OneHotEncoder(handle_unknown='ignore')
color_new = encoder.fit_transform(df[['color']])

# get_feature_names_out builds the new column names; feature_names_in_ holds
# the original column names seen during fit
column_names = encoder.get_feature_names_out(encoder.feature_names_in_)

# Reuse df's index so that a column-wise concat with df lines rows up by
# label even when df does not carry the default 0..n-1 index
df_new = pd.DataFrame(color_new.toarray(), columns=column_names, index=df.index)
df_new


Unnamed: 0,color_blue,color_green,color_red
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0


In [None]:
"""
【sklearn中fit、fit_transform、transform的區別】
https://www.cupoy.com/post/0000017EA058A745000000016375706F795F72656C656173654B5741535354434C55424E455753

fit：原義指的是安裝、使適合的意思，其實有點train的含義但是和train不同的是，它並不是一個訓練的過程，而是一個適配的過程，過程都是定死的，最後只是得到了一個統一的轉換的規則模型。
transform：是將資料進行轉換，比如資料的歸一化和標準化，將測試資料按照訓練資料同樣的模型進行轉換，得到特徵向量。
fit_transform：可以看做是fit和transform的結合，如果訓練階段使用fit_transform，則在測試階段只需要對測試樣本進行transform就行了。

【scikit-learn 數據預處理-fit_transform()與transform()之差異】
https://medium.com/@maggieliao.cm04g/scikit-learn-%E6%95%B8%E6%93%9A%E9%A0%90%E8%99%95%E7%90%86-fit-transform-%E8%88%87transform-%E4%B9%8B%E5%B7%AE%E7%95%B0-3c7cc07c124f

fit_transform():
fit_transform(partData)是先對partData作fit()的功能，找到該partData的整體統計特性之指標，如平均值、標準差、最大最小值等等(能依據不同目的套用這些指標在不同的轉換(即後面的transform()動作)上)，再實行transform(partData)以對partData進行標準化(英文稱為normalization, 主要有兩種方法: min-max normalization or standard deviation normalization)或歸一化等動作。
根據之前fit(partData)所找出來的結果，對於剩餘數據（restData）使用相同的平均值、標準差、最大/最小值等指標進行轉換 transform(restData)，從而保證partData、restData是以同樣的統計指標下去作標準化等轉換之資料前處理。
"""

In [19]:
# Remove the original 'color' column. Rebinding df (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the
# column is already gone, while leaving df in the same final state.
df = df.drop(columns=['color'], errors='ignore')

# Join the remaining columns and the one-hot columns side by side
df2 = pd.concat([df, df_new], axis=1)
df2

Unnamed: 0,size,price,classlabel,color_blue,color_green,color_red
0,M,10.1,class1,0.0,1.0,0.0
1,L,13.5,class2,0.0,0.0,1.0
2,XL,15.3,class1,1.0,0.0,0.0


In [20]:
# Save the fitted encoder to disk
import joblib

# NOTE(review): the output file is literally named 'joblib' (no extension).
joblib.dump(encoder, 'joblib')

['joblib']

In [12]:
#-- p.2-32 --#
"""
pip install yfinance --upgrade --no-cache-dir
"""
import pandas as pd
import yfinance as yf

# Download quotes for ticker 1101.TW for the given start/end dates,
# then preview the last rows of the result.
df_quote = yf.download('1101.TW', start='2020-01-01', end='2024-11-19')
df_quote.tail()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-11-12,32.700001,32.75,32.200001,32.450001,32.450001,23022123
2024-11-13,32.25,32.75,32.200001,32.450001,32.450001,18150100
2024-11-14,32.450001,32.650002,32.299999,32.349998,32.349998,16571895
2024-11-15,32.599998,33.349998,32.549999,33.150002,33.150002,29588758
2024-11-18,33.25,33.700001,33.150002,33.25,33.25,19238941


In [23]:
# 04_05_SelectKBest.ipynb
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile, chi2


In [20]:
# Load iris as a (features, target) pair; as_frame=False is spelled out so
# both come back as arrays rather than pandas objects.
X, y = datasets.load_iris(return_X_y=True, as_frame=False)
X.shape

(150, 4)

In [21]:
# Print the feature matrix and the label vector.
# NOTE(review): this dumps the full arrays; a slice such as X[:5] would keep the output short.
print('X= ', X)
print('y= ', y)

X=  [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.

In [24]:
# Score every feature against the labels with the chi-squared test and
# keep only the two best-scoring columns.
clf = SelectKBest(score_func=chi2, k=2)
X_new = clf.fit(X, y).transform(X)
X_new.shape

(150, 2)

In [25]:
# Print the two selected feature columns.
# NOTE(review): this dumps the full array; a slice such as X_new[:5] would keep the output short.
print('X_new= ', X_new)

X_new=  [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.2]
 [1.2 0.2]
 [1.3 0.2]
 [1.4 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3 1. ]
 [4.6 1.3]
 [3.9 1.4]
 [3.5 1. ]
 [4.2 1.5]
 [4.  1. ]
 [4.7 1.4]
 [3.6 1.3]
 [4.4 1.4]
 [4.5 1.5]
 [4.1 1. ]
 [4.5 1.5]
 [3.9 1.1]
 [4.8 1.8]
 [4.  1.3]
 [4.9 1.5]
 [4.7 1.2]
 [4.3 1.3]
 [4.4 1.4]
 [4.8 1.4]
 [5.  1.7]
 [4.5 1.5]
 [3.5 1. ]
 [3.8 1.1]
 [3.7 1. ]
 [3.9 1.2]
 [5.1 1.6]
 [4.5 1.5]
 [4.5 1.6]
 [4.7 1.5]
 [4.4 1.3]
 [4.1 1.3]
 [4.  1.3]
 [

In [27]:
# Per-feature chi-squared scores and p-values stored on the fitted selector
print('score= ', clf.scores_)
print('pvalue= ', clf.pvalues_)

score=  [ 10.81782088   3.7107283  116.31261309  67.0483602 ]
pvalue=  [4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]


In [28]:
# Show the selected feature names, method 1: index the dataset's feature
# names with the selector's boolean support mask.
import numpy as np
ds = datasets.load_iris()
# get_support() marks exactly the columns the selector kept, in their original
# column order, so the names line up with the columns of X_new. Ranking the
# scores with argsort would list the names by score instead, which can
# disagree with the column order of the transformed data.
np.array(ds.feature_names)[clf.get_support()]

array(['petal length (cm)', 'petal width (cm)'], dtype='<U17')

In [42]:
# Show the selected feature names, method 2: fit the selector on a DataFrame
# so it records the column names and can report the selected ones itself.
import pandas as pd

X = pd.DataFrame(data=ds.data, columns=ds.feature_names)

clf = SelectKBest(score_func=chi2, k=2)
print('clf= ', clf)
X_new = clf.fit(X, y).transform(X)
print(clf.get_feature_names_out())

clf=  SelectKBest(k=2, score_func=<function chi2 at 0x0000029AFE7B1940>)
['petal length (cm)' 'petal width (cm)']


In [43]:
# Keep only the selected feature columns as a plain array. Deriving it from
# ds.data with the selector's support mask (rather than from X itself) gives
# the same values and lets this cell be re-run without failing once X has
# already been replaced by an array.
X = ds.data[:, clf.get_support()]


In [44]:
# Display the reduced feature matrix
X

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9,