In [13]:
# Requires pandas v1.5 or later (p.2-28)
import pandas as pd

# Small sample table; column names are supplied directly to the constructor.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

# One-hot encode only 'color'; the new columns are named is_<value>.
pd.get_dummies(data=df, columns=['color'], prefix='is', prefix_sep='_')

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red
0,M,10.1,class1,False,True,False
1,L,13.5,class2,False,False,True
2,XL,15.3,class1,True,False,False


In [14]:
# Encode 'color' into is_* indicator columns, then invert the indicators
# back into a single categorical column with from_dummies.
df2 = pd.get_dummies(data=df, columns=['color'], prefix='is', prefix_sep='_')
pd.from_dummies(df2.loc[:, ['is_blue', 'is_green', 'is_red']], sep='_')

Unnamed: 0,is
0,green
1,red
2,blue


In [15]:
#-- §2-6 p.2-29 --#
from sklearn.preprocessing import OneHotEncoder

# Three samples, each with a string feature and an integer feature.
X = [['Male', 1], ['Female', 3], ['Female', 2]]

# Learn the categories first, then encode the same samples.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X)
X_new = encoder.transform(X)

# Densify the encoded result for display.
X_new.toarray()

array([[0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [16]:
# Show the categories held by the fitted encoder
# (the output below lists one array per input column).
encoder.categories_


[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [17]:
# inverse_transform maps the encoded matrix back to the original values.
X_org = encoder.inverse_transform(X_new)
X_org

array([['Male', 1],
       ['Female', 3],
       ['Female', 2]], dtype=object)

In [18]:
# One-hot encode the 'color' column of df with scikit-learn.
# Note: this rebinds `encoder`, replacing the encoder fitted on X earlier.
encoder = OneHotEncoder(handle_unknown='ignore')
color_new = encoder.fit_transform(df[['color']])

# get_feature_names_out builds the new column names;
# feature_names_in_ holds the original column names seen during fit.
column_names = encoder.get_feature_names_out(encoder.feature_names_in_)

# Densify the encoded result and reuse df's index, so that a later
# column-wise concat with df lines up row by row even when df does not
# carry the default 0..n-1 index.
df_new = pd.DataFrame(color_new.toarray(), columns=column_names, index=df.index)
df_new


Unnamed: 0,color_blue,color_green,color_red
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0


In [None]:
"""
【sklearn中fit、fit_transform、transform的區別】
https://www.cupoy.com/post/0000017EA058A745000000016375706F795F72656C656173654B5741535354434C55424E455753

fit：原義指的是安裝、使適合的意思，其實有點train的含義但是和train不同的是，它並不是一個訓練的過程，而是一個適配的過程，過程都是定死的，最後只是得到了一個統一的轉換的規則模型。
transform：是將資料進行轉換，比如資料的歸一化和標準化，將測試資料按照訓練資料同樣的模型進行轉換，得到特徵向量。
fit_transform：可以看做是fit和transform的結合，如果訓練階段使用fit_transform，則在測試階段只需要對測試樣本進行transform就行了。

【scikit-learn 數據預處理-fit_transform()與transform()之差異】
https://medium.com/@maggieliao.cm04g/scikit-learn-%E6%95%B8%E6%93%9A%E9%A0%90%E8%99%95%E7%90%86-fit-transform-%E8%88%87transform-%E4%B9%8B%E5%B7%AE%E7%95%B0-3c7cc07c124f

fit_transform():
fit_transform(partData)是先對partData作fit()的功能，找到該partData的整體統計特性之指標，如平均值、標準差、最大最小值等等（能依據不同目的，將這些指標套用在不同的轉換上，即後面的transform()動作），再實行transform(partData)以對partData進行標準化（英文稱為normalization，主要有兩種方法：min-max normalization 或 standard deviation normalization）或歸一化等動作。
根據之前fit(partData)所找出來，對於剩餘數據（restData）使用相同的平均值、標準差、最大/最小值等指標進行轉換 transform(restData)，從而保證partData、restData是以同樣的統計指標下去作標準化等轉換之資料前處理。
"""

In [19]:
# Drop the original 'color' column now that it has been one-hot encoded.
# Rebinding df with errors='ignore' (rather than inplace=True) keeps this
# cell re-runnable: a second run no longer raises KeyError when 'color'
# has already been removed.
df = df.drop(columns=['color'], errors='ignore')

# Join the remaining columns and the encoded columns side by side
# (axis=1 aligns rows on the index).
df2 = pd.concat([df, df_new], axis=1)
df2

Unnamed: 0,size,price,classlabel,color_blue,color_green,color_red
0,M,10.1,class1,0.0,1.0,0.0
1,L,13.5,class2,0.0,0.0,1.0
2,XL,15.3,class1,1.0,0.0,0.0


In [20]:
# Save the fitted encoder to disk.
# NOTE(review): the output path is literally 'joblib' (no file extension) —
# confirm that any later load uses the same path.
import joblib

joblib.dump(encoder, 'joblib')

['joblib']

In [22]:
#-- p.2-32 --#
"""
pip install yfinance --upgrade --no-cache-dir
"""
import pandas as pd
import yfinance as yf

# Download quotes for ticker 1101.TW over the requested date range,
# then preview the last few rows.
df_quote = yf.download(
    tickers='1101.TW',
    start='2020-01-01',
    end='2024-11-19',
)
df_quote.tail()

[*********************100%***********************]  1 of 1 completed


Price,Adj Close,Close,High,Low,Open,Volume
Ticker,1101.TW,1101.TW,1101.TW,1101.TW,1101.TW,1101.TW
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2024-11-12 00:00:00+00:00,32.450001,32.450001,32.75,32.200001,32.700001,23022123
2024-11-13 00:00:00+00:00,32.450001,32.450001,32.75,32.200001,32.25,18150100
2024-11-14 00:00:00+00:00,32.349998,32.349998,32.650002,32.299999,32.450001,16571895
2024-11-15 00:00:00+00:00,33.150002,33.150002,33.349998,32.549999,32.599998,29588758
2024-11-18 00:00:00+00:00,33.25,33.25,33.700001,33.150002,33.25,19238941
