### 各季類股上漲分析

In [183]:
import pandas as pd
import numpy as np
import os
import yfinance as yf
import datetime as dt
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties as font

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report

font1 = font(fname="/content/drive/MyDrive/Colab Notebooks/font/NotoSansTC-Regular.otf")

### 導入CSV檔案

In [184]:
fileName = os.path.join(os.getcwd(), '0050成分股_獲利能力.csv')
df = pd.read_csv(fileName, encoding='utf-8-sig')
df = df[df.columns[1:]]

df.sort_values(by=['股票名稱', '股票類別', '季別'], inplace=True, ascending=True)

print(df.head(5).to_string())


            股票名稱   股票類別          季別     營業收入  營業成本  營業毛利  毛利率  營業利益  營益率  業外收支    稅前淨利    稅後淨利  EPS(元)
2982  上海商銀(5876)  金融保險業  2008-08-14   3143.0   NaN   NaN  NaN   NaN  NaN   NaN  1818.0  1811.0    0.77
2981  上海商銀(5876)  金融保險業  2008-11-14   3448.0   NaN   NaN  NaN   NaN  NaN   NaN  2224.0  1815.0    0.77
2980  上海商銀(5876)  金融保險業  2009-03-31  14213.0   NaN   NaN  NaN   NaN  NaN   NaN  5721.0  2006.0    0.85
2979  上海商銀(5876)  金融保險業  2009-08-14   3194.0   NaN   NaN  NaN   NaN  NaN   NaN  1705.0  2780.0    1.14
2978  上海商銀(5876)  金融保險業  2009-11-14   3475.0   NaN   NaN  NaN   NaN  NaN   NaN  2194.0  1949.0    0.80


### 計算稅後淨利率

In [201]:
df['稅後淨利率'] = df['稅後淨利']/df['營業收入']

### 使用yfinance API 連接各類股數據

In [186]:
sybq =  [i.split('(')[1].replace(')', '') for i in np.unique(df[['股票名稱']])]
endDate = df['季別'].max()
startDate = df['季別'].min()


dfd = pd.DataFrame()
for i in sybq:    
    # yahoo finance API 查詢股價
    dfy = yf.Ticker(str(i) + '.tw').history(start=startDate, end=endDate, interval='1d')
    dfy.index.name = 'Date'
    dfy.index = dfy.index.tz_localize(None)
    dfy.reset_index(inplace=True, drop=False)
    dfy = dfy[['Date', 'Close']].copy()

    # 取得基本分析要用到的資料
    dff = df[df['股票名稱'].str.contains(str(i), na=False)][['股票名稱', '股票類別', '季別', '稅後淨利率', 'EPS(元)']].copy()
    
    dff.rename(columns={'季別':'Date'}, inplace=True)
    dff['Date'] = pd.to_datetime(dff['Date'], errors='coerce')

    # 合併兩資料表
    dfc = pd.merge(dff, dfy, how='outer', on='Date').copy()

    # 資料回填
    dfc.ffill(inplace=True)
    
    # 篩選資料
    condition = ((dfc['Date'].dt.month == 5) & (dfc['Date'].dt.day == 15)) | ((dfc['Date'].dt.month == 8) & (dfc['Date'].dt.day == 14)) |((dfc['Date'].dt.month == 11) & (dfc['Date'].dt.day == 14)) | ((dfc['Date'].dt.month == 3) & (dfc['Date'].dt.day == 31))
    dfq = dfc[condition].copy()
    dfq.sort_values(by=['Date'], inplace=True)
    dfq.dropna(inplace=True, axis=0)
    dfq['漲跌幅_next'] = (dfq['Close'].shift(-1) - dfq['Close']) / dfq['Close']
    dfq['漲跌幅_標籤'] = np.where(dfq['漲跌幅_next'] > 0, 1, np.where(dfq['漲跌幅_next']==0, 0, -1))
    dfq.dropna(inplace=True, axis=0)

    dfd = pd.concat([dfd, dfq], axis=0)

dfd.reset_index(inplace=True, drop=True)
print(dfd.head(5).to_string())

         股票名稱   股票類別       Date     稅後淨利率  EPS(元)      Close  漲跌幅_next  漲跌幅_標籤
0  上海商銀(5876)  金融保險業 2014-11-14  0.370282    1.38  21.293217  0.050125       1
1  上海商銀(5876)  金融保險業 2015-03-31  0.373627    1.49  22.360531  0.018460       1
2  上海商銀(5876)  金融保險業 2015-05-15  0.373627    1.49  22.773304 -0.018216      -1
3  上海商銀(5876)  金融保險業 2015-08-14  0.383570    1.52  22.358465 -0.147839      -1
4  上海商銀(5876)  金融保險業 2016-03-31  0.373255    1.46  19.053022  0.018392       1


In [203]:
dfd.to_csv('Kmeans.csv', encoding='utf-8-sig')

### 機器學習 > 決策樹(標籤處理)

In [204]:

dfs = dfd[['股票類別', 'Date', '稅後淨利率', 'EPS(元)', '漲跌幅_標籤']].copy()
dfs['月份'] = dfs['Date'].dt.month
dfss = dfs.copy()
label_encoder = LabelEncoder()
dfss['股票類別'] = label_encoder.fit_transform(dfss['股票類別'])
dfss['月份_標籤'] = label_encoder.fit_transform(dfss['月份'])
dfss['漲跌_文字'] = np.where(dfss['漲跌幅_標籤'] == 1, '上漲', np.where(dfss['漲跌幅_標籤']==0, '持平', '下跌'))

X = dfss[['股票類別', '月份_標籤', '稅後淨利率', 'EPS(元)']]
y = dfss[['漲跌幅_標籤']]



### 機器學習 > 決策樹(資料處理)

In [205]:
# 拆分數據集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


### 機器學習 > 決策樹(模型產生)

In [206]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# 進行預測
y_pred = clf.predict(X_test)

# 計算準確率
accuracy = accuracy_score(y_test, y_pred)
print(f'模型準確率: {accuracy:.2f}')

# 顯示分類報告
print("\n分類報告:")
print(classification_report(y_test, y_pred, target_names=np.unique(dfss['漲跌_文字'])))

模型準確率: 0.51

分類報告:
              precision    recall  f1-score   support

          上漲       0.38      0.39      0.38       266
          下跌       0.14      0.14      0.14         7
          持平       0.60      0.60      0.60       416

    accuracy                           0.51       689
   macro avg       0.38      0.38      0.38       689
weighted avg       0.51      0.51      0.51       689

