# 載入相關套件

In [47]:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
import numpy as np

In [48]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix  
from sklearn.metrics import classification_report  

In [49]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


# 讀取資料集與keyword

In [50]:
# 讀取training csv文件
file = "traing_df_2609.csv"
news_df = pd.read_csv('./output/'+file,encoding='utf-8-sig')

In [51]:
news_df['post_time'] = pd.to_datetime(news_df['post_time']).dt.date

In [52]:
#讀取關鍵字
term_df = pd.read_csv('./output/2609_selected_keywords_n14_chi_in_lda.csv',encoding='utf-8-sig')

#轉換為dict
term_dict = term_df['keyword'].to_dict()
term_dict = {v: k for k, v in term_dict.items()}

In [28]:
print('keyword length:',len(term_dict))
news_df.info()

keyword length: 284
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7248 entries, 0 to 7247
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   post_time  7248 non-null   object 
 1   content    7248 non-null   object 
 2   標記         7248 non-null   float64
dtypes: float64(1), object(2)
memory usage: 170.0+ KB


In [53]:
news_df['標記'].value_counts()

0.0    3809
1.0    3439
Name: 標記, dtype: int64

# 建立空間向量模型，訓練分類器

In [54]:
#建vsm
vectorizer = TfidfVectorizer(vocabulary=term_dict,use_idf=True)

In [55]:
#fit 所有資料
X = vectorizer.fit_transform(news_df['content']) 
X = pd.DataFrame(X.toarray(),columns=term_dict)
y = news_df['標記']


* 訓練9種不同classifier模型

In [56]:
def train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test):
    print("Classifier:", type(classifier).__name__)
    classifier.fit(X_train, y_train)
    print("Score (X_train, y_train):", classifier.score(X_train, y_train))

    y_pred = classifier.predict(X_test)
    print("Score (y_test, y_pred):", accuracy_score(y_test, y_pred))

    print(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    print("\n\n")

classifier_list = [
    MultinomialNB(),
    GaussianNB(),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=3, random_state=0),
    KNeighborsClassifier(n_neighbors=7),
    DecisionTreeClassifier(criterion="entropy"),
    SVC(kernel='linear'),
    RandomForestClassifier(max_depth=5, n_estimators=50),
    AdaBoostClassifier(),
    MLPClassifier(alpha=1, max_iter=1000),
]

#拆分 80% 20% 訓練測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

for classifier in classifier_list:
    train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test)

Classifier: MultinomialNB
Score (X_train, y_train): 0.6422904449810279
Score (y_test, y_pred): 0.6393103448275862
[[356 312]
 [211 571]]



Classifier: GaussianNB
Score (X_train, y_train): 0.6450500172473267
Score (y_test, y_pred): 0.6179310344827587
[[394 274]
 [280 502]]



Classifier: GradientBoostingClassifier
Score (X_train, y_train): 0.8539151431528114
Score (y_test, y_pred): 0.6268965517241379
[[379 289]
 [252 530]]



Classifier: KNeighborsClassifier
Score (X_train, y_train): 0.7124870645050018
Score (y_test, y_pred): 0.6137931034482759
[[333 335]
 [225 557]]



Classifier: DecisionTreeClassifier
Score (X_train, y_train): 0.9865470852017937
Score (y_test, y_pred): 0.5958620689655172
[[375 293]
 [293 489]]



Classifier: SVC
Score (X_train, y_train): 0.677302518109693
Score (y_test, y_pred): 0.64
[[374 294]
 [228 554]]



Classifier: RandomForestClassifier
Score (X_train, y_train): 0.6416005519144533
Score (y_test, y_pred): 0.6213793103448276
[[186 482]
 [ 67 715]]



Classifier

# 回測

* 分資料區間

In [57]:
train_for_epoch = 4
test_for_epoch = 1

In [58]:
# 建構list儲存每次回測要用的train val資料
train_data_list = []
validation_data_list = []
date_list = []

# 4個月為訓練，一個月為測試
for month_offset in range(0, 36):  
    train_start = pd.Timestamp(year=2020, month=1, day=1) + pd.DateOffset(months=month_offset)
    train_end = train_start + pd.DateOffset(months=train_for_epoch, days=-1)
    validation_start = train_end + pd.DateOffset(days=1)
    validation_end = validation_start + pd.DateOffset(months=test_for_epoch, days=-1)

    #切每次test,val的資料
    t = news_df[news_df['post_time'].between(train_start.date(), train_end.date())]
    v = news_df[news_df['post_time'].between(validation_start.date(), validation_end.date())]
    
    if len(t) != 0 and len(v) != 0:
        t_str = str(train_start.date()) +"_"+ str(train_end.date())
        train_data_list.append(t)
        validation_data_list.append(v)
        date_list.append(t_str)


In [59]:
def train_data(train, val):
    train_content = train['content']
    valid_content = val['content']
    
    # 建構空間向量
    X = vectorizer.fit_transform(train_content)
    X = pd.DataFrame(X.toarray(), columns=term_dict)
    y = train['標記']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    
    # 定義分類器
    classifier = RandomForestClassifier(max_depth=5, n_estimators=50)
    classifier.fit(X_train, y_train)
    
    # 預測結果
    y_pred = classifier.predict(X_test)
    
    X_val = vectorizer.fit_transform(valid_content)
    X_val = pd.DataFrame(X_val.toarray(), columns=term_dict)
    
    y_val = val['標記']
    y_pre = classifier.predict(X_val)
    
    val['預測標記'] = y_pre
    
    return classifier, X_test, y_test, y_pred, y_val, y_pre, val

In [60]:
def calculate_accuracy_and_shooting_rate(v_df):
    # 初始化計數器
    shooting = 0
    acc_count = 0
    shooting_rate = 0
    accuracy_rate = 0
    # 遍歷每一行，檢查條件並計數
    for _, row in v_df.iterrows():
        if row['max_count_ratio'] >= 0.7:
            shooting += 1
            if row['pred_index'] == row['true_index']:
                acc_count += 1

    # 計算出手率和準確率
    if shooting!=0:
        shooting_rate = (shooting / len(v_df) ) * 100
        accuracy_rate = (acc_count / shooting) * 100
    
    return shooting_rate,accuracy_rate

* 做回測，應用Random Forest作為分類器

In [61]:
result = []

# 資料
for t, v, date in zip(train_data_list, validation_data_list, date_list):
    train = t.copy()
    val = v.copy()
    
    classifier, X_test, y_test, y_pred, y_val, y_pre, val = train_data(train, val)
    
    print("Score (X_train, y_train):", classifier.score(X_train, y_train))
    print("Score (X_test, y_test):", accuracy_score(y_test, y_pred))
    print("Score (X_val, y_val):", accuracy_score(y_val, y_pre))
    print('\n\n')
    
    true_idx = val[['post_time', '標記']].drop_duplicates()['標記'].reset_index(drop=True)
    
    # 建構一個df 來作為計算當日分類出的漲跌件比例的 dataframe
    v_df = val.groupby('post_time')['預測標記'].value_counts().unstack(fill_value=0)
    v_df['max_count_ratio'] = v_df.apply(lambda row: max(row) / row.sum(), axis=1)
    
    v_df['pred_index'] = v_df.apply(lambda row: 1 if row.get(1, 0) >= row.get(0, 0) else 0, axis=1)
    v_df.reset_index(inplace=True)
    v_df['true_index'] = true_idx
    
    #計算出手率與準確率
    shooting_rate, accuracy_rate = calculate_accuracy_and_shooting_rate(v_df)
    
    result.append((date, shooting_rate, accuracy_rate))

Score (X_train, y_train): 0.4780958951362539
Score (X_test, y_test): 0.75
Score (X_val, y_val): 0.0



Score (X_train, y_train): 0.4872369782683684
Score (X_test, y_test): 0.7272727272727273
Score (X_val, y_val): 0.4540540540540541



Score (X_train, y_train): 0.5027595722662987
Score (X_test, y_test): 0.6691176470588235
Score (X_val, y_val): 0.278118609406953



Score (X_train, y_train): 0.5534667126595377
Score (X_test, y_test): 0.6196581196581197
Score (X_val, y_val): 0.6142241379310345



Score (X_train, y_train): 0.5289755087961366
Score (X_test, y_test): 0.713855421686747
Score (X_val, y_val): 0.9951278928136419



Score (X_train, y_train): 0.5220765781303898
Score (X_test, y_test): 0.7608247422680412
Score (X_val, y_val): 0.321451717433571



Score (X_train, y_train): 0.5703690927906174
Score (X_test, y_test): 0.631439894319683
Score (X_val, y_val): 0.6873661670235546



Score (X_train, y_train): 0.5757157640565712
Score (X_test, y_test): 0.6289893617021277
Score (X_val, y_val):

In [64]:
#計算平均出手率,準確率 
shooting_rate_sum = 0
accuracy_rate_sum = 0
num_results = 0

for r in result:
    print(f"出手率:{r[1]:.2f}% 準確率:{r[2]:.2f}%")
    if r[1] != 0:
        shooting_rate_sum += r[1]
        accuracy_rate_sum += r[2]
        num_results += 1


#計算平均出手率,平均準確率 
shooting_rate_avg = shooting_rate_sum / num_results
accuracy_rate_avg = accuracy_rate_sum / num_results

print(f"平均出手率: {shooting_rate_avg:.2f}% 平均準確率: {accuracy_rate_avg:.2f}%")

出手率:100.00% 準確率:100.00%
出手率:100.00% 準確率:50.00%
出手率:100.00% 準確率:18.18%
出手率:100.00% 準確率:63.64%
出手率:100.00% 準確率:100.00%
出手率:100.00% 準確率:33.33%
出手率:100.00% 準確率:62.50%
出手率:100.00% 準確率:28.57%
出手率:60.00% 準確率:66.67%
出手率:100.00% 準確率:20.00%
出手率:0.00% 準確率:0.00%
出手率:100.00% 準確率:100.00%
出手率:50.00% 準確率:100.00%
出手率:100.00% 準確率:57.14%
出手率:100.00% 準確率:66.67%
出手率:100.00% 準確率:100.00%
出手率:100.00% 準確率:20.00%
出手率:75.00% 準確率:33.33%
出手率:100.00% 準確率:100.00%
出手率:100.00% 準確率:80.00%
出手率:100.00% 準確率:50.00%
平均出手率: 94.25% 平均準確率: 62.50%
