### r1 - part 2 + r2 : 挑出漲跌超過特定幅度當日的文章，並從中選出看漲、看跌的關鍵詞，再進行訓練與預測。

#### process
* read and combine datasets
* 挑出漲跌超過特定幅度的日子當天的文章，並加以過濾
    * 挑出第 D 天 ( 漲 & 跌特定幅度當天是第 D + 3 天 ) 的新聞
    * 過濾文章
    * 確認看漲和看跌新聞的數量
    * 將資料加上標籤，並切分成訓練集、測試集
* 挑出看漲、看跌兩批文章合起來後文件集的關鍵詞
    * 法一：monpa 斷詞，並使用 df卡分值開根號、tfidf * df卡方值開根號 挑選特徵
    * 法二：jieba 斷詞，並使用 tf 值挑選每篇文章的特徵詞，將挑選結果放入 nltk 模型進行運算

In [9]:
import monpa
from monpa import utils
import re

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from collections import Counter
from scipy.stats import chisquare
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import svm
from collections import defaultdict
from statistics import mean
import math

import warnings
warnings.filterwarnings('ignore')

In [11]:
# 這個function用來將字串以正則化處理去除中文字元以外的字元
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

# 我們從stopwords_zh.txt這個檔案中匯入繁體中文的停用詞
with open('stopwords_zh.txt', 'r', encoding="utf-8") as file:
    stopwords = file.read().splitlines() 
file.close()

In [12]:
def df_filter_dates(df, up_dates, down_dates):
    """ filter dataframe according to a date list """
    df_fd_up = df[df['post_time'].str.contains('|'.join(up_dates))]
    df_fd_down = df[df['post_time'].str.contains('|'.join(down_dates))]
    print("up:", df_fd_up.shape[0])
    # display(df_fd_up)
    print("down:", df_fd_down.shape[0])
    # display(df_fd_down)
    
    return df_fd_up, df_fd_down

def df_filter_content_len(df, lower_bound=0, upper_bound=0, autoSetBound=True):
    """ filter articles that are too short or too long """
    df['content_len'] = df['content'].str.len()
    
    if autoSetBound:
        display(df['content_len'].describe())
        lower_bound = max(df['content_len'].mean() - 3 * df['content_len'].std(), 0)
        upper_bound = df['content_len'].mean() + 3 * df['content_len'].std()
        print("mean - 3 * std =", lower_bound)
        print("mean + 3 * std =", upper_bound)
    
    result = df
    if lower_bound > 0:
        result = result[result['content_len'] > lower_bound]
    if upper_bound > 0:
        result = result[result['content_len'] < upper_bound]    
    print("df size after filtering content length", result.shape[0], ", ", "{:.3f}".format(result.shape[0] / df.shape[0]), " of the original dataframe")
    
    return result

def df_filter_keyword(df, keyword: list):
    """ filter articles that contain a specific keyword in content or title """
    df['titleAndContent'] = df['title'].str.cat(df['content'], sep=' ')
    result = df[df['titleAndContent'].str.contains('|'.join(keyword), case=False)]
    display(result['titleAndContent'])
    print("df size after filtering the keyword", result.shape[0], ", ", "{:.3f}".format(result.shape[0] / df.shape[0]), " of the original dataframe \n")
    
    return result


def dfchi2_train(df, k1, k2):
    """ compute square root of df chi value with monpa for splitting sentence """
        
    dfMatrix = pd.DataFrame()

    for index, row in df.iterrows():
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            for term in terms:
                if(term not in dfMatrix.columns and term not in stopwords):
                    dfMatrix[term] = 0
                dfMatrix.at[index, term] = 1

    dfMatrix.fillna(0, inplace=True)            
    display(dfMatrix)
    
    y_train = df['label']

    chi2_selector = SelectKBest(chi2, k=k1)
    chi2_selector.fit(dfMatrix, y_train)
    kbest_vocabs_k1 = dfMatrix.columns[chi2_selector.get_support()]
    X_train_k1 = dfMatrix[kbest_vocabs_k1]
    
    chi2_selector = SelectKBest(chi2, k=k2)
    chi2_selector.fit(dfMatrix, y_train)
    kbest_vocabs_k2 = dfMatrix.columns[chi2_selector.get_support()]
    X_train_k2 = dfMatrix[kbest_vocabs_k2]
    
    return X_train_k1, kbest_vocabs_k1, X_train_k2, kbest_vocabs_k2


def dfchi2_test(df, kbest_vocabs_k1, kbest_vocabs_k2):
    """ compute square root of df chi value with monpa for splitting sentence """

    dfMatrix = pd.DataFrame()
    for index, row in df.iterrows():
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            for term in terms:
                if(term not in dfMatrix.columns and term not in stopwords):
                    dfMatrix[term] = 0
                dfMatrix.at[index, term] = 1
                
    dfMatrix.fillna(0, inplace=True)            
    display(dfMatrix)

    X_test_k1 = dfMatrix.reindex(kbest_vocabs_k1, axis=1, fill_value=0)
    X_test_k2 = dfMatrix.reindex(kbest_vocabs_k2, axis=1, fill_value=0)
    
    return X_test_k1, X_test_k2



def tfidf_train(df, k1, k2):
    """ compute tfidf with monpa for splitting sentence """
    train_doc_list = []
    for index, row in df.iterrows():
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        term_list = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            term_list += ' '.join(terms)
        train_doc_list.append(term_list)
    
    vectorizer = TfidfVectorizer(stop_words=stopwords)
    X_train = vectorizer.fit_transform(train_doc_list)
    importance = np.argsort(np.asarray(X_train.sum(axis=0)).ravel())[::-1]
    
    tfidf_feature_names = np.array(vectorizer.get_feature_names())
    target_feature_names_k1 = tfidf_feature_names[importance[:k1]]
    target_feature_names_k2 = tfidf_feature_names[importance[:k2]]
    
    X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names())
    X_train_k1 = X_train[target_feature_names_k1]
    X_train_k2 = X_train[target_feature_names_k2]
    
    return X_train_k1, target_feature_names_k1, X_train_k2, target_feature_names_k2


def tfidf_test(df, kbest_vocabs_k1, kbest_vocabs_k2):
    """ compute tfidf with monpa for splitting sentence """
    test_doc_list = []
    for index, row in df.iterrows():
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        term_list = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            term_list += ' '.join(terms)
        test_doc_list.append(term_list)
    
    vectorizer = TfidfVectorizer(stop_words=stopwords)
    X_test = vectorizer.fit_transform(test_doc_list) 
    X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
    X_test_k1 = X_test.reindex(kbest_vocabs_k1, axis=1, fill_value=0)
    X_test_k2 = X_test.reindex(kbest_vocabs_k2, axis=1, fill_value=0)

    return X_test_k1, X_test_k2

def filter_news_year(years):
    df_news = pd.DataFrame()
    for year in years:
        df_cur = pd.read_csv(f"./bda2023_mid_dataset/bda2023_mid_news_{year}.csv")
        print(f"{year} len: {df_cur.shape[0]}")
        df_news = pd.concat([df_news, df_cur])
        
    print("total", df_news.shape[0])
    return df_news

def zero():
    return 0

def emp_list():
    return []

def tfidfchi2_train(df, k1, k2):
    """ compute square root of df chi value times tfidf with monpa for splitting sentence """
    train_doc_list = []
    classes = [[],[]]
    all_term = []
    doc_word = defaultdict(zero)
    tf_counter = Counter()
    df_counter = Counter()

    for index, row in df.iterrows():
        if row['label'] == 1:
            classes[1].append(index)
        else:
            classes[0].append(index)
        term_appeared = Counter()
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        term_list = []
        term_str = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            terms = [word for word in terms if not word in stopwords]
            term_list.extend(terms)
            term_str += ' '.join(terms)
            for t in terms:
                tf_counter[t] += 1
                if term_appeared[t] == 0:
                    term_appeared[t] = 1
        df_counter += term_appeared
        doc_word[index] = term_list
        all_term.extend(term_list)
        train_doc_list.append(term_str)
           
    
    tf_idf = dict()
    for term in list(tf_counter.keys()):
        tf_idf[term] = (1 + math.log10(tf_counter[term])) * math.log10(df_train.shape[0] / df_counter[term])
    all_term = [*set(all_term)]
    
    n_table = defaultdict(emp_list)
    for word in all_term:
        for c in classes:
            n11 = 0
            n01 = 0
            n10 = 0
            n00 = 0
            for i in range(len(classes)):
                for j in range(len(classes[i])):
                    if(classes[i][j] in c):
                        if(word in doc_word[classes[i][j]]):
                            n11 += 1
                        else:
                            n10 += 1
                    else:
                        if(word in doc_word[classes[i][j]]):
                            n01 += 1
                        else:
                            n00 += 1
            n_table[word].append([n11, n10, n01, n00])
    
    N = df.shape[0]
    chi2 = defaultdict(emp_list)
    for i in n_table.keys():
        for j in n_table[i]:
            pre = (j[0] + j[2]) / N
            absent = (j[1] + j[3]) / N
            on = (j[0] + j[1]) / N
            off = (j[2] + j[3]) / N
            E11 =  N * pre * on
            E10 = N * absent * on
            E01 =  N * pre * off
            E00 = N * absent * off
            chi = ((j[0] - E11)**2 / E11) + ((j[1] - E10)**2 / E10) +  ((j[2] - E01)**2 / E01) + ((j[3] - E00)**2 / E00)
            chi2[i].append(chi)
    words = defaultdict(zero)
    for key in chi2.keys():
        chi2[key] = math.sqrt(mean(chi2[key]))
        words[key] = chi2[key] * tf_idf[key]
    words = dict(sorted(words.items(), key=lambda x:x[1], reverse = True))
    
    words_k1 = list(words.keys())[:k1]
    words_k2 = list(words.keys())[:k2]

    
    vectorizer = TfidfVectorizer(stop_words=stopwords,token_pattern='(?u)\\b\\w+\\b')
    X_train = vectorizer.fit_transform(train_doc_list)
    X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names())

    X_train_k1 = X_train.reindex(words_k1, axis=1, fill_value=0)
    X_train_k2 = X_train.reindex(words_k2, axis=1, fill_value=0)
    
    return X_train_k1, words_k1, X_train_k2, words_k2

def tfidfchi2_test(df, words_k1, words_k2):
    """ compute square root of df chi value times tfidf with monpa for splitting sentence """
    test_doc_list = []
    for index, row in df.iterrows():
        sentence_list = utils.short_sentence(row['titleAndContent'])   # 文章斷句
        term_list = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            terms = monpa.cut(sentence)   # 句子斷詞
            for term in terms:
                term_list += ' '.join(terms)
        test_doc_list.append(term_list)
        
    vectorizer = TfidfVectorizer(stop_words=stopwords,token_pattern='(?u)\\b\\w+\\b')
    X_test = vectorizer.fit_transform(test_doc_list)
    X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
    
    X_test_k1 = X_test.reindex(words_k1, axis=1, fill_value=0)
    X_test_k2 = X_test.reindex(words_k2, axis=1, fill_value=0)
    
    return X_test_k1, X_test_k2

### 1. read and combine datasets

In [13]:
df_news = filter_news_year(['2020', '2021', '2022', '2023'])
display(df_news)

2020 len: 157045
2021 len: 180838
2022 len: 225128
2023 len: 68826
total 631837


Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url
0,1577813152394_N01,news,yahoo股市,最新財經新聞,2020-01-01 00:28:00,財政部、內政部推一站式服務 不動產移轉登記一次搞定,鉅亨網 鉅亨網記者郭幸宜 台北,為避免民眾辦理不動產買賣移轉登記，需要稅務、地政機關兩邊跑，財政部與內政部合作成立「不動產移...,https://tw.stock.yahoo.com/news/財政部-內政部推-站式服務-...
1,1577815842148_N01,news,yahoo股市,最新財經新聞,2020-01-01 00:29:00,IMF：Q3美元佔全球外匯儲備比重升至一年新高,鉅亨網 鉅亨網編譯林薏禎,週二 (31 日) 國際貨幣組織公布 (IMF) 數據顯示，今年第 3 季，美元在全球外匯儲...,https://tw.stock.yahoo.com/news/imf-q3美元佔全球外匯儲...
2,1577822154326_N01,news,yahoo股市,最新財經新聞,2020-01-01 01:15:00,騰訊集團以34億美元 收購環球音樂10%股權,鉅亨網 鉅亨網編譯張詩苡,外電報導，騰訊集團 (0700-HK) 與法國多媒體集團威望迪 (Vivendi)，在經過了...,https://tw.stock.yahoo.com/news/騰訊集團以34億美元-收購環...
3,1577823048186_N01,news,yahoo股市,最新財經新聞,2020-01-01 03:23:00,加幣漲幅勇冠G10 瑞信：央行政策是明年主要風險,鉅亨網 鉅亨網編譯林薏禎,週二 (31 日)，受到中國強勁經濟數據支持，加元兌美元匯率升至 14 個月高位，且年度漲幅...,https://tw.stock.yahoo.com/news/加幣漲幅勇冠g10-瑞信-央...
4,1577831143458_1_N01,news,yahoo股市,最新財經新聞,2020-01-01 03:41:00,【歐股盤後】收低,中央社 中央社,（中央社台北2020年1月1日電）歐洲股市在今年最後一個交易日交易清淡，倫敦和巴黎股市收低，...,https://tw.stock.yahoo.com/news/歐股盤後-收低-194127...
...,...,...,...,...,...,...,...,...,...
68821,1679411674978_YahooStock,news,Yahoo股市,財經新聞,2023-03-21 22:28:54,德國3月投資人信心銳減,陳穎芃／綜合外電報導,ZEW經濟研究院公布，3月德國投資人信心指數從2月的28.1點驟降至13.0點，且低於路透調...,https://tw.stock.yahoo.com/news/%E5%BE%B7%E5%9...
68822,1679411670754_YahooStock,news,Yahoo股市,財經新聞,2023-03-21 22:30:06,台中單元二「陸府觀森」兼具ESG與宜居 綠實力成新奢標竿,曾麗芳／台中報導,台中新興隱富聚落、位於北單元二重劃區的「陸府觀森」近期落成，18層樓高的建築外觀猶如一座森林...,https://tw.stock.yahoo.com/news/%E5%8F%B0%E4%B...
68823,1679410774885_YahooStock,news,Yahoo股市,財經新聞,2023-03-21 22:45:45,【公告】安力-KY代子公司昆山新力精密五金取得理財產品,中央社,日 期：2023年03月21日公司名稱：安力-KY (5223)主 旨：代子公司昆山新力精密...,https://tw.stock.yahoo.com/news/%E5%85%AC%E5%9...
68824,1679410774748_YahooStock,news,Yahoo股市,財經新聞,2023-03-21 22:46:51,【公告】安力-KY董事會決議召開股東會相關訊息,中央社,日 期：2023年03月21日公司名稱：安力-KY (5223)主 旨：安力-KY董事會決議...,https://tw.stock.yahoo.com/news/%E5%85%AC%E5%9...


### 以 monpas 斷詞的實作
### 2. 挑出漲跌超過特定幅度的日子當天的文章，並加以過濾

#### (1) 挑出第 D 天 ( 漲 & 跌特定幅度當天是第 D + 3 天 ) 的新聞

In [14]:
# read up_and_down_dates from part 1
up_and_down_dates = pd.read_csv(r"./bda2023_mid_dataset/up_and_down_dates/2317 鴻海.csv")
display(up_and_down_dates)

# separate dates
up_dates = up_and_down_dates[up_and_down_dates['label'] == '漲']['Date'].values
down_dates = up_and_down_dates[up_and_down_dates['label'] == '跌']['Date'].values
print("up dates count: ", len(up_dates))
print("down dates count: ", len(down_dates))

print("news: ")
df_news_fd_up, df_news_fd_down = df_filter_dates(df_news, up_dates, down_dates)

Unnamed: 0.1,Unnamed: 0,證券代碼,Date,Close,經過三股票日漲跌比例(%),label
0,1,2317 鴻海,2020-01-03,87.1213,-0.055676,跌
1,10,2317 鴻海,2020-01-16,85.5995,-0.076666,跌
2,11,2317 鴻海,2020-01-17,87.7871,-0.098592,跌
3,12,2317 鴻海,2020-01-20,87.7871,-0.109426,跌
4,39,2317 鴻海,2020-03-09,75.2325,-0.055626,跌
5,40,2317 鴻海,2020-03-10,75.5178,-0.060453,跌
6,41,2317 鴻海,2020-03-11,74.7569,-0.09542,跌
7,42,2317 鴻海,2020-03-12,71.0476,-0.054886,跌
8,43,2317 鴻海,2020-03-13,70.9525,-0.061662,跌
9,44,2317 鴻海,2020-03-16,67.6236,-0.06751,跌


up dates count:  28
down dates count:  25
news: 
up: 16698
down: 22648


#### (2) 過濾文章
* 移除過短內容 < 100 字 ( 可能是日常例行發文 )
* title 或 content 一定要出現 "鴻海" 或 "2317" (大小寫不限)，因為資料量極大

In [15]:
# 文章字數篩選
print("news")
df_news_filter_up = df_filter_content_len(df_news_fd_up, lower_bound=100, autoSetBound=False)
df_news_filter_down = df_filter_content_len(df_news_fd_down, lower_bound=100, autoSetBound=False)

# 一定要出現台積電
keyword = ['鴻海', '2317']   # case insensitive, so includes "TSMC"
print("news")
df_news_filter_up = df_filter_keyword(df_news_filter_up, keyword)
df_news_filter_down = df_filter_keyword(df_news_filter_down, keyword)

news
df size after filtering content length 15234 ,  0.912  of the original dataframe
df size after filtering content length 20640 ,  0.911  of the original dataframe
news


33034     個股：鴻海(2317)應Goldman Sachs之邀參加今日電話會議，說明營運概況 本公司...
33050     麥克瘋／裕隆咬牙減資補虧損 嚴陳莉蓮利空牌一次打到底 裕隆集團去年財報出爐，稅後虧損高達24...
33058     《期貨》震盪盤整 關注國際股市(永豐期貨提供) 【時報-台北電】台指期周二上漲29點至959...
33084     《台北股市》油價重挫 油元主權基金概念股蒙塵 【時報-台北電】國際油價崩跌逾六成，牽動沙烏地...
33107     【盤前焦點】美股道瓊下挫410點 法人：台股震盪量縮 （中央社記者潘羿菁台北2020年4月1...
                                ...                        
206394    《國際產業》缺貨 蘋果年終賺一波恐成泡影 時報資訊《國際產業》缺貨 蘋果年終賺一波恐成泡影閱...
206450    金融股領銜主演，航運、觀光扮演助攻，帶動指數漲逾百點 財訊快報金融股領銜主演，航運、觀光扮演...
206478    【台股盤後】台積電反彈 台股上漲152點收復14700點 中央社財經【台股盤後】台積電反彈 ...
206694    土洋聯手大買金控股 外資持續賣超鴻海 中央社財經土洋聯手大買金控股 外資持續賣超鴻海閱讀全文...
206780    外媒：福斯汽車與鴻海洽商 在美國打造電動車次品牌 中央社財經外媒：福斯汽車與鴻海洽商 在美國...
Name: titleAndContent, Length: 493, dtype: object

df size after filtering the keyword 493 ,  0.032  of the original dataframe 



1140      1/2上市融券減少排行前20名 名次 股票名稱 增減張數 收盤價 漲跌 融券餘額 昨日餘額 ...
1165      《其他電子》鴻海獎學鯨3周年，獎助破億元、13人3連霸 鴻海 （2317） 旗下鴻海教育基金...
1242      海外存託憑證：週四(2日)國內上市公司在海外發行的存託憑證行情表 週四國內上市公司在海外發行...
1244      【盤前焦點】蘋果漲破300美元 台股多頭火種旺 （中央社記者韓婷婷台北2020年1月3日電）...
1274      【Y早報】進入5G元年 佈局CES商機 族群行情再發威 （開盤日09:00出刊）MLCC供貨...
                                ...                        
165397    《金融》永豐台灣ESG成分股調整 2檔金融股入列 【時報記者任珮云台北報導】永豐台灣ESG(...
165485    定期定額存股族最愛 一檔ETF包辦 永豐台灣ESG ETF(00888)23日宣布其追蹤的標...
165506    【公告】鴻海代子公司Foxconn Holdings BV取得新設合資公司(名稱待定)股權 ...
165507    【公告】鴻海代子公司鴻揚創業投資股份有限公司 芯量科技股份有限公司股權 日 期：2022年0...
165539    鴻海與Stellantis深化合作 7500萬美元再設合資公司 （中央社記者張建中新竹202...
Name: titleAndContent, Length: 588, dtype: object

df size after filtering the keyword 588 ,  0.028  of the original dataframe 



#### (3) 確認看漲和看跌新聞的數量

In [16]:
df_up = pd.concat([df_news_filter_up])
df_down = pd.concat([df_news_filter_down])
print("看漲文章", df_up.shape[0])
print("看跌文章", df_down.shape[0])

看漲文章 493
看跌文章 588


#### (4) 將資料加上標籤，並切分成訓練集、測試集

In [17]:
df_up['label'] = 1
df_down['label'] = 0
df_all = pd.concat([df_up, df_down])
df_all.reset_index(drop=True, inplace=True)

df_train, df_test = train_test_split(df_all, random_state=777, train_size=0.8, stratify=df_all['label'])
print(df_train)
print(df_test)

                     id p_type   s_name s_area_name            post_time  \
626   1579512375762_N01   news  yahoo股市      最新財經新聞  2020-01-20 16:56:00   
367   1615433676863_N01   news  yahoo股市        財經新聞  2021-03-11 10:51:00   
382   1615785578900_N01   news  yahoo股市        財經新聞  2021-03-15 12:59:03   
319   1611019571036_N01   news  yahoo股市        財經新聞  2021-01-19 09:12:47   
330   1611051970582_N01   news  yahoo股市        財經新聞  2021-01-19 18:13:17   
...                 ...    ...      ...         ...                  ...   
59    1592558212133_N01   news  yahoo股市        財經新聞  2020-06-19 16:55:00   
713   1583883134072_N01   news  yahoo股市      最新財經新聞  2020-03-11 07:03:00   
742   1583972222490_N01   news  yahoo股市        重大要聞  2020-03-12 08:00:00   
827   1584347574968_N01   news  yahoo股市      最新財經新聞  2020-03-16 16:04:02   
1025  1640777763537_N01   news  yahoo股市        財經新聞  2021-12-29 19:28:14   

                                 title     author  \
626                繼鴻海後 台達也攜手英商進攻電

### 3. 挑出看漲、看跌兩批文章總文件集的關鍵詞
* 先使用 monpa 斷詞後，挑選其中一種方式篩選關鍵詞
    * df 卡分值開根號
    * tfidf * df 卡分值開根號

#### (1) 挑關鍵詞

##### 建構訓練集及測試集向量空間
##### 取出 label 作為 y_train, y_test

In [18]:
k1 = 2000
k2 = 1500
X_train_k1, kbest_vocabs_k1, X_train_k2, kbest_vocabs_k2 = dfchi2_train(df_train, k1, k2)
print("X_train_k1:", X_train_k1)
print("X_train_k2:", X_train_k2)

X_test_k1, X_test_k2 = dfchi2_test(df_test, kbest_vocabs_k1, kbest_vocabs_k2)
print("X_test_k1", X_test_k1)
print("X_test_k2", X_test_k2)

Unnamed: 0,繼,鴻海,後,台達,也,攜手,英商,進攻,電動車,市場,...,友訊中華電,永豐金玉山金南亞台塑,台中銀,華航晶電,台化寶成,封裝廠,訊芸電子科技中山,中山廠,北江省寧鎮,光洲
626,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
367,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
382,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
319,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
330,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
713,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
742,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
827,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0,0,0,0,0


X_train_k1:         後   台達   進攻   記者   台北   年月  飛雅特克萊斯勒   電子   今天    和  ...   逾收   漲聲  \
626   1.0  1.0  1.0  1.0  1.0  1.0      1.0  1.0  1.0  1.0  ...  0.0  0.0   
367   1.0  0.0  0.0  1.0  0.0  0.0      0.0  0.0  0.0  0.0  ...  0.0  0.0   
382   0.0  0.0  0.0  1.0  1.0  0.0      0.0  0.0  1.0  0.0  ...  0.0  0.0   
319   0.0  0.0  0.0  0.0  1.0  0.0      0.0  0.0  0.0  0.0  ...  0.0  0.0   
330   0.0  0.0  0.0  0.0  0.0  1.0      0.0  1.0  0.0  0.0  ...  0.0  0.0   
...   ...  ...  ...  ...  ...  ...      ...  ...  ...  ...  ...  ...  ...   
59    1.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  1.0  ...  1.0  0.0   
713   1.0  0.0  0.0  0.0  0.0  0.0      0.0  1.0  0.0  0.0  ...  0.0  0.0   
742   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  0.0  ...  0.0  0.0   
827   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  0.0  ...  0.0  0.0   
1025  1.0  0.0  0.0  1.0  1.0  0.0      0.0  0.0  1.0  1.0  ...  0.0  0.0   

      測試廠  滬深正   年間  元富證券   在期   打線   上台  階梯式  
626   0.0  0.0 

Unnamed: 0,盤,後,解析,拉尾盤,險守,萬六,多,空雜,陳續,震盪,...,數十億,援助,上台,財政部,增率,紀錄展,逐周,單周,英利,大宇隆
419,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
133,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1028,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
176,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


X_test_k1         後   台達   進攻   記者   台北   年月  飛雅特克萊斯勒   電子   今天    和  ...  逾收   漲聲  測試廠  \
419   1.0  0.0  0.0  1.0  1.0  0.0      0.0  1.0  0.0  0.0  ...   0  0.0    0   
133   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  0.0  ...   0  0.0    0   
564   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  0.0  ...   0  0.0    0   
249   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  1.0  0.0  ...   0  0.0    0   
1028  1.0  0.0  0.0  0.0  0.0  0.0      0.0  1.0  0.0  1.0  ...   0  0.0    0   
...   ...  ...  ...  ...  ...  ...      ...  ...  ...  ...  ...  ..  ...  ...   
1036  1.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  1.0  ...   0  0.0    0   
437   1.0  0.0  0.0  1.0  1.0  1.0      0.0  0.0  1.0  1.0  ...   0  0.0    0   
451   0.0  0.0  0.0  0.0  0.0  0.0      0.0  0.0  0.0  0.0  ...   0  0.0    0   
176   1.0  0.0  0.0  0.0  0.0  0.0      0.0  1.0  0.0  1.0  ...   0  0.0    0   
258   1.0  0.0  0.0  0.0  0.0  1.0      0.0  1.0  0.0  1.0  ...   0  0.0    0   

      滬深正   年間  元

In [20]:
y_train = df_train['label'].tolist()
y_test = df_test['label'].tolist()

### 4. 訓練及測試模型

### k1 = 2000

### (1) NB

In [21]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB() #naive bayes classifier
classifier.fit(X_train_k1, y_train) #訓練

y_pred_nb = classifier.predict(X_test_k1) #預測

In [22]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_nb))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_nb, labels=[1,0])) 

Accuracy: 0.7465437788018433
[[84 15]
 [40 78]]


### (2) KNN

In [23]:
#以下套用之前範例計算相似文件
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(X_test_k1, X_train_k1)#與給定文件集的向量做相似度計算
related_docs_indices = cosine_similarities.argsort(axis=1) #將相似度由小至大做排序，並轉換成文件編號
d = related_docs_indices[:, :-8:-1] #從後面取7個文件編號 (也就是相似度最大的前7名)

#以kNN相似文件來投票決定類別
y_pred_knn=[]
for doc in d:
    pred_up = 0
    pred_down = 0
    for i in doc:
        if(y_train[i] == 1):
            pred_up+=1 #票數加1
        else:
            pred_down+=1 #票數加1
    if(pred_up>pred_down):
        y_pred_knn.append(1)
    elif(pred_up<pred_down):
        y_pred_knn.append(0)

In [24]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_knn))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_knn, labels=[1,0])) 

Accuracy: 0.6912442396313364
[[79 20]
 [47 71]]


### (3) Random Forest

In [25]:
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X_train_k1, y_train)

y_pred_rf = forest.predict(X_test_k1) #預測

In [26]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_rf, labels=[1,0])) 

Accuracy: 0.7695852534562212
[[70 29]
 [21 97]]


### (4) SVM

In [27]:
rbfModel = svm.SVC(kernel='linear', gamma=0.7, C=1)
rbfModel.fit(X_train_k1, y_train)

y_pred_svm = rbfModel.predict(X_test_k1) #預測

In [28]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_svm))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_svm, labels=[1,0])) 

Accuracy: 0.728110599078341
[[65 34]
 [25 93]]


### (5) Decision Tree

In [29]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_k1, y_train)

y_pred_dt = clf.predict(X_test_k1) #預測

In [30]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_dt))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_dt, labels=[1,0])) 

Accuracy: 0.6497695852534562
[[51 48]
 [28 90]]


### (6) 所有模型投票型

In [31]:
y_pred_all=[]
for i in range(len(y_pred_nb)):
    if(y_pred_nb[i]+y_pred_knn[i]+y_pred_rf[i]+y_pred_svm[i]+y_pred_dt[i] >= 3):
        y_pred_all.append(1)
    else: y_pred_all.append(0)

In [32]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_all))

Accuracy: 0.7603686635944701


In [33]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_all, labels=[1,0])) 

[[76 23]
 [29 89]]


### k2 = 1500

### (1) NB

In [34]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB() #naive bayes classifier
classifier.fit(X_train_k2, y_train) #訓練

y_pred_nb = classifier.predict(X_test_k2) #預測

acc_score = metrics.accuracy_score(y_test, y_pred_nb)

In [35]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_nb))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_nb, labels=[1,0])) 

Accuracy: 0.7327188940092166
[[83 16]
 [42 76]]


### (2) KNN

In [36]:
#以下套用之前範例計算相似文件
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(X_test_k2, X_train_k2)#與給定文件集的向量做相似度計算
related_docs_indices = cosine_similarities.argsort(axis=1) #將相似度由小至大做排序，並轉換成文件編號
d = related_docs_indices[:, :-8:-1] #從後面取7個文件編號 (也就是相似度最大的前7名)

#以kNN相似文件來投票決定類別
y_pred_knn=[]
for doc in d:
    pred_up = 0
    pred_down = 0
    for i in doc:
        if(y_train[i] == 1):
            pred_up+=1 #票數加1
        else:
            pred_down+=1 #票數加1
    if(pred_up>pred_down):
        y_pred_knn.append(1)
    elif(pred_up<pred_down):
        y_pred_knn.append(0)

In [37]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_knn))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_knn, labels=[1,0])) 

Accuracy: 0.7096774193548387
[[81 18]
 [45 73]]


### (3) Random Forest

In [38]:
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X_train_k2, y_train)

y_pred_rf = forest.predict(X_test_k2) #預測

In [39]:
acc_score = metrics.accuracy_score(y_test, y_pred_rf)
print("Accuracy:",acc_score)
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_rf, labels=[1,0])) 

Accuracy: 0.7465437788018433
[[68 31]
 [24 94]]


### (4) SVM

In [40]:
rbfModel = svm.SVC(kernel='linear', gamma=0.7, C=1)
rbfModel.fit(X_train_k2, y_train)

y_pred_svm = rbfModel.predict(X_test_k2) #預測

In [41]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_svm))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_svm, labels=[1,0])) 

Accuracy: 0.7511520737327189
[[68 31]
 [23 95]]


### (5) Decision Tree

In [42]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_k2, y_train)

y_pred_dt = clf.predict(X_test_k2) #預測

In [43]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_dt))
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_dt, labels=[1,0])) 

Accuracy: 0.6359447004608295
[[47 52]
 [27 91]]


### (6) 所有模型投票型

In [44]:
y_pred_all=[]
for i in range(len(y_pred_nb)):
    if(y_pred_nb[i]+y_pred_knn[i]+y_pred_rf[i]+y_pred_svm[i]+y_pred_dt[i] >= 3):
        y_pred_all.append(1)
    else: y_pred_all.append(0)

In [45]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_all))

Accuracy: 0.7419354838709677


In [46]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred_all, labels=[1,0])) 

[[75 24]
 [32 86]]


-------------

### 法二：以 jieba 斷詞 的實作

### 2. 挑出漲跌超過特定幅度的日子當天的文章，並加以過濾

#### (1) 挑出第 D 天 ( 漲 & 跌特定幅度當天是第 D + 3 天 ) 的新聞

In [56]:
# read up_and_down_dates from part 1
up_and_down_dates = pd.read_csv(r"./bda2023_mid_dataset/up_and_down_dates/2330 台積電.csv")
display(up_and_down_dates)

# separate dates
up_dates = up_and_down_dates[up_and_down_dates['label'] == '漲']['Date'].values
down_dates = up_and_down_dates[up_and_down_dates['label'] == '跌']['Date'].values
print("up dates count: ", len(up_dates))
print("down dates count: ", len(down_dates))

print("news: ")
df_news_fd_up, df_news_fd_down = df_filter_dates(df_news, up_dates, down_dates)

Unnamed: 0.1,Unnamed: 0,證券代碼,Date,Close,經過三股票日漲跌比例(%),label
0,10,2330 台積電,2020-01-16,323.9378,-0.053812,跌
1,12,2330 台積電,2020-01-20,322.4852,-0.054054,跌
2,15,2330 台積電,2020-02-03,305.0536,0.055555,漲
3,40,2330 台積電,2020-03-10,297.3062,-0.055375,跌
4,41,2330 台積電,2020-03-11,292.4640,-0.084437,跌
...,...,...,...,...,...,...
74,737,2330 台積電,2023-01-05,456.0324,0.059978,漲
75,738,2330 台積電,2023-01-06,456.0324,0.056706,漲
76,743,2330 台積電,2023-01-13,497.3090,0.086000,漲
77,745,2330 台積電,2023-01-17,500.2929,0.053678,漲


up dates count:  50
down dates count:  29
news: 
up: 34365
down: 25942


#### (2) 過濾文章
* 移除過短內容 < 100 字 ( 可能是日常例行發文 )
* title 或 content 一定要出現 "2330" 或 "台積電" 或 "TSMC" ( 大小寫不限 )，因為資料量極大

In [57]:
# 文章字數篩選
print("news")
df_news_filter_up = df_filter_content_len(df_news_fd_up, lower_bound=100, autoSetBound=False)
df_news_filter_down = df_filter_content_len(df_news_fd_down, lower_bound=100, autoSetBound=False)

# 一定要出現台積電
keyword = ['2330', '台積電', 'tsmc']   # case insensitive, so includes "TSMC"
print("news")
df_news_filter_up = df_filter_keyword(df_news_filter_up, keyword)
df_news_filter_down = df_filter_keyword(df_news_filter_down, keyword)

news
df size after filtering content length 31860 ,  0.927  of the original dataframe
df size after filtering content length 22499 ,  0.867  of the original dataframe
news


19296    晨間解析：新型冠狀病毒疫情擴散，台股今日慎防還有低點 台股在歷經1月30日的史上最大跌點之後...
19359    《台北股市》近10年2月，台股七成機率收紅 台股近20年及近10年2月大盤都偏漲升行情，上漲...
19367    《台北股市》陸股大悶鍋掀開，台股恐回測前低 全球股市動盪，大陸人行啟動1.2兆人民幣逆回購，...
19409    《晨間解盤》不宜躁進(日盛投顧提供) 隨著疫情毫無平息跡象，全球停飛中國並加強邊界管制，美國...
19429    《熱門族群》武漢肺炎概念股出列 武漢肺炎疫情牽動未來台股走向，國票投顧投研部對台股提出蘋果三...
                               ...                        
67841    跟著達人學存股 小資族也能年領百萬股息 靠著存股，小資族也能年領百萬股利。資深存股達人華倫到...
67852    《基金》逢低出擊正逢時 優先布局「台美多重資產基金」 【時報-台北電】台美股市對矽谷銀行（以...
67958    創新機構／科睿唯安2023百大創新機構 「南亞科、華邦電新入榜」台企業11家創新高 科睿唯安...
67970    台積電／台積電「創新」再獲獎 副法務長陳碧莉：去年在台、美專利獲准百發百中 台積電（2330...
67981    工研院以智權影響力 7度榮獲全球百大創新機構獎 科睿唯安（Clarivate）20日頒發「2...
Name: titleAndContent, Length: 2210, dtype: object

df size after filtering the keyword 2210 ,  0.069  of the original dataframe 



12126     2020/01/16財經行事曆 今日國內外財經焦點：<BR>台灣：<BR>1.股東會：康師傅...
12134     〈美股盤後〉美中正式簽署協議 道瓊站上29000 台積電ADR重挫 美國總統川普與中國副總理...
12141     晨間解析：美中如期達成第一階段協議，台積電法說會牽動台股後市 儘管美中簽署第一階段貿協，但市...
12153     日媒：美國當局施壓、希望台積電赴美生產軍用晶片 MoneyDJ新聞 2020-01-16 0...
12159     《美股》中美簽首段協議 道瓊突破2萬9千點；台積ADR挫 MoneyDJ新聞 2020-01...
                                ...                        
181605    外資賣台積電續買聯電 今年賣超台股逾1.3兆元 （中央社記者鍾榮峰台北2022年10月20日...
181620    總裁魏哲家質設1600張股票 台積電不回應 （中央社記者張建中台北2022年10月20日電）...
181632    股匯雙跌 新台幣收32.107元續創近6年新低 （中央社記者潘姿羽台北2022年10月20日...
181694    驚人大手筆！魏哲家設質1600張台積電股票 公開資訊觀測站資料顯示，台積電總裁魏哲家設質16...
181700    特拉斯無預警辭職！美股開盤道瓊漲逾300點 聯電ADR大漲6％ 英國首相特拉斯（Liz Tr...
Name: titleAndContent, Length: 1582, dtype: object

df size after filtering the keyword 1582 ,  0.070  of the original dataframe 



#### (3) 確認看漲和看跌新聞的數量

In [58]:
df_up = pd.concat([df_news_filter_up])
df_down = pd.concat([df_news_filter_down])
print("看漲文章", df_up.shape[0])
print("看跌文章", df_down.shape[0])

看漲文章 2210
看跌文章 1582


#### (4) 將資料加上標籤，並切分成訓練集、測試集

In [59]:
df_up['label'] = 1
df_down['label'] = 0
df_up_train, df_up_test = train_test_split(df_up, random_state=777, train_size=0.8)
df_down_train, df_down_test = train_test_split(df_down, random_state=777, train_size=0.8)

df_all_test = pd.concat([df_up_test, df_down_test])
df_all_test = df_all_test[['post_time', 'author', 'content_len', 'titleAndContent', 'label']]
df_all_test.reset_index(drop=True, inplace=True)
display(df_all_test)

df_all_train = pd.concat([df_up_train, df_down_train])
df_all_train = df_all_train[['post_time', 'author', 'content_len', 'titleAndContent', 'label']]
df_all_train.reset_index(drop=True, inplace=True)
display(df_all_train)

Unnamed: 0,post_time,author,content_len,titleAndContent,label
0,2021-01-28 17:14:45,\N,821.0,《台北股市》外資連5日賣超破千億元 待節後資金歸隊 【時報記者任珮云台北報導】受到美股大跌影...,1
1,2021-01-15 11:09:46,\N,674.0,《盤中解析》台積電神山聳立 台股萬六不是夢 【時報記者任珮云台北報導】台股今早一度衝過萬六關...,1
2,2020-12-03 11:21:00,東森財經,489.0,台股攻克萬四後市還有戲？ 分析師：一棒接一棒 多頭無極限 台股今（3日）早盤近關情怯，指數觀...,1
3,2022-07-12 08:09:44,\N,854.0,美股下挫 法人：台股持續震盪打底 （中央社記者賴言曦台北2022年7月12日電）投資人準備迎...,1
4,2021-08-20 11:43:16,\N,860.0,英特爾擴大先進製程合作 法人：台積電具多重效益 （中央社記者張建中新竹2021年08月20日...,1
...,...,...,...,...,...
754,2022-03-31 08:28:20,\N,919.0,1分鐘讀財經》航運別再殺了！2大商機浮現 運價強彈快了 小編今天（31日）精選5件不可不知的...,0
755,2020-01-20 07:49:00,時報資訊,881.0,《台北股市》專家看盤勢：封關拚收紅，開春押續漲 台股周線收紅及站穩萬二關卡，三資（債市轉股市...,0
756,2022-10-20 10:53:14,\N,1339.0,《台北股市》台股最慘熊市到何時 專家搖頭：2大魔王要來了 【時報-台北電】權王台積電與大盤同...,0
757,2022-09-21 09:30:32,\N,521.0,台積電連3年停辦運動會 今年基層員工仍有1.6萬獎金「總獎金約8億」 台積電（2330）今（...,0


Unnamed: 0,post_time,author,content_len,titleAndContent,label
0,2022-11-11 07:46:18,\N,964.0,《各報要聞》通膨降溫 美股沸騰 時報資訊《各報要聞》通膨降溫 美股沸騰閱讀全文0 2022年...,1
1,2021-01-29 09:01:22,\N,777.0,【高橋證券晨訊】大區間14500~16500 小區間15000~16000 日期：2021年...,1
2,2021-01-14 10:52:56,\N,883.0,《盤中解析》「積」情消退 鴻家軍駕電動車搶主流 【時報記者王逸芯台北報導】美股四大指數漲多跌...,1
3,2022-03-16 10:59:58,\N,561.0,【台股盤中】開高後翻黑失守萬七 航運股弱勢 （中央社記者潘智義台北2022年3月16日電）美...,1
4,2021-01-04 16:48:00,東森財經,1075.0,鴻海劍指百元 網友卻不愛？分析師：股價有望再攻高 鴻海（2317）4日登高一呼大漲逾8％，盤...,1
...,...,...,...,...,...
3028,2021-05-07 11:14:15,\N,1886.0,啥款？當中國相關基金落後時 文/基金一姐趙靖宇「你去看“當男人戀愛時”了嗎？」最近很多朋友碰...,0
3029,2020-01-16 15:11:00,Yahoo奇摩股市,2121.0,【Y晚報】台股守萬二、月線雙支撐 權王法說將成風向球 （開盤日15:30出刊）昨日(15)中...,0
3030,2022-04-22 11:58:07,\N,467.0,"科技股獲利下修風險，美系外資調降台股指數目標至18,500點 【財訊快報／記者劉居全報導】台...",0
3031,2022-03-03 04:10:00,\N,697.0,外資期現貨 同步作空 俄烏戰爭持續升溫，台股2日呈現開低震盪整理格局，電子股熄火，食品、鋼鐵...,0


#### 3. 挑出看漲、看跌文章各自的關鍵詞
* 先使用 jieba 斷詞後，以 tf 值挑選每個文章自己的關鍵詞

In [60]:
def pos_features(article, topK):
    """ 取得每篇文章 tf 值前 k 大的關鍵詞  """
    features={}
    t = ''
    seg_list = jieba.cut(article, cut_all=False)
    t = ",".join(seg_list)
    article_tokens = t.split(',')
    filt_tokens = list(filter(lambda a: a not in stopwords, article_tokens))
    toks = [w[0] for w in nltk.FreqDist(filt_tokens).most_common(topK)]
    for tok in toks:
        features[tok] = True
    return features

def get_featuresets(df, topK):
    """ 將一群文章和其各自的關鍵詞、標籤組合成 feature set """
    featuresets=[]
    for index, row in df.iterrows():
        article = row['titleAndContent']
        labname = row['label']
        features = pos_features(article, topK)
        feature_item = (features, labname)
        featuresets.append(feature_item)

    return featuresets

In [61]:
import jieba
import nltk

In [62]:
k = 700
train_set_jieba = get_featuresets(df_all_train, k)
test_set_jieba = get_featuresets(df_all_test, k)

### 4. 將 feature set 放入 nltk 套件的模型進行訓練與測試

### (1) NB

In [63]:
classifier = nltk.NaiveBayesClassifier.train(train_set_jieba)
print(nltk.classify.accuracy(classifier, test_set_jieba))

t_f = [feature for (feature, label) in test_set_jieba]  # features of test set
t_l = [label for (feature, label) in test_set_jieba]  # labels of test set
t_l_pr = [classifier.classify(f) for f in t_f]  # predicted labels of test set
cm = nltk.ConfusionMatrix(t_l, t_l_pr)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

0.8484848484848485
  |      1      0 |
--+---------------+
1 | <52.7%>  5.5% |
0 |   9.6% <32.1%>|
--+---------------+
(row = reference; col = test)



### (2) SVM

In [64]:
import nltk.classify
from sklearn.svm import LinearSVC

classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(train_set_jieba)
from nltk import classify 

accuracy = classify.accuracy(classifier, test_set_jieba)
print (accuracy)
t_f = [feature for (feature, label) in test_set_jieba]  # features of test set
t_l = [label for (feature, label) in test_set_jieba]  # labels of test set
t_l_pr = [classifier.classify(f) for f in t_f]  # predicted labels of test set
cm = nltk.ConfusionMatrix(t_l, t_l_pr)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

0.7839262187088274
  |      1      0 |
--+---------------+
1 | <48.0%> 10.3% |
0 |  11.3% <30.4%>|
--+---------------+
(row = reference; col = test)



### (3) Decision Tree

In [65]:
import nltk.classify

classifier = nltk.classify.SklearnClassifier(tree.DecisionTreeClassifier())
classifier.train(train_set_jieba)

accuracy = classify.accuracy(classifier, test_set_jieba)
print (accuracy)
t_f = [feature for (feature, label) in test_set_jieba]  # features of test set
t_l = [label for (feature, label) in test_set_jieba]  # labels of test set
t_l_pr = [classifier.classify(f) for f in t_f]  # predicted labels of test set
cm = nltk.ConfusionMatrix(t_l, t_l_pr)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

0.7180500658761528
  |      1      0 |
--+---------------+
1 | <45.3%> 12.9% |
0 |  15.3% <26.5%>|
--+---------------+
(row = reference; col = test)

