### Step1. Read stock price dataset

In [9]:
import pandas as pd
import re


all_df = pd.read_excel(open('stock_data_2019-2023.xlsx', 'rb'),['上市2023','上市2022','上市2021','上市2020','上市2019'])
all_df


{'上市2023':                 證券代碼         年月日    開盤價(元)    最高價(元)    最低價(元)    收盤價(元)  \
 0        0050 元大台灣50  2023/03/24       122     122.3    121.55    122.20   
 1       0051 元大中型100  2023/03/24     57.35     57.45     57.15     57.40   
 2          0052 富邦科技  2023/03/24     109.7     110.1     109.5    110.10   
 3          0053 元大電子  2023/03/24      61.2      61.3      61.1     61.30   
 4      0055 元大MSCI金融  2023/03/24     21.86     21.86     21.74     21.83   
 ...              ...         ...       ...       ...       ...       ...   
 61821     Y8886 未含金電  2023/01/03  14256.55  14263.22  14102.72  14249.89   
 61822    Y8887 TR未金電  2023/01/03         -         -         -  30316.15   
 61823     Y8888 不含金融  2023/01/03  12093.34  12215.15   12006.5  12204.82   
 61824     Y9997 報酬指數  2023/01/03         -         -         -  29568.47   
 61825     Y9999 加權指數  2023/01/03  14108.16  14237.94  14001.97  14224.12   
 
        成交量(千股)    成交值(千元)  成交筆數(筆)  流通在外股數(千股) 本益比-TSE 股價淨值比-TS

### Step2. Merge all stock prices between 2019 to 2023

In [10]:
wistron_df = pd.concat([all_df['上市2023'],all_df['上市2022'],all_df['上市2021'],all_df['上市2020'],all_df['上市2019']])
wistron_df.head()

Unnamed: 0,證券代碼,年月日,開盤價(元),最高價(元),最低價(元),收盤價(元),成交量(千股),成交值(千元),成交筆數(筆),流通在外股數(千股),本益比-TSE,股價淨值比-TSE
0,0050 元大台灣50,2023/03/24,122.0,122.3,121.55,122.2,13919,1697778,10776,2298000,-,-
1,0051 元大中型100,2023/03/24,57.35,57.45,57.15,57.4,41,2369,149,16000,-,-
2,0052 富邦科技,2023/03/24,109.7,110.1,109.5,110.1,383,42150,489,57000,-,-
3,0053 元大電子,2023/03/24,61.2,61.3,61.1,61.3,5,322,1013,4988,-,-
4,0055 元大MSCI金融,2023/03/24,21.86,21.86,21.74,21.83,220,4804,319,75654,-,-


### Step3. Extract 緯創 and rename columns

In [11]:
from datetime import timedelta
import numpy as np
wistron_df = wistron_df[wistron_df['證券代碼']==r'3231 緯創']
wistron_df['年月日']  = pd.to_datetime(wistron_df['年月日'])
wistron_df = wistron_df.rename(columns={"年月日":"date","收盤價(元)":"close"})
wistron_df.head()

Unnamed: 0,證券代碼,date,開盤價(元),最高價(元),最低價(元),close,成交量(千股),成交值(千元),成交筆數(筆),流通在外股數(千股),本益比-TSE,股價淨值比-TSE
744,3231 緯創,2023-03-24,40.2,40.35,38.65,40.0,69939,2766859,29215,2900166,10.2,1.18
2005,3231 緯創,2023-03-23,38.2,39.8,37.85,39.65,79210,3102975,36220,2900166,10.11,1.17
3266,3231 緯創,2023-03-22,38.45,38.45,37.3,38.2,50786,1930533,23001,2900166,9.74,1.13
4527,3231 緯創,2023-03-21,37.0,37.95,36.55,37.55,67075,2500129,29866,2900166,9.58,1.11
5784,3231 緯創,2023-03-20,35.0,35.9,35.0,35.9,28645,1019862,13347,2900166,9.16,1.06


### Step4. Label price trend

In [12]:
import datetime as dt
dates = []
labels= []

for date in wistron_df['date'].values: #要計算一週多一天的日期
    after_weekday = np.datetime64(date,'D').astype(dt.datetime).weekday()
    if after_weekday ==4: #星期五
      after_days = date+np.timedelta64(10, 'D')
    else:
      after_days = date+np.timedelta64(8, 'D')
    original_close_val = wistron_df[wistron_df['date']==date].close.values[0]
    if after_days not in wistron_df['date'].values :
      label  = 'last'
    elif wistron_df[wistron_df.date==after_days].close.values[0] > (original_close_val*1.03 ) :
      label = 'up'
    elif wistron_df[wistron_df.date==after_days].close.values[0] < (original_close_val-original_close_val*0.03)  :
      label = 'down'
    else:
      label = 'keep'
    dates.append(date)
    labels.append(label)

new_df = {"date":dates,"label":labels}
new_df = pd.DataFrame(data=new_df)
new_df["date"] = pd.to_datetime(new_df["date"]).dt.date
new_df[new_df['label']=='up']

Unnamed: 0,date,label
6,2023-03-16,up
7,2023-03-15,up
8,2023-03-14,up
9,2023-03-13,up
10,2023-03-10,up
...,...,...
1017,2019-01-14,up
1018,2019-01-11,up
1019,2019-01-10,up
1021,2019-01-08,up


In [13]:
new_df['label'].value_counts()

keep    614
up      209
down    127
last     76
Name: label, dtype: int64

In [14]:
new_df_ase = pd.concat([new_df[new_df['label']=='up'],new_df[new_df['label']=='last']])
new_df_ase = new_df_ase.sort_values('date',ascending = True)
new_df_ase

Unnamed: 0,date,label
1023,2019-01-04,up
1021,2019-01-08,up
1019,2019-01-10,up
1018,2019-01-11,up
1017,2019-01-14,up
...,...,...
4,2023-03-20,last
3,2023-03-21,last
2,2023-03-22,last
1,2023-03-23,last


### Step5. Read all docs and concat them

In [15]:

import os
import re
import pandas as pd

path=os.path.join(os.getcwd(),"bda")
try:
    filename=os.listdir(path)
    usecol=['post_time','title','content']
    data=[]
    for fname in filename:
        if not re.search(".*/.xlsx",fname):
            data.append(pd. read_csv(os.path.join(path,fname),usecols=usecol))
    all_term_df = pd.concat(data)
    all_term_df.head()
except:
    bbs1_df = pd.read_csv('bbs_2019-2021.csv')
    bbs2_df = pd.read_csv('bbs_2022-2023.csv')
    forum1_df = pd.read_csv('forum_2019.csv')
    forum2_df = pd.read_csv('forum_2020.csv')
    forum3_df = pd.read_csv('forum_2021.csv')
    forum4_df = pd.read_csv('forum_2022-2023.csv')
    news1_df = pd.read_csv('news_2019.csv')
    news2_df = pd.read_csv('news_2020.csv')
    news3_df = pd.read_csv('news_2021.csv')
    news4_df = pd.read_csv('news_2022.csv')
    news5_df = pd.read_csv('news_2023.csv')
    news6_df = pd.read_csv('news_2022-2023.csv')
    all_term_df = pd.concat([bbs1_df,bbs2_df,forum1_df,forum2_df,forum3_df,forum4_df,news1_df,news2_df,news3_df,news4_df,news5_df,news6_df])
    all_term_df.head()


### Step6. Extract time, title, and content, then drop null columns

In [16]:
term_df = all_term_df[['post_time','title','content']]
term_df = term_df.dropna()
term_df.shape

(1942413, 3)

In [17]:
term_df.head()

Unnamed: 0,post_time,title,content
0,2019-01-01 00:31:32,[公告] n199808m HitMaker 警告一次,1. 主旨：n199808m 違反板規4-2-1 警告一次 HitMake...
1,2019-01-01 01:28:28,Re: [新聞] 貿戰讓台商錢匯不出？ 海基會：漣漪效應,小弟有個想法不知可不可行 如果有人民幣想洗出來 出國一趟，比方去歐洲 用海外刷卡買黃金，存在...
2,2019-01-01 01:32:39,Re: [新聞] 貿易戰搶出口 透支效應2019衝擊中國經濟!,分身帳號好像要連坐水桶 《ＩＤ暱稱》tangolosss (配息配股變成大富翁)《經濟狀況...
3,2019-01-01 07:07:37,Re: [新聞] 陸媒：俄羅斯想聯手中國去美元化,所以說不要小看俄羅斯的險惡奸詐 俄國一直鼓勵中國發展人民幣石油 去美元化的種種行為 俄羅...
4,2019-01-01 07:35:29,[標的] (伺機作多)日元正二,1. 標的：元大日元指數正二 2. 分類：(伺機作多)多 3. 分析/正文： (...


### Step7. Preprocessing 

In [18]:
import re
def remove_tag_num(sentence):
  processed_word=[]
  processed_sentence=''
  try:
    # processed_sentence = re.sub(r'<[^>]+>', '', sentence) # remove html tag
    # processed_sentence = re.sub(r'[^\w\s]','。', sentence) # remove punctuation
    processed_sentence = re.sub(r'[^\u4e00-\u9fa5]+', '', sentence) #將字串以正則化處理去除中文字元以外的字元
    for word in processed_sentence.split():
      if not word.isdigit():
        processed_word.append(word)
    processed_sentence=' '.join(processed_word)
  except:
    print(sentence)
  
  return processed_sentence
term_df['post_time'] = pd.to_datetime(term_df['post_time']).dt.date
term_df['content'] = term_df['content'].apply(lambda x: remove_tag_num(x))
term_df['title'] = term_df['title'].apply(lambda x: remove_tag_num(x))

term_df.head()


Unnamed: 0,post_time,title,content
0,2019-01-01,。公告。警告一次,。主旨。違反板規。警告一次。違反板規。警告一次。說明。經板主巡視板面。發無意義閒聊文。違反板...
1,2019-01-01,。新聞。貿戰讓台商錢匯不出。海基會。漣漪效應,小弟有個想法不知可不可行。如果有人民幣想洗出來。出國一趟。比方去歐洲。用海外刷卡買黃金。存在...
2,2019-01-01,。新聞。貿易戰搶出口。透支效應。衝擊中國經濟。,分身帳號好像要連坐水桶。暱稱。配息配股變成大富翁。經濟狀況。小康。登入次數。次。同天內只計一...
3,2019-01-01,。新聞。陸媒。俄羅斯想聯手中國去美元化,所以說不要小看俄羅斯的險惡奸詐。俄國一直鼓勵中國發展人民幣石油。去美元化的種種行為。俄羅斯真...
4,2019-01-01,。標的。伺機作多。日元正二,。標的。元大日元指數正二。分類。伺機作多。多。分析。正文。均線多頭排列。站上年線。棒型態破。...


### Step8. Extract Wistron's related docs  

In [19]:
wistron_term_df = term_df[term_df['title'].str.contains(r'緯創')| term_df['content'].str.contains(r'緯創')]
print("Num of Document :",len(wistron_term_df))
wistron_term_df

Num of Document : 7122


Unnamed: 0,post_time,title,content
65,2019-01-02,。其他。加權股價指數成分股暨市值比重,大家新年快樂。雖然今天大盤有點糟就是了。然後我因為空手限制。期貨什麼空單都沒有。反而還進場買...
80,2019-01-02,。新聞。台灣製造。中美貿易戰的意外贏家,。原文連結。原文內容。台灣。中美貿易戰的意外贏家。史上最大。最急的電子業回台潮。正風起雲湧展...
118,2019-01-03,。其他。加權股價指數成分股暨市值比重,慘。拉高又殺低。到底今年怎麼走呢。經過了一天。資料應該是準的了。但還是一樣。以下資料為個人試...
146,2019-01-03,。其他。台灣集中市場三大法人買賣超前。名,。台灣集中市場三大法人買賣超前。名。買超。代碼。名稱。外資。投信。自營商。合計買賣超。元大台...
227,2019-01-04,。其他。加權股價指數成分股暨市值比重,。搞得我心情很差。今天的大盤真的是狠狠修正電子股。不過說起來。總是有些人會說台積電如果剩。的...
...,...,...,...
68449,2023-03-21,瑞信危機初步解除。國際股市反彈。台股在金融止穩盤堅挑戰月線,。財訊快報。方亞申。在瑞信被瑞銀收購下。未免金融事態繼續擴大。美國聯準會。加拿大。英國。日本...
68491,2023-03-21,。台北股市。升息太難料。外資有疑慮。三大法人賣超。億元,。時報。台北電。瑞銀拆彈救援瑞信危機。美股漲聲響起也帶動台股拉炮歡呼。強彈近百點挺進。關。月...
68562,2023-03-21,。台北股市。外資。日買超前十大。逾萬張。這金釵也回補,。時報。台北電。台股今日跟隨美股反彈腳步站上。關。日加權指數收。點。漲。點或。總成交值。億元...
68563,2023-03-21,。台北股市。投信。日連。買。前。大電傳各半。最愛這檔綠電股,。時報。台北電。台股。日加權指數收。點。漲。點或。總成交值。億元。三大法人賣超。億元。其中外...


### Step9. Merge label df & Wistron's term df

In [20]:
wistron_term_df = wistron_term_df.rename(columns={'post_time':'date'})
all_term_trend_df = pd.merge(new_df, wistron_term_df) #把相同date的merge在一起
print('up trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='up']))
print('down trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='down']))
all_term_trend_df.head()


up trend documents: 1292
down trend documents: 629


Unnamed: 0,date,label,title,content
0,2023-03-21,last,。新聞。外資大砍金融股。中信金遭賣。萬張最,原文標題。外資大砍金融股。中信金遭賣。萬張最慘。新光金。玉山金同步遭砍。原文連結。發布時間。...
1,2023-03-21,last,。閒聊。盤後閒聊,台股。成交量。億。台指。成交量。口。櫃買指。成交量。億。台指期。自營商。投信。外資。台指選。...
2,2023-03-21,last,。情報。上市投信買賣超排行,。標題。投信買賣超排行。來源。富邦證券。網址。內文。買超。中興電。中鋼。緯創。英業達。欣興。...
3,2023-03-21,last,。情報。上市外資買賣超排行,。標題。外資買賣超排行。來源。台灣證交所。網址。內文。買超。緯創。鴻海。英業達。金像電。元大...
4,2023-03-21,last,。標的。緯創。領錢聊勝於無多,。個月過去了。這一趟抱了。年。個月。今天了結。其中。張已參與去年除息。遲來的紅包啊。績效比前...


### Step10. Tokenize title & content

In [21]:
#不要重跑
import monpa
monpa.use_gpu(True)
all_term_trend_df['title'] = all_term_trend_df['title'].apply(lambda x: monpa.cut_batch(x)[0])
all_term_trend_df['content'] = all_term_trend_df['content'].apply(lambda x: monpa.cut_batch(x)[0])
all_term_trend_df.head()

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.
GPU unavailable.


Unnamed: 0,date,label,title,content
0,2023-03-21,last,"[。, 新聞, 。, 外資, 大, 砍, 金融股, 。, 中信金, 遭, 賣, 。, 萬, ...","[原文, 標題, 。, 外資, 大, 砍, 金融股, 。, 中信金, 遭, 賣, 。, 萬,..."
1,2023-03-21,last,"[。, 閒聊, 。, 盤, 後, 閒聊]","[台股, 。, 成交量, 。, 億, 。, 台, 指, 。, 成交量, 。, 口, 。, 櫃..."
2,2023-03-21,last,"[。, 情報, 。, 上市, 投信, 買賣, 超, 排行]","[。, 標題, 。, 投信, 買賣, 超, 排行, 。, 來源, 。, 富邦證券, 。, 網..."
3,2023-03-21,last,"[。, 情報, 。, 上市, 外資, 買賣, 超, 排行]","[。, 標題, 。, 外資, 買賣, 超, 排行, 。, 來源, 。, 台灣, 證交所, 。..."
4,2023-03-21,last,"[。, 標的, 。, 緯創, 。, 領, 錢, 聊勝, 於, 無多]","[。, 個, 月, 過去, 了, 。, 這, 一, 趟, 抱, 了, 。, 年, 。, 個,..."


In [22]:
def filter_gram(terms): #過濾掉一字詞語
    return [term for term in terms if len(term)>1 and len(term)<6]
all_term_trend_df['title'] = all_term_trend_df['title'].apply(lambda x: filter_gram(x))
all_term_trend_df['content'] = all_term_trend_df['content'].apply(lambda x: filter_gram(x))
all_term_trend_df.head()

Unnamed: 0,date,label,title,content
0,2023-03-21,last,"[新聞, 外資, 金融股, 中信金]","[原文, 標題, 外資, 金融股, 中信金, 新光金, 玉山金, 同步, 原文, 連結, 發..."
1,2023-03-21,last,"[閒聊, 閒聊]","[台股, 成交量, 成交量, 買指, 成交量, 指期, 自營商, 投信, 外資, 指選, 自..."
2,2023-03-21,last,"[情報, 上市, 投信, 買賣, 排行]","[標題, 投信, 買賣, 排行, 來源, 富邦證券, 網址, 內文, 買超, 中興電, 中鋼..."
3,2023-03-21,last,"[情報, 上市, 外資, 買賣, 排行]","[標題, 外資, 買賣, 排行, 來源, 台灣, 證交所, 網址, 內文, 買超, 緯創, ..."
4,2023-03-21,last,"[標的, 緯創, 聊勝, 無多]","[過去, 今天, 了結, 其中, 參與, 去年, 除息, 遲來, 紅包, 績效, 一點, 就..."


### Step11. Extract up & down trend docs

In [23]:
wistron_term_trend_df = all_term_trend_df[(all_term_trend_df['label']=='up') | (all_term_trend_df['label']=='down')]
wistron_term_trend_df.to_csv('./Group3_wistron.csv', index = False)
wistron_term_trend_df.head

<bound method NDFrame.head of             date label                                 title  \
83    2023-03-16    up                  [情報, 上市, 投信, 買賣, 排行]   
84    2023-03-16    up                  [情報, 上市, 外資, 買賣, 排行]   
85    2023-03-16    up                    [情報, 公股銀行, 買賣, 排行]   
86    2023-03-16    up                 [情報, 上市櫃, 外資, 投信, 排行]   
87    2023-03-16    up                                  [友達]   
...          ...   ...                                   ...   
6282  2019-01-08    up        [營收, 緯軟, 營收, 續創, 新高, 月增率, 年增率]   
6286  2019-01-04    up         [其他, 加權, 股價, 指數, 成分股, 市值, 比重]   
6287  2019-01-04    up                         [上市, 自營商, 排行]   
6288  2019-01-04    up  [台股, 蘋果, 自爆, 美股, 血洗, 台股, 難逃, 殺機, 難守]   
6289  2019-01-04    up                     [上櫃, 認購, 權證, 彙總表]   

                                                content  
83    [標題, 投信, 買賣, 排行, 來源, 富邦證券, 網址, 內文, 買超, 仁寶, 元大,...  
84    [標題, 外資, 買賣, 排行, 來源, 台灣證交所, 網址, 內文, 買超, 聯電, 英業...  
85    [標題, 公股銀行, 買賣, 排行, 來源

In [24]:
print('up trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='up']))
print('down trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='down']))

up trend documents: 1292
down trend documents: 629


# FROM HERE

### Step12. Record up & down trend terms

In [25]:
import pandas as pd
#From here
wistron_term_trend_df = pd.read_csv("Group3_wistron.csv")

term_dict={'up':set(),'down':set()}
remove =[]
content = [] #用於建立向量空間

for idx,row in wistron_term_trend_df.iterrows():
    row['title'] = eval(row['title'])
    row['content'] = eval(row['content'])
    content.append(" ".join(row['title'])+" ".join(row['content']))
    for term in row['title']:
        if term not in term_dict[row['label']]:
            term_dict[row['label']].add(term)
    for term in row['content']:
        if term not in term_dict[row['label']]:
            term_dict[row['label']].add(term)

print('up trend terms:',len(term_dict['up']))
print('down trend terms:',len(term_dict['down']))
print('content terms: ', content[0:5])
print('content terms: ', len(content))
wistron_term_trend_df.head


up trend terms: 8184
down trend terms: 5793
content terms:  ['情報 上市 投信 買賣 排行標題 投信 買賣 排行 來源 富邦證券 網址 內文 買超 仁寶 元大 股息 開發金 英業達 永豐金 宏碁 中鋼 元大金 元大 台灣 中信金 廣達 奇鋐 承業醫 第一 遠東新 聯強 台泥 慧洋 台塑 緯創 上海商銀 富邦金 台灣 南亞 國泰金 裕民 微星 南亞科 兆豐金 中華電 新興 亞泥 華碩 元晶 遠百 可成 智原 光寶科 聯詠 新纖 中華 光罩 金像電 世芯 群電 台勝科 瑞昱', '情報 上市 外資 買賣 排行標題 外資 買賣 排行 來源 台灣證交所 網址 內文 買超 聯電 英業達 仁寶 友達 中興電 飛宏 總太 京元電子 廣達 東森 群創 裕隆 國泰 智能 電動車 興農 緯創 合勤 東元 富邦 越南 太極 神基 統一 威健 微星 亞太電 技嘉 佳世達 日月光 投控 南緯 誠美材 羅門 凌群 中華電 普安 中信 中國 建準 台光電 三陽工業 中磊 晟銘電 富邦 恒生 國企 光罩 三商電 華夏 明泰', '情報 公股銀行 買賣 排行標題 公股銀行 買賣 排行 來源 股網 網址 內文 以下 資訊 張數 排列 買超 排名 股票 張數 排名 股票 張數 國泰 永續 股息 元大 台灣 街口布蘭 特正 群創 富邦 越南 國泰 臺灣 加權 中信金 英業達 臺企銀 緯創 元大 石油 元大 股息 長榮 仁寶 玉山金 元大 美債 開發金 國泰 年美債 長榮航 遠東銀 潤泰 富邦台 永豐金 元大 滬深 元大 台灣 高息低波 華夏 華南金', '情報 上市櫃 外資 投信 排行排行 股票 名稱 成交 漲跌 投信 外資 仁寶 英業達 宏碁 廣達 慧洋 優群 緯創 台灣 微星 中華電 亞泥 合晶 家登 光寶科 聯詠 資料 來源 以上 謝謝', '友達緯創 衝刺 毛利率 產品 有成 董事長 林憲銘 指出 毛利率 產品線 營收 新台幣 毛利率 雙位數 包括 伺服器 產業 電腦 服務 物聯網 產品線 預估 毛利率 產品線 營收 衝到 緯創 總經理 林建勳 指出 緯創 全球 第三 因此 外界 通常 關注 表現 事實 營收 緯創 比重 已經 降到 以下 因此 帶動 毛利率 拉高 提高 分點 林憲銘 指出 長期']
content terms:  19

<bound method NDFrame.head of             date label                                 title  \
0     2023-03-16    up                  [情報, 上市, 投信, 買賣, 排行]   
1     2023-03-16    up                  [情報, 上市, 外資, 買賣, 排行]   
2     2023-03-16    up                    [情報, 公股銀行, 買賣, 排行]   
3     2023-03-16    up                 [情報, 上市櫃, 外資, 投信, 排行]   
4     2023-03-16    up                                  [友達]   
...          ...   ...                                   ...   
1916  2019-01-08    up        [營收, 緯軟, 營收, 續創, 新高, 月增率, 年增率]   
1917  2019-01-04    up         [其他, 加權, 股價, 指數, 成分股, 市值, 比重]   
1918  2019-01-04    up                         [上市, 自營商, 排行]   
1919  2019-01-04    up  [台股, 蘋果, 自爆, 美股, 血洗, 台股, 難逃, 殺機, 難守]   
1920  2019-01-04    up                     [上櫃, 認購, 權證, 彙總表]   

                                                content  
0     [標題, 投信, 買賣, 排行, 來源, 富邦證券, 網址, 內文, 買超, 仁寶, 元大,...  
1     [標題, 外資, 買賣, 排行, 來源, 台灣證交所, 網址, 內文, 買超, 聯電, 英業...  
2     [標題, 公股銀行, 買賣, 排行, 來源

In [26]:
print(content)
print(term_dict)

['情報 上市 投信 買賣 排行標題 投信 買賣 排行 來源 富邦證券 網址 內文 買超 仁寶 元大 股息 開發金 英業達 永豐金 宏碁 中鋼 元大金 元大 台灣 中信金 廣達 奇鋐 承業醫 第一 遠東新 聯強 台泥 慧洋 台塑 緯創 上海商銀 富邦金 台灣 南亞 國泰金 裕民 微星 南亞科 兆豐金 中華電 新興 亞泥 華碩 元晶 遠百 可成 智原 光寶科 聯詠 新纖 中華 光罩 金像電 世芯 群電 台勝科 瑞昱', '情報 上市 外資 買賣 排行標題 外資 買賣 排行 來源 台灣證交所 網址 內文 買超 聯電 英業達 仁寶 友達 中興電 飛宏 總太 京元電子 廣達 東森 群創 裕隆 國泰 智能 電動車 興農 緯創 合勤 東元 富邦 越南 太極 神基 統一 威健 微星 亞太電 技嘉 佳世達 日月光 投控 南緯 誠美材 羅門 凌群 中華電 普安 中信 中國 建準 台光電 三陽工業 中磊 晟銘電 富邦 恒生 國企 光罩 三商電 華夏 明泰', '情報 公股銀行 買賣 排行標題 公股銀行 買賣 排行 來源 股網 網址 內文 以下 資訊 張數 排列 買超 排名 股票 張數 排名 股票 張數 國泰 永續 股息 元大 台灣 街口布蘭 特正 群創 富邦 越南 國泰 臺灣 加權 中信金 英業達 臺企銀 緯創 元大 石油 元大 股息 長榮 仁寶 玉山金 元大 美債 開發金 國泰 年美債 長榮航 遠東銀 潤泰 富邦台 永豐金 元大 滬深 元大 台灣 高息低波 華夏 華南金', '情報 上市櫃 外資 投信 排行排行 股票 名稱 成交 漲跌 投信 外資 仁寶 英業達 宏碁 廣達 慧洋 優群 緯創 台灣 微星 中華電 亞泥 合晶 家登 光寶科 聯詠 資料 來源 以上 謝謝', '友達緯創 衝刺 毛利率 產品 有成 董事長 林憲銘 指出 毛利率 產品線 營收 新台幣 毛利率 雙位數 包括 伺服器 產業 電腦 服務 物聯網 產品線 預估 毛利率 產品線 營收 衝到 緯創 總經理 林建勳 指出 緯創 全球 第三 因此 外界 通常 關注 表現 事實 營收 緯創 比重 已經 降到 以下 因此 帶動 毛利率 拉高 提高 分點 林憲銘 指出 長期', '廣達 去年 獲利 歷史 次高毛利 伺服器 車用 產品 出貨 順暢 匯兌 收益 帶動 廣達 去年 第四季 獲利 優於 預期 不僅 第四季 毛利率 創下 歷史

### Chi Square Value calculation

In [132]:
import pandas
import re
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from scipy.stats import chisquare

chi_term_trend_df = pd.read_csv("Group3_wistron.csv")

all_term = list(term_dict['up'])+list(term_dict['down'])
chi_vocab=Counter(all_term) #用於建立向量空間模型

def remove_tag_num(sentence):
  processed_word=[]
  processed_sentence=''
  try:
    # processed_sentence = re.sub(r'<[^>]+>', '', sentence) # remove html tag
    processed_sentence = re.sub(r'[^\w\s]',' ', sentence) # remove punctuation
    # processed_sentence = re.sub(r'[^\u4e00-\u9fa5]+', '', sentence) #將字串以正則化處理去除中文字元以外的字元
    for word in processed_sentence.split():
      if not word.isdigit():
        processed_word.append(word)
    processed_sentence=' '.join(processed_word)
  except:
    print(sentence)
  return processed_word

chi_term_trend_df["label"]=chi_term_trend_df["label"].map({"up":1,"down":0})
chi_term_trend_df["title"]=chi_term_trend_df["title"].apply(remove_tag_num)
chi_term_trend_df["content"]=chi_term_trend_df["content"].apply(remove_tag_num)
chi_term_trend_df["word_bag"]=chi_term_trend_df["content"]+chi_term_trend_df["title"]
chi_term_trend_df["word_bag"]=chi_term_trend_df["word_bag"].apply(set)
chi_term_trend_df["word_bag"]=chi_term_trend_df["word_bag"].apply(list)
chi_term_trend_df=chi_term_trend_df.drop(columns=["date","title","content"])
chi_tfidf_label=chi_term_trend_df["label"].to_numpy().reshape(-1)
# print(len(chi_term_trend_df[chi_term_trend_df["label"]==1]))
# print(len(chi_term_trend_df[chi_term_trend_df["label"]==0]))
vectorizer=CountVectorizer()
transformer=TfidfTransformer()

X_1=vectorizer.fit_transform(content)
x_tf=pd.DataFrame(X_1.toarray(),columns=vectorizer.get_feature_names_out())
# print(x_tf) ## TF 值
X_2 = transformer.fit_transform(X_1)
x_idf = pd.DataFrame([transformer.idf_],columns=vectorizer.get_feature_names_out())
# print(x_idf) ## idf 值

tfidf_vetorize = TfidfVectorizer(vocabulary = chi_vocab, use_idf = True) #用給定的termset建構向量，並使用idf加權
X = vectorizer.fit_transform(content) #用給定的文件集做計算
x_tfidf = pd.DataFrame(X_2.toarray(),columns=vectorizer.get_feature_names_out())
# print(x_tfidf.head())

tf_chi=(chisquare(np.sqrt(x_tf.to_numpy()))[0]) ## chi TF 取根號
idf_chi=(chisquare(np.sqrt(np.exp(x_idf.to_numpy())))[0]) ## chi idf 取根號
# print(tf_chi.shape)
# print(idf_chi.shape)
chi_tf_idf_sqrt=np.multiply(tf_chi,idf_chi)
# print(chi_tf_idf_sqrt.shape)


for idx,rows in x_tfidf.iterrows():
    r=np.array(rows)
    rows=np.multiply(r,chi_tf_idf_sqrt)

chi_tfidf=np.max(x_tfidf.to_numpy(),axis=0)
# print(chi_tfidf.shape)

chi_mask=(chi_tfidf<0.3)
featrue_best=list(vectorizer.get_feature_names_out()*chi_mask)
chival=list(chi_tfidf*chi_mask)

up_word={}
down_word={}
zero=[]
for idfval,fea in zip(chival,featrue_best):
  if fea in term_dict["up"]:
    up_word[fea]=idfval
  elif fea in term_dict["down"]:
    down_word[fea]=idfval
  else:
    zero.append(fea)

upup=sorted(up_word.keys())
downdown=sorted(down_word.keys())
up_feature =len(term_dict["up"])-len(upup)
down_feature =len(term_dict["down"])-len(downdown)
print(f"up Token: {up_feature}")
print(f"down Token: {down_feature}")
drop_feature=set(upup+downdown+zero)
drop_feature.discard("")
print(f"Drop Featrue: up : {len(upup)} , down : {len(downdown)} Zero : {len(zero)}")

x_chi_tfidf=pd.DataFrame(x_tfidf,columns=vectorizer.get_feature_names_out()).drop(columns=drop_feature)
# print(x_chi_tfidf)
# print(chi_tfidf_label)

up Token: 413
down Token: 162
Drop Featrue: up : 4156 , down : 2016 Zero : 5098


### Step13. Filter duplicated term between up & down trend

In [29]:
remove = []

for term in term_dict['down']:
    if term in term_dict['up']:
        remove.append(term)

for term in remove:
    term_dict['up'].remove(term)
    term_dict['down'].remove(term)

print('up trend terms:',len(term_dict['up']))
print('down trend terms:',len(term_dict['down']))


up trend terms: 4569
down trend terms: 2178


In [30]:
print(term_dict['up'])
print(term_dict['down'])

{'超象', '加緊', '札卡利亞', '影視', '拖延', '傾力', '遇壓走出盤', '富邦恒生', '台電', '月單', '醫學', '難逃', '高人', '趙承佑', '一展', '抗跌', '連鎖業', '每年', '輸出', '移轉', '貨架', '縣長', '家台廠', '恥股', '夜幕', '分組', '角落', '喊好', '機動', '啟用', '產學', '租約', '回春', '地檢署', '廣泛', '補貨', '康世人', '瀚宇', '黃文清', '槓桿化', '晶圓廠', '嘉碩', '同鄉會', '長者', '文全', '風扇', '臨床', '存在感', '居次', '所長', '出場', '垂直', '三四月', '大師', '國人', '美吾華', '朋億', '設施', '共產', '環景', '大宗', '利奇馬', '變局', '達控', '康控群', '高科技業', '八貫', '青年', '勤益控', '造紙股', '晶相', '竹科', '上市日', '募資案', '頹勢', '太子', '地上', '張居冠', '群眾', '士氣', '疫情中心', '前茅', '本益', '大半', '新竹', '聯化', '春意', '竹科管理局', '桃竹苗', '高榮', '前年', '間斷', '部門營', '庫和', '長科', '三創', '巫彩蓮', '豐達科', '神腦剩', '分列', '焦板', '日電貿', '奪得', '船高', '可憐', '灌壓', '高手', '安心', '完備', '艾蜜莉', '侯靜蘭', '總之', '滑鼠', '監管', '素果', '消費機', '症狀', '季筆電', '許勝雄', '骨文', '老店', '大局', '力晶集團', '殖利率股', '明日', '身歷其境', '月續', '不辦', '意向書', '邏輯', '類型', '三部曲', '製作', '沿續', '領息', '房產商', '冷風', '元來', '充分', '創新中心', '振華集團', '研發出', '大勢所趨', '和泰', '委任', '長榮豪', '熱點', '行車', '熱銷', '要不', '三十', '看升', '聽診器', '值此', '全界', '維田', '

### Step14. Build the vocabulary

In [31]:
all_term = list(term_dict['up'])+list(term_dict['down'])
vocab={} #用於建立向量空間模型

for idx, term in enumerate(all_term):
    vocab[term] = idx
print(vocab)
print(len(vocab))

{'超象': 0, '加緊': 1, '札卡利亞': 2, '影視': 3, '拖延': 4, '傾力': 5, '遇壓走出盤': 6, '富邦恒生': 7, '台電': 8, '月單': 9, '醫學': 10, '難逃': 11, '高人': 12, '趙承佑': 13, '一展': 14, '抗跌': 15, '連鎖業': 16, '每年': 17, '輸出': 18, '移轉': 19, '貨架': 20, '縣長': 21, '家台廠': 22, '恥股': 23, '夜幕': 24, '分組': 25, '角落': 26, '喊好': 27, '機動': 28, '啟用': 29, '產學': 30, '租約': 31, '回春': 32, '地檢署': 33, '廣泛': 34, '補貨': 35, '康世人': 36, '瀚宇': 37, '黃文清': 38, '槓桿化': 39, '晶圓廠': 40, '嘉碩': 41, '同鄉會': 42, '長者': 43, '文全': 44, '風扇': 45, '臨床': 46, '存在感': 47, '居次': 48, '所長': 49, '出場': 50, '垂直': 51, '三四月': 52, '大師': 53, '國人': 54, '美吾華': 55, '朋億': 56, '設施': 57, '共產': 58, '環景': 59, '大宗': 60, '利奇馬': 61, '變局': 62, '達控': 63, '康控群': 64, '高科技業': 65, '八貫': 66, '青年': 67, '勤益控': 68, '造紙股': 69, '晶相': 70, '竹科': 71, '上市日': 72, '募資案': 73, '頹勢': 74, '太子': 75, '地上': 76, '張居冠': 77, '群眾': 78, '士氣': 79, '疫情中心': 80, '前茅': 81, '本益': 82, '大半': 83, '新竹': 84, '聯化': 85, '春意': 86, '竹科管理局': 87, '桃竹苗': 88, '高榮': 89, '前年': 90, '間斷': 91, '部門營': 92, '庫和': 93, '長科': 94, '三創': 95, '巫彩蓮': 96, '豐達

### Step15.建立向量空間模型

In [None]:
print(content)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(vocabulary = vocab, use_idf = True) #用給定的termset建構向量，並使用idf加權
X = vectorizer.fit_transform(content) #用給定的文件集做計算
X

<1921x6747 sparse matrix of type '<class 'numpy.float64'>'
	with 12036 stored elements in Compressed Sparse Row format>

In [34]:
from scipy import sparse
sparse.save_npz("Group3_wistron.npz", X) #寫入整個稀疏矩陣

In [35]:
from scipy import sparse
X = sparse.load_npz("Group3_wistron.npz") #讀入稀疏矩陣

### Remove Stopwords & Use Chi2

In [36]:
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(content)
X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names())
X_train



Unnamed: 0,一世代,一九,一億,一再,一千,一千萬,一半,一口,一口氣,一同,...,龍彩霖,龍燈,龍頭,龍頭台積電,龍頭廠,龍頭惠普,龍頭股,龐大,龔明鑫,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

y_train = wistron_term_trend_df['label']

chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit_transform(X_train, y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一千萬,一半,一同,一城,一展,一峰,一己之力,一年,一度,一斑,...,鼓勵原文,齊揚,齊放,齊格,龍燈,龍頭台積電,龍頭廠,龍頭惠普,龐大,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=10, random_state=1)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9963560645497137

# Load Model & Use classifier

In [39]:
from scipy import sparse
X = sparse.load_npz("Group3_wistron.npz") #讀入稀疏矩陣
X

<1921x6747 sparse matrix of type '<class 'numpy.float64'>'
	with 12036 stored elements in Compressed Sparse Row format>

In [40]:
up_row=len(wistron_term_trend_df[wistron_term_trend_df["label"]=="up"])
down_row=len(wistron_term_trend_df[wistron_term_trend_df["label"]=="down"])
print(up_row,down_row)

1292 629


In [41]:
y=[] #用以儲存已知類別
for i in range(up_row): #已知前1292篇為漲停，標記為看漲 (此為取巧權宜作法)
  y.append(['看漲'])
for i in range(down_row): #已知後629篇為跌停，標記為看跌
  y.append(['看跌'])
  
y=np.array(y).reshape(-1)
print(y.shape)

(1921,)


### Naive Bayes 分類器

In [42]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() #naive bayes classifier
classifier.fit(X, y) #訓練

MultinomialNB()

In [43]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X,y,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.67272727 0.68229167 0.67708333 0.68489583 0.66666667]
Avg. Accuracy: 0.6767329545454546


In [44]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = MultinomialNB()
classifier.fit(X_train, y_train) #訓練

MultinomialNB()

In [45]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.7244367417677643


### Naive Bayes 分類器 with CHI_TFIDF

In [133]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() #naive bayes classifier
classifier.fit(x_chi_tfidf, chi_tfidf_label) #訓練
scores = cross_val_score(classifier,x_chi_tfidf,chi_tfidf_label,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.67012987 0.67447917 0.65104167 0.66145833 0.65364583]
Avg. Accuracy: 0.662150974025974


In [47]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = MultinomialNB()
classifier.fit(chi_X_train, chi_y_train) #訓練
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

Accuracy: 0.6811091854419411
              precision    recall  f1-score   support

           0       1.00      0.02      0.03       187
           1       0.68      1.00      0.81       390

    accuracy                           0.68       577
   macro avg       0.84      0.51      0.42       577
weighted avg       0.78      0.68      0.56       577

[[390   0]
 [184   3]]


### 決策樹

In [70]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy") #naive bayes classifier
classifier.fit(X, y) #訓練
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X,y,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.64155844 0.46875    0.38020833 0.34635417 0.6484375 ]
Avg. Accuracy: 0.49706168831168834


In [71]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = DecisionTreeClassifier(criterion="entropy")
classifier.fit(X_train, y_train) #訓練

DecisionTreeClassifier(criterion='entropy')

In [72]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.6412478336221837


In [73]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.91      0.52      0.66       390
          看跌       0.47      0.89      0.62       187

    accuracy                           0.64       577
   macro avg       0.69      0.71      0.64       577
weighted avg       0.77      0.64      0.65       577



In [74]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[204 186]
 [ 21 166]]


### 決策樹 with ChI_TFIDF

In [134]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy") #naive bayes classifier
classifier.fit(x_chi_tfidf, chi_tfidf_label) #訓練
scores = cross_val_score(classifier,x_chi_tfidf, chi_tfidf_label,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.58701299 0.55729167 0.52083333 0.5546875  0.57552083]
Avg. Accuracy: 0.559069264069264


In [135]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = DecisionTreeClassifier(criterion="entropy")
classifier.fit(chi_X_train, chi_y_train) #訓練
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

Accuracy: 0.6845753899480069
              precision    recall  f1-score   support

           0       0.52      0.50      0.51       191
           1       0.76      0.78      0.77       386

    accuracy                           0.68       577
   macro avg       0.64      0.64      0.64       577
weighted avg       0.68      0.68      0.68       577

[[300  86]
 [ 96  95]]


### kNN

In [77]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X,y,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.47272727 0.52604167 0.421875   0.4375     0.55208333]
Avg. Accuracy: 0.48204545454545455


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [78]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(X_train, y_train) #訓練

KNeighborsClassifier(n_neighbors=7)

In [79]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.6013864818024264


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [80]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.78      0.54      0.64       377
          看跌       0.45      0.71      0.55       200

    accuracy                           0.60       577
   macro avg       0.62      0.63      0.60       577
weighted avg       0.67      0.60      0.61       577



In [81]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[205 172]
 [ 58 142]]


### KNN with CHI_TFIDF

In [82]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,x_chi_tfidf, chi_tfidf_label,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


[0.61038961 0.578125   0.57291667 0.578125   0.6328125 ]
Avg. Accuracy: 0.5944737554112554


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [136]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(chi_X_train, chi_y_train) #訓練
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

Accuracy: 0.708838821490468
              precision    recall  f1-score   support

           0       0.49      0.45      0.47       164
           1       0.79      0.81      0.80       413

    accuracy                           0.71       577
   macro avg       0.64      0.63      0.63       577
weighted avg       0.70      0.71      0.71       577

[[335  78]
 [ 90  74]]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### SVC

In [87]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X,y,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.67792208 0.69010417 0.6796875  0.703125   0.6484375 ]
Avg. Accuracy: 0.6798552489177488


In [88]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train) #訓練

SVC(kernel='linear')

In [89]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.7469670710571924


In [90]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.74      0.98      0.84       400
          看跌       0.82      0.23      0.35       177

    accuracy                           0.75       577
   macro avg       0.78      0.60      0.60       577
weighted avg       0.76      0.75      0.69       577



In [91]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[391   9]
 [137  40]]


### SVC with CHI_TFIDF

In [93]:
from sklearn.svm import SVC
classifier = SVC(kernel='sigmoid')

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,x_chi_tfidf, chi_tfidf_label,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.65714286 0.640625   0.56770833 0.58854167 0.65364583]
Avg. Accuracy: 0.6215327380952381


In [137]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = SVC(kernel='sigmoid')
classifier.fit(chi_X_train, chi_y_train) #訓練
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

Accuracy: 0.7383015597920277
              precision    recall  f1-score   support

           0       0.67      0.27      0.38       175
           1       0.75      0.94      0.83       402

    accuracy                           0.74       577
   macro avg       0.71      0.61      0.61       577
weighted avg       0.72      0.74      0.70       577

[[379  23]
 [128  47]]


### Random Forest

In [95]:
from sklearn.ensemble  import RandomForestClassifier                          
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X,y,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.67792208 0.69010417 0.67708333 0.70572917 0.66666667]
Avg. Accuracy: 0.6835010822510822


In [96]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train) #訓練

RandomForestClassifier()

In [97]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.6637781629116117


In [98]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.88      0.60      0.71       398
          看跌       0.48      0.82      0.60       179

    accuracy                           0.66       577
   macro avg       0.68      0.71      0.66       577
weighted avg       0.75      0.66      0.68       577



In [99]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[237 161]
 [ 33 146]]


### Random Forest with CHi_TFIDF

In [100]:
from sklearn.ensemble  import RandomForestClassifier                          
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,x_chi_tfidf, chi_tfidf_label,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())


[0.66753247 0.63802083 0.578125   0.62239583 0.6640625 ]
Avg. Accuracy: 0.6340273268398269


In [101]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = RandomForestClassifier()
classifier.fit(chi_X_train, chi_y_train) #訓練
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

Accuracy: 0.6949740034662045
              precision    recall  f1-score   support

           0       0.70      0.27      0.39       208
           1       0.69      0.93      0.80       369

    accuracy                           0.69       577
   macro avg       0.70      0.60      0.59       577
weighted avg       0.70      0.69      0.65       577

[[345  24]
 [152  56]]


### 投票法

In [102]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# 定義分類器
clf1 = DecisionTreeClassifier(random_state=1)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = RandomForestClassifier()
clf4 = MultinomialNB(alpha=0.01)
clf5 = SVC(kernel='linear')

# 使用投票法組合分類器
clf_vote = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3), ('nb', clf4), ('svc', clf5)], voting='hard')

# 訓練投票法分類器
clf_vote.fit(X_train, y_train)

# # 計算精度
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy: " ,metrics.accuracy_score(y_test, y_pred)) 

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=7)),
                             ('rf', RandomForestClassifier()),
                             ('nb', MultinomialNB(alpha=0.01)),
                             ('svc', SVC(kernel='linear'))])

In [103]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料
classifier = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3), ('nb', clf4), ('svc', clf5)], voting='hard')
classifier.fit(X_train, y_train) #訓練

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=7)),
                             ('rf', RandomForestClassifier()),
                             ('nb', MultinomialNB(alpha=0.01)),
                             ('svc', SVC(kernel='linear'))])

In [104]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率

Accuracy: 0.7504332755632582


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [105]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.74      0.95      0.83       377
          看跌       0.80      0.37      0.51       200

    accuracy                           0.75       577
   macro avg       0.77      0.66      0.67       577
weighted avg       0.76      0.75      0.72       577



In [106]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[359  18]
 [126  74]]


### 投票法 with CHI_TFIDF

In [107]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# 定義分類器
clf1 = DecisionTreeClassifier(random_state=1)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = RandomForestClassifier()
clf4 = MultinomialNB(alpha=0.01)
clf5 = SVC(kernel='linear')

# 使用投票法組合分類器
clf_vote = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3), ('nb', clf4), ('svc', clf5)], voting='hard')

# 訓練投票法分類器
clf_vote.fit(x_chi_tfidf, chi_tfidf_label)

# # 計算精度
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy: " ,metrics.accuracy_score(y_test, y_pred)) 

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=7)),
                             ('rf', RandomForestClassifier()),
                             ('nb', MultinomialNB(alpha=0.01)),
                             ('svc', SVC(kernel='linear'))])

In [108]:
#以下將自身資料切成train及test兩組，重新訓練一次，測試模型準確率
from sklearn.model_selection import train_test_split
chi_X_train, chi_X_test,chi_y_train, chi_y_test = train_test_split(x_chi_tfidf, chi_tfidf_label, test_size=0.30) #隨機挑選30%當測試資料
classifier = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3), ('nb', clf4), ('svc', clf5)], voting='hard')
classifier.fit(chi_X_train, chi_y_train) #訓練

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=7)),
                             ('rf', RandomForestClassifier()),
                             ('nb', MultinomialNB(alpha=0.01)),
                             ('svc', SVC(kernel='linear'))])

In [109]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
chi_y_pred= classifier.predict(chi_X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(chi_y_test, chi_y_pred)) #比對答案，計算準確率

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.7746967071057193


In [110]:
from sklearn.metrics import classification_report  
print(classification_report(chi_y_test, chi_y_pred)) #印出分類報告

              precision    recall  f1-score   support

           0       0.77      0.40      0.53       181
           1       0.78      0.94      0.85       396

    accuracy                           0.77       577
   macro avg       0.77      0.67      0.69       577
weighted avg       0.77      0.77      0.75       577



In [111]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(chi_y_test, chi_y_pred, labels=[1,0])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[374  22]
 [108  73]]


# 測試集文章向量化處理, 移動回測

### 文章向量化處理：透過 sklearn 套件中 TfidfVectorizer 將斷詞結果去除停用詞後轉為空間向量

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [113]:
X_train

<1344x6747 sparse matrix of type '<class 'numpy.float64'>'
	with 8469 stored elements in Compressed Sparse Row format>

In [114]:
vectorizer = TfidfVectorizer(vocabulary = vocab, use_idf = True) #用給定的termset建構向量，並使用idf加權
X = vectorizer.fit_transform(content) #用給定的文件集做計算
X

<1921x6747 sparse matrix of type '<class 'numpy.float64'>'
	with 12036 stored elements in Compressed Sparse Row format>

In [115]:
X_train = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
X_train



Unnamed: 0,超象,加緊,札卡利亞,影視,拖延,傾力,遇壓走出盤,富邦恒生,台電,月單,...,儲蓄,湧進,中共,競逐,任日,台開,席一般,轉交給,印度諾伊達,周周漲
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
y_train = wistron_term_trend_df['label']

chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_train, y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,札卡利亞,傾力,遇壓走出盤,富邦恒生,台電,月單,醫學,高人,趙承佑,一展,...,儲蓄,湧進,中共,競逐,任日,台開,席一般,轉交給,印度諾伊達,周周漲
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9442998438313378

##### 透過 7-9 月的向量維度將 10 月份的文章也轉成 tf-idf 的向量空間

In [118]:
import datetime
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)
wistron_term_trend_df['date'] = pd.to_datetime(wistron_term_trend_df['date']).dt.date
# 設定訓練資料集的開始日期與結束日期
train_startDate = datetime.date(2022,5,1)
train_endDate = datetime.date(2022,7,30)

# 透過monpa對文章進行斷詞處理，並將個別斷詞結果寫在tokenStr這個字串變數中，最後再將訓練集中所有字串存在tokenStr_list中
train_tokenStr_list = []
for i in list(wistron_term_trend_df[wistron_term_trend_df['date'].between(train_startDate, train_endDate)].index):
    try:
#         txt = clearSentence(bbs23_ase['content'][i])
        sentence_list = utils.short_sentence(wistron_term_trend_df['content'][i])
        tokenStr = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        train_tokenStr_list.append(tokenStr)
    except:
        train_tokenStr_list.append('')

In [119]:
# 首先先計算指定月份的文章個別的tfidf
test_startDate = datetime.date(2021,8,1)
test_endDate = datetime.date(2021,8,30)

test_tokenStr_list = []
for i in list(wistron_term_trend_df[wistron_term_trend_df['date'].between(test_startDate, test_endDate)].index):
    try:
        txt = clearSentence(wistron_term_trend_df['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        test_tokenStr_list.append(tokenStr)
    except:
        test_tokenStr_list.append('')

In [120]:
# 接著將4月份的結果透過df.reindex這個方法映射到訓練集的向量空間中
y_test = wistron_term_trend_df['label']

vectorizer = TfidfVectorizer(vocabulary = vocab, use_idf = True)
X_test = vectorizer.fit_transform(test_tokenStr_list)
X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test



Unnamed: 0,札卡利亞,傾力,遇壓走出盤,富邦恒生,台電,月單,醫學,高人,趙承佑,一展,...,儲蓄,湧進,中共,競逐,任日,台開,席一般,轉交給,印度諾伊達,周周漲
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 建立預測模型

In [121]:
from sklearn.ensemble import GradientBoostingClassifier

In [122]:
# Train
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9442998438313378

### 檢視預測結果

In [123]:
new_df_ase['date'] = pd.to_datetime(new_df_ase['date']).dt.date
new_df_ase['date-1'] = new_df_ase['date'] - datetime.timedelta(days=1)

test_label = new_df_ase[new_df_ase['date-1'].between(test_startDate, test_endDate)]['label']

test_data = wistron_term_trend_df[wistron_term_trend_df['date'].between(test_startDate, test_endDate)]
test_data['predict_label'] = clf.predict(X_test)
predict_label = pd.merge(
    new_df_ase[new_df_ase['date-1'].between(test_startDate, test_endDate)], 
    test_data.groupby(['date', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
    left_on='date-1', right_on='date', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('{}月份預測準確率:'.format(test_startDate.month), accuracy_score(test_label, predict_label))


8月份預測準確率: 1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predict_label'] = clf.predict(X_test)


In [124]:
pd.DataFrame([test_label.reset_index(drop=True), predict_label]).T

Unnamed: 0,label,predict_label
0,up,up
1,up,up
