### Step1. Read stock price dataset

In [2]:
import pandas as pd
import re


all_df = pd.read_excel(open('stock_data_2019-2023.xlsx', 'rb'),['上市2023','上市2022','上市2021','上市2020','上市2019'])
all_df


{'上市2023':                 證券代碼         年月日    開盤價(元)    最高價(元)    最低價(元)    收盤價(元)  \
 0        0050 元大台灣50  2023/03/24       122     122.3    121.55    122.20   
 1       0051 元大中型100  2023/03/24     57.35     57.45     57.15     57.40   
 2          0052 富邦科技  2023/03/24     109.7     110.1     109.5    110.10   
 3          0053 元大電子  2023/03/24      61.2      61.3      61.1     61.30   
 4      0055 元大MSCI金融  2023/03/24     21.86     21.86     21.74     21.83   
 ...              ...         ...       ...       ...       ...       ...   
 61821     Y8886 未含金電  2023/01/03  14256.55  14263.22  14102.72  14249.89   
 61822    Y8887 TR未金電  2023/01/03         -         -         -  30316.15   
 61823     Y8888 不含金融  2023/01/03  12093.34  12215.15   12006.5  12204.82   
 61824     Y9997 報酬指數  2023/01/03         -         -         -  29568.47   
 61825     Y9999 加權指數  2023/01/03  14108.16  14237.94  14001.97  14224.12   
 
        成交量(千股)    成交值(千元)  成交筆數(筆)  流通在外股數(千股) 本益比-TSE 股價淨值比-TS

### Step2. Merge all stock prices between 2019 to 2023

In [3]:
wistron_df = pd.concat([all_df['上市2023'],all_df['上市2022'],all_df['上市2021'],all_df['上市2020'],all_df['上市2019']])
wistron_df.head()

Unnamed: 0,證券代碼,年月日,開盤價(元),最高價(元),最低價(元),收盤價(元),成交量(千股),成交值(千元),成交筆數(筆),流通在外股數(千股),本益比-TSE,股價淨值比-TSE
0,0050 元大台灣50,2023/03/24,122.0,122.3,121.55,122.2,13919,1697778,10776,2298000,-,-
1,0051 元大中型100,2023/03/24,57.35,57.45,57.15,57.4,41,2369,149,16000,-,-
2,0052 富邦科技,2023/03/24,109.7,110.1,109.5,110.1,383,42150,489,57000,-,-
3,0053 元大電子,2023/03/24,61.2,61.3,61.1,61.3,5,322,1013,4988,-,-
4,0055 元大MSCI金融,2023/03/24,21.86,21.86,21.74,21.83,220,4804,319,75654,-,-


### Step3. Extract 緯創 and rename columns

In [4]:
from datetime import timedelta
import numpy as np
wistron_df = wistron_df[wistron_df['證券代碼']==r'3231 緯創']
wistron_df['年月日']  = pd.to_datetime(wistron_df['年月日'])
wistron_df = wistron_df.rename(columns={"年月日":"date","收盤價(元)":"close"})
wistron_df.head()

Unnamed: 0,證券代碼,date,開盤價(元),最高價(元),最低價(元),close,成交量(千股),成交值(千元),成交筆數(筆),流通在外股數(千股),本益比-TSE,股價淨值比-TSE
744,3231 緯創,2023-03-24,40.2,40.35,38.65,40.0,69939,2766859,29215,2900166,10.2,1.18
2005,3231 緯創,2023-03-23,38.2,39.8,37.85,39.65,79210,3102975,36220,2900166,10.11,1.17
3266,3231 緯創,2023-03-22,38.45,38.45,37.3,38.2,50786,1930533,23001,2900166,9.74,1.13
4527,3231 緯創,2023-03-21,37.0,37.95,36.55,37.55,67075,2500129,29866,2900166,9.58,1.11
5784,3231 緯創,2023-03-20,35.0,35.9,35.0,35.9,28645,1019862,13347,2900166,9.16,1.06


### Step4. Label price trend

In [5]:
import datetime as dt
dates = []
labels= []

for date in wistron_df['date'].values:
    after_weekday = np.datetime64(date,'D').astype(dt.datetime).weekday()
    if after_weekday ==4:
      after_days = date+np.timedelta64(3, 'D')
    else:
      after_days = date+np.timedelta64(1, 'D')
    original_close_val = wistron_df[wistron_df['date']==date].close.values[0]
    if after_days not in wistron_df['date'].values :
      label  = 'last'
    elif wistron_df[wistron_df.date==after_days].close.values[0] > original_close_val  :
      label = 'up'
    elif wistron_df[wistron_df.date==after_days].close.values[0] < original_close_val  :
      label = 'down'
    else:
      label = 'keep'
    dates.append(date)
    labels.append(label)

new_df = {"date":dates,"label":labels}
new_df = pd.DataFrame(data=new_df)
new_df["date"] = pd.to_datetime(new_df["date"]).dt.date
new_df[new_df['label']=='up']

Unnamed: 0,date,label
1,2023-03-23,up
2,2023-03-22,up
3,2023-03-21,up
4,2023-03-20,up
5,2023-03-17,up
...,...,...
1018,2019-01-11,up
1019,2019-01-10,up
1021,2019-01-08,up
1023,2019-01-04,up


In [6]:
new_df[new_df['label']=='down']

Unnamed: 0,date,label
9,2023-03-13,down
11,2023-03-09,down
17,2023-03-01,down
25,2023-02-15,down
29,2023-02-09,down
...,...,...
1007,2019-01-28,down
1013,2019-01-18,down
1020,2019-01-09,down
1022,2019-01-07,down


In [7]:
new_df[new_df['label']=='last']

Unnamed: 0,date,label
0,2023-03-24,last
18,2023-02-24,last
38,2023-01-17,last
49,2022-12-30,last
108,2022-10-07,last
128,2022-09-08,last
197,2022-06-02,last
220,2022-04-29,last
238,2022-04-01,last
262,2022-02-25,last


### Step5. Read all docs and concat them

In [8]:
import os
import re
import pandas as pd
usecol=['post_time','title','content']
data=[]
path=os.path.join(os.getcwd(),"bda")
filename=os.listdir(path)
for fname in filename:
    if not re.search(".*/.xlsx",fname):
        data.append(pd. read_csv(os.path.join(path,fname),usecols=usecol))

all_term_df = pd.concat(data)
all_term_df.head()
# term_df = all_term_df[['post_time','title','content']]
term_df = all_term_df.dropna()
print(term_df.shape)
term_df.head()

(1942413, 3)


Unnamed: 0,post_time,title,content
0,2019-01-01 00:31:32,[公告] n199808m HitMaker 警告一次,1. 主旨：n199808m 違反板規4-2-1 警告一次 HitMake...
1,2019-01-01 01:28:28,Re: [新聞] 貿戰讓台商錢匯不出？ 海基會：漣漪效應,小弟有個想法不知可不可行 如果有人民幣想洗出來 出國一趟，比方去歐洲 用海外刷卡買黃金，存在...
2,2019-01-01 01:32:39,Re: [新聞] 貿易戰搶出口 透支效應2019衝擊中國經濟!,分身帳號好像要連坐水桶 《ＩＤ暱稱》tangolosss (配息配股變成大富翁)《經濟狀況...
3,2019-01-01 07:07:37,Re: [新聞] 陸媒：俄羅斯想聯手中國去美元化,所以說不要小看俄羅斯的險惡奸詐 俄國一直鼓勵中國發展人民幣石油 去美元化的種種行為 俄羅...
4,2019-01-01 07:35:29,[標的] (伺機作多)日元正二,1. 標的：元大日元指數正二 2. 分類：(伺機作多)多 3. 分析/正文： (...


### Step6. Extract time, title, and content, then drop null columns

### Step7. Preprocessing 

In [55]:
import re
def remove_tag_num(sentence):
  processed_word=[]
  processed_sentence=''
  try:
    processed_sentence = re.sub(r'<[^>]+>', '', sentence) # remove html tag
    processed_sentence = re.sub(r'[^\w\s]','',processed_sentence)
    processed_sentence = re.sub(r'[A-Za-z0-9]','',processed_sentence) # remove punctuation
    for word in processed_sentence.split():
      if not word.isdigit():
        processed_word.append(word) 
    processed_sentence=' '.join(processed_word)
  except:
    print(sentence)
  
  return processed_sentence
term_df['post_time'] = pd.to_datetime(term_df['post_time']).dt.date
term_df['content'] = term_df['content'].apply(lambda x: remove_tag_num(x))
term_df['title'] = term_df['title'].apply(lambda x: remove_tag_num(x))

term_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  term_df['post_time'] = pd.to_datetime(term_df['post_time']).dt.date


跌到塊殖利率有多少可能嗎


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  term_df['content'] = term_df['content'].apply(lambda x: remove_tag_num(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  term_df['title'] = term_df['title'].apply(lambda x: remove_tag_num(x))


Unnamed: 0,post_time,title,content
0,2019-01-01,公告警告一次,主旨違反板規警告一次違反板規警告一次說明經板主巡視板面發無意義閒聊文違反板規發不符格式之標的...
1,2019-01-01,新聞貿戰讓台商錢匯不出海基會漣漪效應,小弟有個想法不知可不可行如果有人民幣想洗出來出國一趟比方去歐洲用海外刷卡買黃金存在當地銀行反...
2,2019-01-01,新聞貿易戰搶出口透支效應衝擊中國經濟,分身帳號好像要連坐水桶ＩＤ暱稱配息配股變成大富翁經濟狀況小康登入次數次同天內只計一次有效文章...
3,2019-01-01,新聞陸媒俄羅斯想聯手中國去美元化,所以說不要小看俄羅斯的險惡奸詐俄國一直鼓勵中國發展人民幣石油去美元化的種種行為俄羅斯真的那麼...
4,2019-01-01,標的伺機作多日元正二,標的元大日元指數正二分類伺機作多多分析正文均線多頭排列站上年線棒型態破的高點含影線經濟日報推...


In [10]:
print(term_df.shape)
print(term_df.head())

(1942413, 3)
    post_time               title  \
0  2019-01-01              公告警告一次   
1  2019-01-01  新聞貿戰讓台商錢匯不出海基會漣漪效應   
2  2019-01-01  新聞貿易戰搶出口透支效應衝擊中國經濟   
3  2019-01-01    新聞陸媒俄羅斯想聯手中國去美元化   
4  2019-01-01          標的伺機作多日元正二   

                                             content  
0  主旨違反板規警告一次違反板規警告一次說明經板主巡視板面發無意義閒聊文違反板規發不符格式之標的...  
1  小弟有個想法不知可不可行如果有人民幣想洗出來出國一趟比方去歐洲用海外刷卡買黃金存在當地銀行反...  
2  分身帳號好像要連坐水桶ＩＤ暱稱配息配股變成大富翁經濟狀況小康登入次數次同天內只計一次有效文章...  
3  所以說不要小看俄羅斯的險惡奸詐俄國一直鼓勵中國發展人民幣石油去美元化的種種行為俄羅斯真的那麼...  
4  標的元大日元指數正二分類伺機作多多分析正文均線多頭排列站上年線棒型態破的高點含影線經濟日報推...  


### Step8. Extract Wistron's related docs  

In [11]:
import re

In [12]:
wistron_term_df = term_df[term_df['title'].str.contains(r'緯創')| term_df['content'].str.contains(r'緯創')]
print("Num of Document :",len(wistron_term_df))
print(wistron_term_df)

Num of Document : 7781
        post_time                        title  \
65     2019-01-02             其他加權股價指數成分股暨市值比重   
80     2019-01-02             新聞台灣製造中美貿易戰的意外贏家   
118    2019-01-03             其他加權股價指數成分股暨市值比重   
146    2019-01-03            其他台灣集中市場三大法人買賣超前名   
227    2019-01-04             其他加權股價指數成分股暨市值比重   
...           ...                          ...   
68449  2023-03-21  瑞信危機初步解除國際股市反彈台股在金融止穩盤堅挑戰月線   
68491  2023-03-21       台北股市升息太難料外資有疑慮三大法人賣超億元   
68562  2023-03-21        台北股市外資日買超前十大逾萬張這金釵也回補   
68563  2023-03-21       台北股市投信日連買前大電傳各半最愛這檔綠電股   
68567  2023-03-21      台北股市三大法人日買超前十大最愛電子心頭好是它   

                                                 content  
65     大家新年快樂雖然今天大盤有點糟就是了然後我因為空手限制期貨什麼空單都沒有反而還進場買現貨總之...  
80     原文連結原文內容台灣中美貿易戰的意外贏家史上最大最急的電子業回台潮正風起雲湧展開為了避開中美...  
118    慘拉高又殺低到底今年怎麼走呢經過了一天資料應該是準的了但還是一樣以下資料為個人試算可能有誤僅...  
146    台灣集中市場三大法人買賣超前名買超代碼名稱外資投信自營商合計買賣超元大台灣反欣興元大滬深正中...  
227    搞得我心情很差ˊˇˋ今天的大盤真的是狠狠修正電子股ˊˇˋ不過說起來總是有些人會說台積電如果剩...  
...                   

### Step9. Merge label df & Wistron's term df

In [13]:
wistron_term_df = wistron_term_df.rename(columns={'post_time':'date'})
all_term_trend_df = pd.merge(new_df, wistron_term_df) #把相同date的merge在一起
print('up trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='up']))
print('down trend documents:',len(all_term_trend_df[all_term_trend_df['label']=='down']))
print(all_term_trend_df.head)

up trend documents: 3692
down trend documents: 2484
<bound method NDFrame.head of             date label                  title  \
0     2023-03-21    up      新聞外資大砍金融股中信金遭賣萬張最   
1     2023-03-21    up                 閒聊盤後閒聊   
2     2023-03-21    up            情報上市投信買賣超排行   
3     2023-03-21    up            情報上市外資買賣超排行   
4     2023-03-21    up            標的緯創領錢聊勝於無多   
...          ...   ...                    ...   
6950  2019-01-02    up       台北股市大產業檔前景靚法人抱跨年   
6951  2019-01-02    up           外資買超股台積電中鋼友達   
6952  2019-01-02    up     緯創說明加碼印度擴增股本達億報導內容   
6953  2019-01-02    up  電腦設備拓印度市場緯創提高子公司額定資本額   
6954  2019-01-02    up         晚報台股出師不利紅盤日失守點   

                                                content  
0     原文標題外資大砍金融股中信金遭賣萬張最慘新光金玉山金同步遭砍原文連結發布時間年月日週二上午記...  
1     台股成交量億台指成交量口櫃買指成交量億台指期自營商投信外資台指選自營商投信外資漲停家數跌停家...  
2     標題投信買賣超排行來源富邦證券網址內文買超中興電中鋼緯創英業達欣興宏碁台塑中華新光鋼廣達永冠...  
3     標題外資買賣超排行來源台灣證交所網址內文買超緯創鴻海英業達金像電元大台灣反欣興元大滬深正潤泰...  
4     個月過去了這一趟抱了年個月今天了結其中張已參與去年除息遲來的紅包啊

### Step10. Tokenize title & content

In [14]:
#不要重跑
import monpa
monpa.use_gpu(True)
all_term_trend_df['title'] = all_term_trend_df['title'].apply(lambda x: monpa.cut(x))


+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.
GPU unavailable.


In [15]:
all_term_trend_df['content'] = all_term_trend_df['content'].apply(lambda x: monpa.cut(x))
all_term_trend_df.head()

Unnamed: 0,date,label,title,content
0,2023-03-21,up,"[新聞, 外資, 大砍, 金融股, 中信金, 遭, 賣, 萬, 張, 最]","[原文, 標題, 外資, 大, 砍, 金融股, 中信金, 遭, 賣, 萬, 張, 最, 慘,..."
1,2023-03-21,up,"[閒聊, 盤, 後, 閒聊]","[台股, 成交量, 億台指, 成交, 量口櫃, 買指, 成交量, 億, 台, 指期, 自營商..."
2,2023-03-21,up,"[情報, 上市, 投信, 買賣, 超, 排行]","[標題, 投信, 買賣, 超, 排行, 來源, 富邦證券, 網址, 內文, 買, 超, 中興..."
3,2023-03-21,up,"[情報, 上市, 外資, 買賣, 超, 排行]","[標題, 外資, 買賣, 超, 排行, 來源, 台灣, 證交所, 網址, 內文, 買, 超,..."
4,2023-03-21,up,"[標的, 緯創, 領錢, 聊勝, 於, 無, 多]","[個, 月, 過去, 了, 這, 一, 趟, 抱, 了, 年, 個, 月, 今天, 了結, ..."


In [16]:
# import monpa
# from monpa import utils
# monpa.use_gpu(True)

# def monpa_split(string_sentence):
#     word_list=[]
#     sentencelist=utils.short_sentence(string_sentence)
#     print(sentencelist)
#     for item in sentencelist:
#             reslutcut=monpa.cut(item) ## list
#             for term in reslutcut:
#                 term=term.strip()
#                 if len(term)>1 and not term.isdigit(): ## 長度>1 就是詞
#                      word_list.append(term)
#     return word_list

# upword=monpa_split(up_string)
# downword=monpa_split(down_string)
# print(downword)



In [17]:
def filter_gram(terms): #過濾掉一字詞語
    return [term for term in terms if len(term)>1]
all_term_trend_df['title'] = all_term_trend_df['title'].apply(lambda x: filter_gram(x))
all_term_trend_df['content'] = all_term_trend_df['content'].apply(lambda x: filter_gram(x))
all_term_trend_df.head()

Unnamed: 0,date,label,title,content
0,2023-03-21,up,"[新聞, 外資, 大砍, 金融股, 中信金]","[原文, 標題, 外資, 金融股, 中信金, 新光金, 玉山金, 同步, 原文, 連結, 發..."
1,2023-03-21,up,"[閒聊, 閒聊]","[台股, 成交量, 億台指, 成交, 量口櫃, 買指, 成交量, 指期, 自營商, 投信, ..."
2,2023-03-21,up,"[情報, 上市, 投信, 買賣, 排行]","[標題, 投信, 買賣, 排行, 來源, 富邦證券, 網址, 內文, 中興電, 中鋼緯創英業..."
3,2023-03-21,up,"[情報, 上市, 外資, 買賣, 排行]","[標題, 外資, 買賣, 排行, 來源, 台灣, 證交所, 網址, 內文, 緯創, 鴻海英業..."
4,2023-03-21,up,"[標的, 緯創, 領錢, 聊勝]","[過去, 今天, 了結, 其中, 參與, 去年, 除息遲來, 紅包, 績效, 一點, 就是,..."


### Step11. Extract up & down trend docs

In [42]:
wistron_term_trend_df = all_term_trend_df[(all_term_trend_df['label']=='up') | (all_term_trend_df['label']=='down')]
wistron_term_trend_df.to_csv(os.path.join(os.getcwd(),"bda")+'wistron_term_2gram_new.csv', index = False)
wistron_term_trend_df.shape

(6176, 4)

In [43]:
print('up trend documents:',len(wistron_term_trend_df[wistron_term_trend_df['label']=='up']))
print('down trend documents:',len(wistron_term_trend_df[wistron_term_trend_df['label']=='down']))

up trend documents: 3692
down trend documents: 2484


### Step12. Record up & down trend terms

In [45]:
#From here
payh=os.path.join(os.getcwd())
wistron_term_trend_df = pd.read_csv(os.path.join(os.getcwd(),"bda")+'wistron_term_2gram_new.csv')
term_dict={'up':set(),'down':set()}
remove =[]
content = [] #用於建立向量空間

for idx,row in wistron_term_trend_df.iterrows():
    row['title'] = eval(row['title'])
    row['content'] = eval(row['content'])
    content.append(" ".join(row['title'])+" ".join(row['content']))
    for term in row['title']:
        if term not in term_dict[row['label']]:
            term_dict[row['label']].add(term)
    for term in row['content']:
        if term not in term_dict[row['label']]:
            term_dict[row['label']].add(term)

print('up trend terms:',len(term_dict['up']))
print('down trend terms:',len(term_dict['down']))
print('content terms: ', content[0:5])
print('content terms: ', len(content))
wistron_term_trend_df.head


up trend terms: 27924
down trend terms: 22140
content terms:  ['新聞 外資 大砍 金融股 中信金原文 標題 外資 金融股 中信金 新光金 玉山金 同步 原文 連結 發布 時間 年月日 週二 上午 記者 署名 李瑞瑾 原文 內容 歐美銀行 信用 危機 連環 衝擊 金融 市場 信心 金融股 同步 遭到 外資 大砍 統計 上周 外資 買賣 個股 金融股 中信金 最多 新光金 玉山金 買超 最多 聯電 最多 根據 台灣證券交易所 統計 上周 外資 集中 市場 統計 年年 月日 外資 累計 買超 買超 最多 聯電 京元 電子 緯創', '閒聊 閒聊台股 成交量 億台指 成交 量口櫃 買指 成交量 指期 自營商 投信 外資 指選 自營商 投信 外資 漲停 家數 跌停 家數 成交量 排行 元大 台灣 反飛宏 群創 緯創 凌群 成交值 排行 台積電 世芯 創意 普飛宏 親愛 只要 解套', '情報 上市 投信 買賣 排行標題 投信 買賣 排行 來源 富邦證券 網址 內文 中興電 中鋼緯創英業達欣興宏碁 台塑 中華新光鋼廣達永冠南電 奇鋐 穎崴 友達台勝科聚陽嘉基中砂 瑞鼎仁寶 緯穎 瑞昱智邦東元元大高股息開發金晶華元大金台積電 台泥長榮航 太力智雄獅新唐金像電視陽第一金聯電聯強元大台灣高息低波元大 台灣聯發科 兆豐金聯詠國泰金遠東新光寶科盛達台灣 大賣 超群創臻鼎力積電 台光電華航 上緯 投控 世紀鋼致伸中鴻華擎裕民華新台耀 中信 金上品 同欣電 玉山金技 嘉榮成和康生', '情報 上市 外資 買賣 排行標題 外資 買賣 排行 來源 台灣 證交所 網址 內文 緯創 鴻海英業達金像電元大 台灣 欣興元大 滬深正潤泰新國泰智能電動車永豐金 臻鼎 奇鋐 耿鼎 群創 光罩 永冠仁寶聯合 台積電中鴻堤維西王道銀行 和碩國泰永續高股息台新金 兆豐金 台泥 裕隆 統一實中華毅嘉國泰費城半導體台汽電建準聯電元大高股息中環南緯威健智邦晶豪科華新期 街口 布蘭 特正達運 台中銀京元電子新光鋼佳龍精元賣超中信金陽明華航 長榮遠東新華邦電彩晶元晶國泰金萬海玉山金富', '標的 緯創 領錢 聊勝過去 今天 了結 其中 參與 去年 除息遲來 紅包 績效 一點 就是 懶快 信仰 不足 所以 打算 薯條 你們 魚尾 可以 後續 操作 可能 就是 基期

### Step13. Filter duplicated term between up & down trend

In [22]:
remove = []

for term in term_dict['down']:
    if term in term_dict['up']:
        remove.append(term)

for term in remove:
    term_dict['up'].remove(term)
    term_dict['down'].remove(term)

print('up trend terms:',len(term_dict['up']))
print('down trend terms:',len(term_dict['down']))



up trend terms: 19354
down trend terms: 13570


In [23]:
print(term_dict['up'])
print(term_dict['down'])

{'中美晶註', '乙特台新金中鋼國泰臺灣加權鴻海中信金旺宏元大台灣反東森臺企銀', '週期', '電子報', '宏碁東哥', '加速卡', '產品值', '特定穎', '自年以來', '日月光投控遠東新台塑', '銀康控', '愛國', '瑞昱晶豪科富采華擎矽格華碩', '証正富', '布局緯謙', '組人', '兩國', '雙衰', '長興億光遠東銀京城銀國泰美國道瓊', '兆豐金寶成元大金國泰永續高股息力積電中信金', '管理廠', '第一惠普', '升群益證', '富邦金緯創大成鋼新唐正新鴻準國泰智能電動車台中銀', '年將', '金成', '張不等', '說真', '台積電彩晶中信金', '永昌', '輪攻檔', '靈敏', '美債年', '國泰金國泰中國元大反國巨南電華新瀚宇博樺漢富邦中証南亞科國泰臺灣加權反國產智原', '強茂晶豪科台達化彰銀', '益航中鴻冠德中工南亞科信邦萬潤奇鋐超眾台嘉碩', '上海銀聯發科', '全科', '激化', '藥華藥', '樺漢科技元富證券', '台化致', '滬深正鴻海聯詠金寶中壽', '富邦金國泰金期元大', '彰銀中壽', '廣達英業達精華東聯國喬永豐餘矽統旭軟廣運晶電光寶科華容', '健鼎信義中興電', '良維僑威科', '台灣華邦電允強', '友達中', '萬海燿華定穎', '客製化', '宣德晶電光洋科玉晶光建準', '台泥華通國泰特新光鋼中信中國', '京元電子景碩欣興聯詠聯發科', '玉山金弘憶股益登泰碩京元電子', '鴻海董', '閎康華通元大滬深正中', '支等', '弘塑正達', '兆豐金緯創聯強大聯大世界南亞科環球晶晶電光寶科光洋科', '呂朝勝', '搶眼', '王台生', '強茂玉晶光長榮泰鼎金像電勤美聯陽創意中鋼', '側重', '永豐金富邦金華通台表科', '強茂劍麟', '永豐金宏全仁寶', '基泰裕民', '安控', '行銷長', '欣興電子國巨', '佳世達精元台新金國票金中工聯電臺企銀', '統一世紀鋼聯強', '友訊台積電', '祥碩大江', '潤泰材台積電', '台紙', '台灣大陸', '富邦國泰金日月光', '聯合大', '新潤', '元大台灣反元大台灣新光金', '凌巨裕民鳳凰興富發英業達亞', '永冠鈺齊第一金兆豐金聯強大亞', '億光力成晶電中信中國元大股智邦訊芯'

### Step14. Build the vocabulary

In [24]:
all_term = list(term_dict['up'])+list(term_dict['down'])
vocab={} #用於建立向量空間模型

for idx, term in enumerate(all_term):
    vocab[term] = idx
print(vocab)
print(len(vocab))

{'中美晶註': 0, '乙特台新金中鋼國泰臺灣加權鴻海中信金旺宏元大台灣反東森臺企銀': 1, '週期': 2, '電子報': 3, '宏碁東哥': 4, '加速卡': 5, '產品值': 6, '特定穎': 7, '自年以來': 8, '日月光投控遠東新台塑': 9, '銀康控': 10, '愛國': 11, '瑞昱晶豪科富采華擎矽格華碩': 12, '証正富': 13, '布局緯謙': 14, '組人': 15, '兩國': 16, '雙衰': 17, '長興億光遠東銀京城銀國泰美國道瓊': 18, '兆豐金寶成元大金國泰永續高股息力積電中信金': 19, '管理廠': 20, '第一惠普': 21, '升群益證': 22, '富邦金緯創大成鋼新唐正新鴻準國泰智能電動車台中銀': 23, '年將': 24, '金成': 25, '張不等': 26, '說真': 27, '台積電彩晶中信金': 28, '永昌': 29, '輪攻檔': 30, '靈敏': 31, '美債年': 32, '國泰金國泰中國元大反國巨南電華新瀚宇博樺漢富邦中証南亞科國泰臺灣加權反國產智原': 33, '強茂晶豪科台達化彰銀': 34, '益航中鴻冠德中工南亞科信邦萬潤奇鋐超眾台嘉碩': 35, '上海銀聯發科': 36, '全科': 37, '激化': 38, '藥華藥': 39, '樺漢科技元富證券': 40, '台化致': 41, '滬深正鴻海聯詠金寶中壽': 42, '富邦金國泰金期元大': 43, '彰銀中壽': 44, '廣達英業達精華東聯國喬永豐餘矽統旭軟廣運晶電光寶科華容': 45, '健鼎信義中興電': 46, '良維僑威科': 47, '台灣華邦電允強': 48, '友達中': 49, '萬海燿華定穎': 50, '客製化': 51, '宣德晶電光洋科玉晶光建準': 52, '台泥華通國泰特新光鋼中信中國': 53, '京元電子景碩欣興聯詠聯發科': 54, '玉山金弘憶股益登泰碩京元電子': 55, '鴻海董': 56, '閎康華通元大滬深正中': 57, '支等': 58, '弘塑正達': 59, '兆豐金緯創聯強大聯大世界南亞科環球晶晶電光寶科光洋科': 60, '呂朝勝': 61, '搶眼': 62, '王台生': 63, '強茂玉晶光長榮泰鼎金像電勤美聯陽創

### Step15.建立向量空間模型

In [47]:
# print(content)
uprow=len(wistron_term_trend_df[wistron_term_trend_df['label']=='up'])
downrow=len(wistron_term_trend_df[wistron_term_trend_df['label']=='down'])
print(uprow,downrow)

3692 2484


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(vocabulary = vocab, use_idf = True) #用給定的termset建構向量，並使用idf加權
X = vectorizer.fit_transform(content) #用給定的文件集做計算
X.shape



(6176, 32924)

In [32]:
from scipy import sparse
sparse.save_npz(os.path.join(os.getcwd(),"bda")+"model_new.npz", X) #寫入整個稀疏矩陣

### Load Model

In [33]:
from scipy import sparse
X = sparse.load_npz(os.path.join(os.getcwd(),"bda")+"model_new.npz") #讀入稀疏矩陣
X.shape

(6176, 32924)

In [48]:
import numpy as np
y=[] #用以儲存已知類別
for i in range(uprow): #已知前612篇為漲停，標記為看漲 (此為取巧權宜作法)
  y.append(['看漲'])
for i in range(downrow): #已知後126篇為跌停，標記為看跌
  y.append(['看跌'])

y=np.ravel(y) #轉換成array一維陣列，避免警告訊息
print(y.shape)

(6176,)


In [36]:
#給定任何一段內容當作查詢
query='筆電大廠緯創(3231)去年財報逆勢繳出亮麗成績單，EPS達4.01元，將配發2.6元現金股利，發放率65%，以週二(14日)收盤價計算殖利率7.7%。 \
  緯創週二傍晚召開的法說會中，看今年營運審慎樂觀，預估全年營收仍可維持成長，桌機、筆電等產品線評估持平至個位數衰退，但雲端需求仍望，伺服器出貨將優於去年。\
由於母公司本體業務產品結構調整效益發揮，旗下雲端伺服器大廠緯穎(6669)獲利又年年創高，緯創近年營運蒸蒸日上，去年更不畏PC市場的急凍殺戮，營收獲利持續創高。\
累計該公司全年合併營收9846.2億元，營業淨利達274.72億元，稅後淨利達111.62億元，EPS 4.01元。其中，第四季合併營收2638.5億元，單季營業淨利首破百億大關，\
  達106.94億元，稅後淨利為40.76億元，優於上季，但較同期下滑逾24%，EPS 1.46元。\
董事長林憲銘在法說會上特別談到AI大趨勢，看好聊天機器陣ChatGPT將帶動AI伺服器需求的成長，緯創受惠，而車電市場未來10年則會有大成長，營收規模上看千億元大關。\
至於今年，雖PC市況不佳，但緯創將持續深耕高毛利非PC業務，包括IPC、車用、服務、IoT及高階手持裝置等，估計今年非PC與顯示器業務貢獻之營收有機會衝3千億大關。\
而因應未來營運發展所需，緯創週二董事會決議將在不超過2.5億股的額度內，擬辦理現金增資發行普通股或參與發行海外存託憑證(GDR)，另訂6/15召開今年度的股東常會。\
受財報利多與良好的年度展望激勵，緯創週三(15日)股價大漲逾3%，最高來到34.85元，創逾2年半新高價，在PC大廠中一支獨秀，三大法人今年以來大買，外資買逾13萬張。'

# query='台股加權指數在最近9個交易日，從最高到最低點，跌了2,544點，創下史上最快速的失速列車紀錄；\
#   12日台股盤中急挫1,418點，市場衰鴻遍野，據統計，盤中最多曾有711檔個股觸及跌停、占上市櫃的四成比重，\
#   最後仍有251檔收跌停，其中，陽明等15檔股價亮燈跌停，仍有7千張以上賣單高掛，貨櫃三雄均入榜。'

#以下套用斷詞小範例
import monpa
from monpa import utils
str='' #暫存本篇斷詞後的內容用
sentence_list = utils.short_sentence(query) #斷句
for item in sentence_list:
  result_cut = monpa.cut(item) #斷詞
  for term in result_cut:
    term=term.strip() #去除前後多餘空白
    if(len(term)>1): #若詞長>1
      str=str+' '+term
print(str)

 筆電 大廠 緯創 3231 去年 財報 逆勢 繳出 亮麗 成績單 EPS 4.01 配發 2.6 現金 股利 發放率 65 週二 14 收盤價 計算 殖利率 7.7 緯創 週二 傍晚 召開 法說會 今年 營運 審慎 樂觀 預估 全年 營收 維持 成長 桌機 筆電 產品線 評估 個位數 衰退 雲端 需求 伺服器 出貨 優於 去年 由於 公司 本體 業務 產品 結構 調整 效益 發揮 旗下 雲端 伺服器 大廠 緯穎 6669 獲利 年年 創高 緯創 近年 營運 蒸蒸日 去年 PC 市場 急凍 殺戮 營收 獲利 持續 創高 累計 公司 全年 合併 營收 9846.2 營業 淨利 274.72 稅後 淨利 111.62 EPS 4.01 其中 第四季 合併 營收 2638.5 單季 營業 淨利 破百億 大關 106.94 淨利 40.76 優於 同期 下滑 24 EPS 1.46 董事長 林憲銘 法說會 特別 談到 AI 趨勢 看好 聊天 機器 ChatGPT 帶動 AI 伺服器 需求 成長 緯創 受惠 車電 市場 未來 10年 成長 營收 規模 千億 大關 至於 今年 PC 市況 不佳 緯創 持續 深耕 毛利 PC 業務 包括 IPC 車用 服務 IoT 高階 手持 裝置 估計 今年 PC 顯示器 業務 貢獻 營收 機會 3千億 大關 未來 營運 發展 緯創 週二 董事會 決議 超過 2.5 額度 辦理 現金 增資 發行 普通 參與 發行 海外 存託 憑證 GDR 15 召開 今年度 股東常會 財報 利多 良好 年度 展望 激勵 緯創 週三 15 股價 大漲 來到 34.85 高價 PC 大廠 獨秀 法人 今年 以來 外資 13萬


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(vocabulary=vocab,use_idf=True) #用給定的termset建構向量，並使用idf加權 (這邊作法值得商榷)
q=vectorizer.fit_transform([str]) #將查詢也套入同一個向量空間
q.shape



(1, 32924)

In [51]:
print(q) #印出該查詢做觀察

  (0, 17314)	0.4472135954999579
  (0, 12867)	0.4472135954999579
  (0, 12396)	0.4472135954999579
  (0, 11709)	0.4472135954999579
  (0, 1162)	0.4472135954999579


### Build kNN vote

In [52]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(q[0:1], X).flatten() #與給定文件集的向量做相似度計算
related_docs_indices = cosine_similarities.argsort() #將相似度由小至大做排序，並轉換成文件編號
d=related_docs_indices[:-8:-1] #從後面取7個文件編號 (也就是相似度最大的前7名)
print(d)

[ 119  134 3779   46   61 3076  319]


In [53]:
#以kNN相似文件來投票決定類別
pred_up=0
pred_down=0
for i in d:
  if(i>=612): #已知文件編號0~611為漲停，612~737為跌停，故依此範圍判定 (此為取巧權宜作法)
    pred_down+=1 #票數加1
  else:
    pred_up+=1 #票數加1
print('pred_up:', pred_up, 'pred_down:', pred_down)

if(pred_up>pred_down):
  print('pred_label: up') #判定為漲停
elif(pred_up<pred_down):
  print('pred_label: down') #判定為跌停
else:
  print('pred_label: even') #票數平手 

pred_up: 5 pred_down: 2
pred_label: up


In [54]:
import pandas as pd
pd.DataFrame(X.toarray(),columns=vocab) #將X印出做觀察，把稀疏矩陣轉成array，再把欄位名稱用termset標示上去

Unnamed: 0,中美晶註,乙特台新金中鋼國泰臺灣加權鴻海中信金旺宏元大台灣反東森臺企銀,週期,電子報,宏碁東哥,加速卡,產品值,特定穎,自年以來,日月光投控遠東新台塑,...,宏碁華新潤泰新中信,長榮航居冠,滬深正富邦彩晶亞太電華映,次序,群益深証中小玉山金南港台泥,全新矽力天鈺瑞昱創惟頎邦穩懋中信中國,苗頭,啟痐舅,維持率,同小可因
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pd.DataFrame(y) #將y印出做觀察

Unnamed: 0,0
0,看漲
1,看漲
2,看漲
3,看漲
4,看漲
...,...
5605,看跌
5606,看跌
5607,看跌
5608,看跌


In [None]:
pd.DataFrame(q.toarray(),columns=vocab) #將q印出做觀察

Unnamed: 0,二連三,C Ratio,略顯,命潮,恐龍,銅鑼廠,2800萬,Sell in May and Go Away,旨 緯創,總算,...,整合推,長榮航 第一金 元晶 兆豐金 亞聚,原油正2 永豐金 晶電 燿華 鈞寶 仲琦 柏承 奇力新 亞翔 偉詮電 聯嘉 台灣高鐵 新興 華航,統一 陽明 遠東新 聯電 兆豐金 英業達 彰銀 台積電 第一金 富邦金,5號,廣宣,光業,Wistron InfoComm Mexico S A de C V WIMX WMX,美好證,萊因聯合
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### naive bayes classifier Cut 80/20

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() #naive bayes classifier，準備好一個空的分類器
classifier.fit(X, y) #訓練，Ｘ就是要丟進去訓練的矩陣，y就是目標矩陣

MultinomialNB()

In [None]:
y_pred = classifier.predict(q) #預測，產出結果
y_pred

array(['看漲'], dtype='<U2')

In [None]:
from sklearn.model_selection import train_test_split #可以用來切成訓練資料及跟驗證資料集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #隨機挑選30%當測試資料，70%當訓練資料
classifier = MultinomialNB()
classifier.fit(X_train, y_train) #訓練

MultinomialNB()

### Calculate Accuracy

In [None]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
y_pred= classifier.predict(X_test) #用測試資料預測
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) #比對答案，計算準確率，y_test是答案

Accuracy: 0.7052881758764111


In [None]:
from sklearn.metrics import classification_report  
print(classification_report(y_test, y_pred)) #印出分類報告

              precision    recall  f1-score   support

          看漲       0.68      0.98      0.80      1029
          看跌       0.88      0.28      0.42       654

    accuracy                           0.71      1683
   macro avg       0.78      0.63      0.61      1683
weighted avg       0.76      0.71      0.66      1683



In [None]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(y_test, y_pred, labels=['看漲','看跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌  (機器預測的)
#    看漲         TN     FP
#    看跌         FN     TP
# （真實的情況）

[[1004   25]
 [ 471  183]]


### naive bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

from sklearn.model_selection import cross_val_score #cross_val_score，可以直接計算出score

#交叉驗證，計算準確率，cross validation = cv = 5代表把資料切成5等分，4份做訓練，1份做測試，並且重複做5次求平均值。
#也可以用cv=10，代表把資料切成10等分，9份做訓練，1份做測試，並且重複10次求平均值。
scores = cross_val_score(classifier,X,y,cv=10,scoring='accuracy') 

print(scores)
print("Avg. Accuracy:",scores.mean())

[0.60784314 0.61853832 0.63279857 0.63636364 0.6399287  0.6399287
 0.6399287  0.59893048 0.62923351 0.61675579]
Avg. Accuracy: 0.6260249554367201


### DecisionTreeClassifier  

In [None]:
from sklearn.tree import DecisionTreeClassifier                          
classifier = DecisionTreeClassifier(criterion="entropy")

#接下來就是跟前一個眼㔯法一樣的方法，一樣計算5次求平均正確率
from sklearn.model_selection import cross_val_score
scores_dct = cross_val_score(classifier,X,y,cv=10,scoring='accuracy') #交叉驗證，計算準確率
print(scores_dct)
print("Avg. Accuracy:",scores_dct.mean())

[0.57397504 0.52941176 0.53832442 0.55614973 0.58823529 0.4688057
 0.53297683 0.61497326 0.58823529 0.50445633]
Avg. Accuracy: 0.549554367201426


### kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)

#以下重複
from sklearn.model_selection import cross_val_score
scores_knn = cross_val_score(classifier,X,y,cv=10,scoring='accuracy') #交叉驗證，計算準確率
print(scores_knn)
print("Avg. Accuracy:",scores_knn.mean())

[0.51158645 0.50445633 0.49554367 0.56684492 0.45098039 0.45632799
 0.48841355 0.5026738  0.48841355 0.50980392]
Avg. Accuracy: 0.4975044563279858


### SVM

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

from sklearn.model_selection import cross_val_score
scores_svm = cross_val_score(classifier,X,y,cv=10,scoring='accuracy') #交叉驗證，計算準確率
print(scores_svm)
print("Avg. Accuracy:",scores_svm.mean())

[0.64705882 0.5828877  0.60784314 0.61319073 0.61675579 0.57575758
 0.63636364 0.6631016  0.63279857 0.5258467 ]
Avg. Accuracy: 0.6101604278074866
