# 前期處理

* 輸入套件

In [1]:
import pandas as pd
import numpy as np
import monpa
import re
from datetime import datetime, timedelta
from collections import Counter

def clearlist(list1):  # 清除無意義字元
    for j in range(len(list1)):
        for i in range(len(list1[j])):
            list1[j][i] = re.sub(r'[\W\dA-Za-z]','',list1[j][i])
    return(list1)

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.


* 資料匯入與初期整理（資料型態、空值、合併資訊）

In [2]:
bbs = pd.read_csv("bbs.csv",encoding="utf-8")
news = pd.read_csv("news.csv",encoding="utf-8")
forum = pd.read_csv("forum.csv",encoding="utf-8")

In [3]:
# Drop掉無意義的欄位
news.drop(news.columns[[0,1,2,3,6]], axis = 1, inplace = True)
bbs.drop(bbs.columns[[0,1,2,3,4,7]], axis = 1, inplace = True)
forum.drop(forum.columns[[0,1,2,3,4,7]], axis = 1, inplace = True)

# 將post_time欄位的data type由object改為datetime
news["post_time"] = pd.to_datetime(news["post_time"], format = "%Y/%m/%d")
bbs["post_time"] = pd.to_datetime(bbs["post_time"], format = "%Y/%m/%d")
forum["post_time"] = pd.to_datetime(forum["post_time"], format = "%Y/%m/%d")

# 把全部的新聞、部落格、與論壇文章合併為單一個dataframe，取名為articles
articles = pd.concat([news, bbs, forum])
articles["post_time"] = articles["post_time"].dt.date
articles.dropna(inplace = True)
articles.reset_index(drop = True, inplace = True)

# 匯入股價資訊
stockprice = pd.read_excel("大立光股價2016-2018.xlsx", sheet_name = "Total Price")
stockprice["年月日"] = stockprice["年月日"].dt.date

# 第一題：各挑選出看漲及看跌的一批文章，從中取出關鍵字列表， 建構向量空間

### Step1: 找出股價波動超過±σ的前n日日期，並抓出該些日期中有包含主題的文章

* 先挑出σ在3%以上的日期

In [4]:
Up_date = stockprice[stockprice["波動(漲跌幅)"] >= 0.03]["年月日"].values
Down_date = stockprice[stockprice["波動(漲跌幅)"] <= -0.03]["年月日"].values

* 接著利用上述所查詢到的157個日期，回推該日期前n日內的所有新聞，在這我們取n = 2

In [5]:
# 挑選前n日內所有文章
n = int(input("挑選n日內所有文章，n ="))  # 註：當日文章請輸入0
Up_publish_date = []
Down_publish_date = []

for i in range(len(Up_date)):
    for j in range(n):
        Up_publish_date.append(Up_date[i] - pd.Timedelta(days = j+1))
for i in range(len(Down_date)):
    for j in range(n):
        Down_publish_date.append(Down_date[i] - pd.Timedelta(days = j+1))
        
Up_title = []  # 上漲文章標題
Up_content = []  # 上漲文章內容
Down_title = []  # 下跌文章標題
Down_content = []  # 下跌文章內容

for i in range(len(articles)):
    if articles["post_time"][i] in Up_publish_date:
        Up_title.append(articles["title"][i])
        Up_content.append(articles["content"][i])
        
    if articles["post_time"][i] in Down_publish_date:
        Down_title.append(articles["title"][i])
        Down_content.append(articles["content"][i])

print("上漲文章有：" + str(len(Up_title)) + "篇")
print("下跌文章有：" + str(len(Down_title)) + "篇")

挑選n日內所有文章，n =2
上漲文章有：48486篇
下跌文章有：49122篇


* 抓出大立光漲跌超過2%前兩日裡，與大立光相關的文章

In [6]:
target = input("輸入要尋找的主題")  # 輸入要尋找的主題
Up_indexlist = []  # 上漲文章index
Up_artlist = []  # 所有上漲文章
Down_indexlist = []  # 下跌文章index
Down_artlist = []  # 所有下跌文章

for i in range(len(Up_title)):
    count = 0
    if target in str(Up_title[i]):
        count += 1
    elif target in str(Up_content[i]):
        count += 1

    if count >= 1:
        Up_indexlist.append(i)
        Up_artlist.append(Up_content[i])  

for i in range(len(Down_title)):
    count = 0
    if target in str(Down_title[i]):
        count += 1
    elif target in str(Down_content[i]):
        count += 1

    if count >= 1:
        Down_indexlist.append(i)
        Down_artlist.append(Down_content[i])  

for idx,i in enumerate(Up_artlist):
    Up_artlist[idx] = i.split("，")
clearlist(Up_artlist)  
for idx,i in enumerate(Down_artlist):
    Down_artlist[idx] = i.split("，")
clearlist(Down_artlist)

print("關於『"+target+"』的上漲文章總共出現『"+str(len(Up_indexlist))+"』篇")
print("關於『"+target+"』的下跌文章總共出現『"+str(len(Down_indexlist))+"』篇")

輸入要尋找的主題大立光
關於『大立光』的上漲文章總共出現『1872』篇
關於『大立光』的下跌文章總共出現『1664』篇


### Step2. 使用monpa切出適當字詞，並計算分析指標

In [7]:
def LongCut(long_sentence):  # Monpa切字
    seg = []
    for item in long_sentence.split(" "):
        if item != "\n": 
            seg.extend(monpa.cut(item))
    return seg


def GramCounter(Content):  # 計算TF、DF
    data_list = []
    data_monpa = []

    for i in range(len(Content)):
        data_monpa = []
        for j in Content[i]:
            data_monpa.extend(LongCut(j))
        data_list.append(data_monpa)

    m = data_list
    c = Counter()
    dfcount = Counter()
    for i in range(len(m)):
        for j in range(len(m[i])):
            c[m[i][j]]+=1

    for i in range(len(m)):
        k = list(set(m[i]))
        for j in range(len(k)):
            dfcount[k[j]]+=1 

    df = pd.DataFrame(dfcount.items(),columns = ["Term","DF"])    
    dfc = pd.DataFrame(c.items(),columns = ["Term","TF"])
    final = pd.merge(dfc,df, on="Term")
    
    return final


def OtherInfo(gram): # 計算W(t,f)、idft、TF-IDF
    N = 12735  # 全部有關大立光的文章
    
    WtdList = []   # 算W(t,d)
    Wtd = 1 + np.log10(np.array(gram["TF"])) 
    for i in Wtd:
        wtd = float("%0.3f" %i)
        WtdList.append(wtd)
        
    IDFtList = []   # 算 idf(t)
    IDFt = np.log10(N / np.array(gram["DF"]))   
    for j in IDFt:
        idft = float("%0.3f" %j)
        IDFtList.append(idft)
     
    TFIDFList = []   # 算TF-IDF
    TFIDF = np.array(WtdList) * np.array(IDFt)
    for k in TFIDF:
        tfidf = float("%0.3f" %k)
        TFIDFList.append(tfidf)
    
    gram["Wtf"] = WtdList
    gram["IDFt"] = IDFtList
    gram["TF-IDF"] = TFIDFList
    
    return gram

* 以下四段為切字與計算TF-IDF等指標，運算會消耗約25分鐘，我已經把結果存成兩個csv檔了，可以直接跳到輸入cvs檔的部分繼續運行

In [27]:
Up_gram = GramCounter(Up_artlist)
Down_gram = GramCounter(Down_artlist)

In [39]:
Up_gram = OtherInfo(Up_gram)
Up_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,蘋果,1665,712,4.221,1.253,5.287
1,銷售,321,229,3.507,1.745,6.120
2,無力,79,75,2.898,2.230,6.462
3,加上,1026,663,4.011,1.283,5.148
4,大陸,243,181,3.386,1.847,6.255
5,在內,18,17,2.255,2.875,6.482
6,的,9495,1555,4.977,0.913,4.545
7,其餘,133,119,3.124,2.029,6.340
8,品牌,64,57,2.806,2.349,6.592
9,業績,522,343,3.718,1.570,5.836


In [40]:
Down_gram = OtherInfo(Down_gram)
Down_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,台積電,1494,882,4.174,1.160,4.840
1,奈米,74,40,2.869,2.503,7.181
2,完勝,1,1,1.000,4.105,4.105
3,三星,111,56,3.045,2.357,7.176
4,製程,57,37,2.756,2.537,6.991
5,強壓,1,1,1.000,4.105,4.105
6,對手,25,21,2.398,2.783,6.673
7,的,8234,1369,4.916,0.969,4.762
8,蘋果,1438,616,4.158,1.315,5.470
9,代工,71,60,2.851,2.327,6.634


* 先存為Excel檔，方便未來存取

In [41]:
# Up_gram.to_excel(r"monpa上漲字串.xlsx",index = False,header = True,encoding="utf-8")
# Down_gram.to_excel(r"monpa下跌字串.xlsx",index = False,header = True,encoding="utf-8")

In [8]:
Up_gram = pd.read_excel("monpa上漲字串.xlsx")
Down_gram = pd.read_excel("monpa下跌字串.xlsx")

### Step3. 刪除頻繁重複出現在上漲與下跌的字詞

* 為了加速計算，我們先設一個DF > 2的門檻，將出現次數極低的字詞先刪除

In [9]:
Up_gram = Up_gram[Up_gram["DF"] > 2].reset_index(drop = True)
Down_gram = Down_gram[Down_gram["DF"] > 2].reset_index(drop = True)

* 利用inner merge，找出重複字再分別從兩個字詞集刪除

In [10]:
Repeat = pd.merge(Up_gram, Down_gram, on = ["Term"], how = "inner")
Repeat

Unnamed: 0,Term,TF_x,DF_x,Wtf_x,IDFt_x,TF-IDF_x,TF_y,DF_y,Wtf_y,IDFt_y,TF-IDF_y
0,蘋果,1665,712,4.221,1.253,5.287,1438,616,4.158,1.315,5.470
1,銷售,321,229,3.507,1.745,6.120,284,208,3.453,1.787,6.170
2,無力,79,75,2.898,2.230,6.462,72,61,2.857,2.320,6.627
3,加上,1026,663,4.011,1.283,5.148,934,609,3.970,1.320,5.242
4,大陸,243,181,3.386,1.847,6.255,220,146,3.342,1.941,6.486
5,在內,18,17,2.255,2.875,6.482,13,11,2.114,3.064,6.476
6,的,9495,1555,4.977,0.913,4.545,8234,1369,4.916,0.969,4.762
7,其餘,133,119,3.124,2.029,6.340,123,107,3.090,2.076,6.414
8,品牌,64,57,2.806,2.349,6.592,82,56,2.914,2.357,6.868
9,業績,522,343,3.718,1.570,5.836,396,252,3.598,1.704,6.130


In [11]:
for i in range(len(Up_gram)):
    if Up_gram["Term"][i] in Repeat["Term"].values:
        Up_gram.drop([i], inplace = True)
Up_gram = Up_gram.sort_values(by = "TF-IDF", ascending = False)
Up_gram= Up_gram.reset_index(drop=True)
Up_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,永昌,19,3,2.279,3.628,8.268
1,盛購,16,3,2.204,3.628,7.996
2,谷月涵,27,8,2.431,3.202,7.784
3,分割,18,5,2.255,3.406,7.681
4,平均值,23,11,2.362,3.064,7.236
5,治理,11,4,2.041,3.503,7.149
6,魏永祥,14,6,2.146,3.327,7.139
7,複利,9,3,1.954,3.628,7.089
8,周俊宏,9,3,1.954,3.628,7.089
9,投報率,9,3,1.954,3.628,7.089


In [12]:
for i in range(len(Down_gram)):
    if Down_gram["Term"][i] in Repeat["Term"].values:
        Down_gram.drop([i], inplace = True)
Down_gram = Down_gram.sort_values(by = "TF-IDF", ascending = False)
Down_gram = Down_gram.reset_index(drop = True)
Down_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,富購,40,5,2.602,3.406,8.862
1,坪,33,6,2.519,3.327,8.380
2,特選,22,4,2.342,3.503,8.204
3,商圈,16,3,2.204,3.628,7.996
4,租金,24,7,2.380,3.260,7.759
5,士電,12,3,2.079,3.628,7.542
6,辦公,11,3,2.041,3.628,7.404
7,東訊,11,4,2.041,3.503,7.149
8,面積,15,7,2.176,3.260,7.094
9,工業區,15,7,2.176,3.260,7.094


* 現在我們確定了出現在上漲以及下跌字詞集裡的字了，分別有2237與1461個，我們將他們合併為一個叫做Words的List

In [13]:
# LPC = pd.read_excel("大立光新聞.xlsx")
Up_word = Up_gram["Term"][:].values.tolist()
Down_word = Down_gram["Term"][:].values.tolist()
word = Up_word + Down_word

In [14]:
print("Model Dimension:", len(word))

Model Dimension: 3698


### Step 4. 篩選出大立光的訓練與測試集文章

In [15]:
# 找有股價漲跌達3%以上的前兩日日期
Up_publish_date = []
Down_publish_date = []

for i in range(len(Up_date)):
    for j in range(n):
        Up_publish_date.append(Up_date[i] - pd.Timedelta(days = j+1))
for i in range(len(Down_date)):
    for j in range(n):
        Down_publish_date.append(Down_date[i] - pd.Timedelta(days = j+1))

Up_publish_date = list(set(Up_publish_date))
Down_publish_date = list(set(Down_publish_date))

LPC = articles[(articles["content"].str.contains("大立光") | articles["title"].str.contains("大立光"))]  # 所有大立光的文章
LPC = LPC[LPC["post_time"].isin(Up_publish_date + Down_publish_date)].reset_index(drop = True)  # 所有大立光出現在有2%漲幅前兩日的文章
LPC_stock = stockprice[stockprice["年月日"].isin(list(set(LPC["post_time"].tolist())))]

# # # 把股價資料集和新聞資料集做合併，刪除不需要的columns
LPC_stock.rename(columns={"年月日":"post_time"}, inplace = True)
LPC = pd.merge(LPC, LPC_stock, on = ["post_time"], how = "inner")
LPC = LPC.dropna().reset_index(drop = True)
LPC = LPC.iloc[:, [0,1,2,14]]

# # # 將股價波動轉為dummy，只取波動3％以上，其餘刪除
LPC.loc[LPC["波動(漲跌幅)"] >= 0.03, "波動(漲跌幅)"] = 1
LPC.loc[LPC["波動(漲跌幅)"] <= -0.03, "波動(漲跌幅)"] = 0
LPC = LPC[LPC["波動(漲跌幅)"].isin([0,1])]
LPC = LPC.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [16]:
LPC

Unnamed: 0,post_time,title,content,波動(漲跌幅)
0,2016-01-06,《各報要聞》蘋果不甜，大立光12月營收掉3成,蘋果iPhone 6s/6s Plus銷售無力，加上大陸在內的其餘品牌業績乏善可陳，全球手機...,0.0
1,2016-01-06,台股盤前－兩岸貨貿+陸客中轉選前釋利多 可望收復8100,台股昨(5)日跌破8100點關卡，下跌39點，指數收在8075點，成交值802億元。歐美股市...,0.0
2,2016-01-06,《外資》台股電子股，麥格理喊進6檔,麥格理資本證券昨（5）日發表2016年大中華科技產業研究報告，認為產品轉型與新產品週期將重塑...,0.0
3,2016-01-06,《今日焦點新聞》央行穩匯市，掛三大保證,時報-今日焦點新聞<BR> 國內頭條：<BR> 1.蘋果不甜，大立光12月營收掉3成。(工商...,0.0
4,2016-01-06,【Ｙ早報】央行穩匯市 掛三大保證,（開盤日9:00出刊）美股止跌，道瓊小漲9點；選前政策放利多，陸客來台中轉放行，航空、免稅店...,0.0
5,2016-01-06,大立光開盤重挫200元 2000元關卡失守,（中央社記者韓婷婷台北2016年1月6日電）蘋果iPhone6s/plus銷售不如預期，加上...,0.0
6,2016-01-06,焦點類股：大立光(3008)拖累，PCB蘋概股走跌創波段新低,受到12月業績表現低於預期，股王大立光(3008)開盤摜破2000元大關，蘋果光環褪色拖累相...,0.0
7,2016-01-06,《台北股市》盤中焦點股：盛達、東森、晶電、華晶科、大立光,1.台星科 (3265) ：外資、投信兩日賣超，股價回跌逾2%。<BR> 2.F-訊芯 (6...,0.0
8,2016-01-06,《盤中解析》股王遇難，8000點告急,美股回穩，但股王大立光 (3008) 開盤重挫，蘋果供應鏈遭空頭打壓，台股開低走低，最低80...,0.0
9,2016-01-06,《光電股》2000關失守，大立光跌停創14月新低,大立光電 (3008) 公布去年12月合併營收為39.79億元，較11月衰退30%，由於今年...,0.0


* 接著我們將這1068篇的文章，分別以字串方式切字斷句

In [17]:
news = LPC["content"].tolist()
news_split = []
for i in range(len(news)):
    news1 = LongCut(news[i])
    str1 = " ".join(news1)
    news_split.append(str1)
news_split

['蘋果 iPhone 6 s/6s Plus 銷售 無力 ， 加上 大陸 在內 的 其餘 品牌 業績 乏善可陳 ， 全球 手機 鏡頭 霸主 大立光 ( 3008 ) 去年 12月 營收 月 減 3成 ， 迥異 過去 3 年 單月 營收 都 在 12月 登峰 之 慣例 ， 等於 提前 一 個 月 進入 淡季 ， 並 拖累 大立光 去年 第四季 營收 ， 分別 較 上一季 和 前年 同 期 衰退 3.9 ％ 和 8.5 ％ ， 是 近 年 首 見 。 < BR > 元月 營收 預期 持續 下滑 < BR > 大立光 評估 ， 元月 營收 預期 還 將 持續 下滑 ， 加上 2月 春節 長假 效應 ， 整 個 第一 季 營運 無 太 大 亮點 。 至於 整體 展望 ， 預計 元月 中旬 舉行 的 季度法 說會 進一步 說明 。 < BR > 蘋果 旗下 iPhone 產品線 佔 大立光 總 營收 約 65 - 70 ％ ， 由於 iPhone 6 s/6s Plus 創新 功能 不 多 ， 加上 全球 經濟 景氣 出現 劇烈 波動 ， 儘管 蘋果 祭出 「 年年 換 新機 」 的 促銷 ， 且 執行長 庫克 還 大力 在 中國大陸 市場 宣傳 ， 但 終究 難挽 頹勢 。 由於 各 家 研調機構 對 蘋果 iPhone 6 s/6s Plus 數字 都 不 樂觀 ， 導致 蘋果 不僅 自家 股票 價格 直直落 ， 更 引發 供應鏈 股價 重挫 。 < BR > 股價 波段 跌幅 達 43.2 ％ < BR > 以 大立光 來 說 ， 昨 （ 5 ） 日 又 下跌 2.99 ％ 、 以 2,110 元 做收 ， 逼近 前波 創下 的 2,045 元 低點 。 若 從 去年 7月 初 3,715 元 歷史 高峰 回跌 計算 ， 波段 跌幅 高達 43.2 ％ ， 是 台股 去年 下半年 跌幅 最 慘 的 權值股 之 一 。 < BR > 12月 本 是 年度 營收 高峰 < BR > 大立光 在 2012 - 2014年 三 年 之間 ， 都 是 在 12月 登上 當 年度 營收 高峰 ， 同時 改寫 單月 歷史 新高 ， 直到 隔年 元月 才 出現 衰退 ， 但 此 波 卻 一 反常態 ， 先 是 11月 就 出現 月 減 1 ％ ， 然後 12月 更 大幅 衰退 30.

* 切詞部分需要跑大約10分多鐘，所以我把檔案存成"大立光訓練與測試集文章.txt"，之後要跑直接open file就不用重新切字

In [18]:
# with open('大立光訓練與測試集文章.txt', 'w') as f:
#     for item in news_split:
#         f.write("%s\n" % item)

In [21]:
with open("大立光訓練與測試集文章.txt", "r", encoding = "utf-8") as text:
    news_split = text.readlines()
text.close()

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary = word)
vectorizer.fit_transform(word)
tf = vectorizer.transform(news_split)
vec = pd.DataFrame(tf.toarray(), columns = vectorizer.get_feature_names()) 
vec.head()

Unnamed: 0,永昌,盛購,谷月涵,分割,平均值,治理,魏永祥,複利,周俊宏,投報率,...,登峰,江,橫跨,豐達科,捲入,一改,主板,揪伴,登陸,慘兮兮
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* 我們先從完全不降維開始做

## ＊第二題：將前述兩批文章作為訓練資料及測試資料，使用監督式學習之分類演算法，評估分類模型之準確率

* 我們要用來分析的大立光漲跌文章為以下1755篇與大立光相關的文章：

In [66]:
LPC

Unnamed: 0,post_time,title,content,波動(漲跌幅)
0,2016-01-05,大立光跌跌不休 淡季陰影揮不去,（中央社記者韓婷婷台北2016年1月5日電）蘋果iPhone6的光環漸退，大立光 (3008...,0.0
1,2016-01-05,盤中分析：國際盤衝擊，台股驚魂未定，8100點震盪，僅題材股表現,全球股市2016年紅盤踢鐵板，美國道瓊大跌276.09點，中國股市更上演熔斷機制，滬深指數重...,0.0
2,2016-01-05,【台股盤中】台股止跌 驚魂未定反彈不足,（中央社記者韋樞台北2016年1月5日電）國際股市大跌，亞股率先翻紅，台股開低失守8100點...,0.0
3,2016-01-05,台股盤中－TRF風暴 止穩之路遭金融股扯後腿 8100苦戰,美股昨天重挫但留下200多點下影線，台股今(5)日開低，最低摔到8072點，一度跌破8100...,0.0
4,2016-01-05,【台股盤後】台股信心不足 8100點失守,（中央社記者韋樞台北2016年1月5日電）台股受國際股市拖累，今天延續4日的跌勢，盤中雖一度...,0.0
5,2016-01-05,《盤後解析》空壓罩頂，退守8000,國際股市續挫，國內選舉干擾、經濟不振，台股今日仍開低收低，8100城池再告失。台股大盤今早跟...,0.0
6,2016-01-05,《集中市場》驚魂未定，三大法人同步賣超47.92億元,歐美股市昨夜持續重挫，亞股早盤率先回穩，人行傳出注資1300億人幣企穩股匯市，台股開低後一度...,0.0
7,2016-01-05,歷年罕見 大立光12月營收年減31%,（中央社記者韓婷婷台北2016年1月5日電）大立光電 (3008) 公布，去年12月合併營收...,0.0
8,2016-01-05,《業績-光電》大立光上月營收月減30%，1月估續跌,大立光電 (3008) 自結去年12月合併營收為39.79億元，較11月衰退30%，大立光表...,0.0
9,2016-01-05,蘋果走味！大立光12月營收39.79億元月減30.28%,鉅亨網記者張欽發 台北台北股市股王大立光 (3008) 今天收盤股價爆量重挫2.99%，收盤...,0.0


* 針對文字矩陣，我們試用Variance Threshold刪除掉變異數過小的特徵值，降低維度

### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty = "l2", solver = "lbfgs", max_iter = 100, random_state = 101)
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

Accuracy： 0.705607476635514
Recall： 0.2318840579710145
Precision： 0.6153846153846154


array([[135,  10],
       [ 53,  16]])

### KNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    188
1.0     26
Name: 0, dtype: int64
Accuracy： 0.6214953271028038
Recall： 0.10144927536231885
Precision： 0.2692307692307692


array([[126,  19],
       [ 62,   7]])

### SVM

In [29]:
from sklearn.svm import SVC
svc = SVC(C = 1, kernel = "rbf", gamma = "scale", random_state = 101)
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    210
1.0      4
Name: 0, dtype: int64
Accuracy： 0.677570093457944
Recall： 0.028985507246376812
Precision： 0.5


array([[143,   2],
       [ 67,   2]])

### Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = "gini", random_state = 101)
dtc.fit(X_train, Y_train)
y_pred = dtc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    168
1.0     46
Name: 0, dtype: int64
Accuracy： 0.6682242990654206
Recall： 0.3188405797101449
Precision： 0.4782608695652174


array([[121,  24],
       [ 47,  22]])

### Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100, criterion = "gini", random_state = 101)
rfc.fit(X_train, Y_train)
y_pred = rfc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    170
1.0     44
Name: 0, dtype: int64
Accuracy： 0.705607476635514
Recall： 0.36231884057971014
Precision： 0.5681818181818182


array([[126,  19],
       [ 44,  25]])

* 在沒有降維的情況下，準確度最高的是Logistic和Random Foreset，準確度約7成，接著我們嘗試用Variance Threshold做feature selection降低維度，通過對dataframe做describe可以找出變異數的最大與最小值落在(0.046, 0)，所以我們會針對這個區間內的值做微調，找出最佳維度

In [52]:
var_max = max((vec.describe().iloc[2, :]) ** 2)
var_min = min((vec.describe().iloc[2, :]) ** 2)

In [54]:
np.arange(var_min, var_max, 0.002)

array([0.   , 0.002, 0.004, 0.006, 0.008, 0.01 , 0.012, 0.014, 0.016,
       0.018, 0.02 , 0.022, 0.024, 0.026, 0.028, 0.03 , 0.032, 0.034,
       0.036, 0.038, 0.04 , 0.042, 0.044, 0.046])

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(penalty = "l2", solver = "lbfgs", max_iter = 100, random_state = 101)
knn = KNeighborsClassifier(n_neighbors = 10)
svc = SVC(C = 1, kernel = "rbf", gamma = "scale", random_state = 101)
dtc = DecisionTreeClassifier(criterion = "gini", random_state = 101)
rfc = RandomForestClassifier(n_estimators = 100, criterion = "gini", random_state = 101)

def ModelAccuracy(X_train, X_test, Y_train, Y_test):
    lr.fit(X_train, Y_train)
    knn.fit(X_train, Y_train)
    svc.fit(X_train, Y_train)
    dtc.fit(X_train, Y_train)
    rfc.fit(X_train, Y_train)
    
    LR_acc = accuracy_score(Y_test, lr.predict(X_test))
    KNN_acc = accuracy_score(Y_test, knn.predict(X_test))
    SVC_acc = accuracy_score(Y_test, svc.predict(X_test))
    DTC_acc = accuracy_score(Y_test, dtc.predict(X_test))
    RFC_acc = accuracy_score(Y_test, rfc.predict(X_test))         
    
    return LR_acc, KNN_acc, SVC_acc, DTC_acc, RFC_acc

In [67]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

LR_Best_Dimension = 3000
LR_Best_Score = 0
KNN_Best_Dimension = 3000
KNN_Best_Score = 0
SVC_Best_Dimension = 3000
SVC_Best_Score = 0
DTC_Best_Dimension = 3000
DTC_Best_Score = 0
RFC_Best_Dimension = 3000
RFC_Best_Score = 0

for i in np.arange(var_min, var_max, 0.002):
    vt = VarianceThreshold(threshold = i)
    vt_vec = vt.fit_transform(vec)
    
    X = vt_vec
    Y = LPC["波動(漲跌幅)"]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
    
    LR_acc, KNN_acc, SVC_acc, DTC_acc, RFC_acc = ModelAccuracy(X_train, X_test, Y_train, Y_test)
    if  LR_acc >= LR_Best_Score:
        LR_Best_Score = LR_acc
        LR_Best_Dimension = vt_vec.shape[1]
    if  KNN_acc >= KNN_Best_Score:
        KNN_Best_Score = KNN_acc
        KNN_Best_Dimension = vt_vec.shape[1]
    if  SVC_acc >= SVC_Best_Score:
        SVC_Best_Score = SVC_acc
        SVC_Best_Dimension = vt_vec.shape[1]
    if  DTC_acc >= DTC_Best_Score:
        DTC_Best_Score = DTC_acc
        DTC_Best_Dimension = vt_vec.shape[1]
    if  RFC_acc >= RFC_Best_Score:
        RFC_Best_Score = RFC_acc
        RFC_Best_Dimension = vt_vec.shape[1]
        
print("Logistic Regression Dimension:", LR_Best_Dimension, "Accuracy:", LR_Best_Score)
print("KNN Dimension:", KNN_Best_Dimension, "Accuracy:", KNN_Best_Score)
print("SVM Dimension:", SVC_Best_Dimension, "Accuracy:", SVC_Best_Score)
print("Decision Tree Dimension:", DTC_Best_Dimension, "Accuracy:", DTC_Best_Score)
print("Random Forest: Dimension =", RFC_Best_Dimension, ", Accuracy =", RFC_Best_Score)

Logistic Regression Dimension: 1995 Accuracy: 0.705607476635514
KNN Dimension: 53 Accuracy: 0.6869158878504673
SVM Dimension: 89 Accuracy: 0.6822429906542056
Decision Tree Dimension: 140 Accuracy: 0.6915887850467289
Random Forest: Dimension = 1995 , Accuracy = 0.7102803738317757


* 結論來看，Random Forest還是有最佳的準確度，且相對於其他模型能夠使用更高維度的資料（因為Random Forest本身就有降維的作用，所以這樣的結果也算合理）