In [1]:
import nltk
from nltk.metrics import  BigramAssocMeasures
from nltk.probability import  FreqDist,ConditionalFreqDist
import pandas as pd
import jieba
import re
import sklearn
from nltk.classify.scikitlearn import  SklearnClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import  MultinomialNB, BernoulliNB
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import  accuracy_score
from random import shuffle

In [2]:
# 利用新聞內容,做新聞分類屬性預測

# 先斷詞 -- 先用Jieba繁體字庫

df = pd.read_csv('freedom.csv',encoding="cp950")  #讀取檔案 CSV須加編碼

world = df[df["版別"] == "world"]['文章內容']
politics = df[df["版別"] == "politics"]['文章內容']

jieba.set_dictionary('dict.txt.big.txt')   #設定結巴的繁體字典 (中研院)
jieba.load_userdict("userdict.txt")  #可以補充字典

#目標型態是[['','',''],['','',''],['','','']...]

wor = []
for article in world:
    wor_2 = []
    newcontent = re.sub("〔.*?〕|（.*?）",'',str(article))       #*?獲取最短滿足條件 > 這邊因為資料內容，把記者XXX拿掉
    newcontent_2 = re.sub("\d|[\s+\.\!\/_,$%^*(+\"\'《》「」]+|[+-—！，。？、~@#￥%……&*()（）:]+",'',newcontent) #拿掉停止詞或符號
    words = jieba.cut(newcontent_2,cut_all=False)
    for w in words:
        wor_2.append(w)
        wor.append(wor_2)
    
pol = []
for article in politics:
    pol_2 = []
    newcontent = re.sub("〔.*?〕|（.*?）",'',str(article))       
    newcontent_2 = re.sub("\d|[\s+\.\!\/_,$%^*(+\"\'《》「」]+|[+-—！，。？、~@#￥%……&*()（）:]+",'',newcontent)
    words = jieba.cut(newcontent_2,cut_all=False)
    for w in words:
        pol_2.append(w)    
        pol.append(pol_2)

Building prefix dict from /Users/adam/Jupyter/TextMining/dict.txt.big.txt ...
Loading model from cache /var/folders/sf/2gqbqkg57fxb0rf8m_kdk4z80000gn/T/jieba.u501f071ef6c2eb8ce82a25c715304e45.cache
Loading model cost 1.142 seconds.
Prefix dict has been built successfully.


In [3]:
# 計算斷完詞後的詞頻 -> 後續用來計算資訊量

word_fd = FreqDist() #統計詞頻
cond_word_fd = ConditionalFreqDist() #統計在條件下的詞頻

#預計結果會是dict {'詞':num,...}
for word2 in wor:
    for word in word2:
        word_fd[word] += 1
        cond_word_fd['wor'][word] += 1

for word2 in pol:
    for word in word2:
        word_fd[word] += 1
        cond_word_fd['pol'][word] += 1

wor_word_count = cond_word_fd['wor'].N() #國際新聞的詞量
pol_word_count = cond_word_fd['pol'].N() #政治新聞的詞量

total_word_count = wor_word_count + pol_word_count  #總詞量

In [4]:
word_scores={}

for word, freq in word_fd.items():   #分別計算在各詞在不同版別的訊息量 (這邊用卡方;也可以用其他的)
    wor_score = BigramAssocMeasures.chi_sq(cond_word_fd['wor'][word],  (freq, wor_word_count), total_word_count)
    pol_score = BigramAssocMeasures.chi_sq(cond_word_fd['pol'][word],  (freq, pol_word_count), total_word_count)
    word_scores[word] = wor_score + pol_score #各個詞總訊息量 (等於分別相加)

In [5]:

#按照訊息量排序(大->小) [只留訊息前幾大的詞]
best_vals = sorted(word_scores.items(), key=lambda x:x[1],  reverse=True)[:300]
# set 去除重複
best_words = set([w for w,s in best_vals])
ss=dict([(word, True) for word in best_words])

#產生訓練用樣本
worfeature = []
for items in wor:
    a = {}
    for item in items:
        if item in ss.keys():  #前幾大訊息量的詞語
            a[item]='True'
    worWords = [a,'wor'] # 型態:一篇新聞一個詞語的dict + 'wor'  [ [dict,'wor'],[dict,'wor']..... ]
    worfeature.append(worWords) 

polfeature = []
for items in pol:
    a = {}
    for item in items:
        if item in ss.keys():
            a[item]='True'
    polWords = [a,'pol'] 
    polfeature.append(polWords)


In [6]:
#隨機排序 樣本數須控制一樣
shuffle(polfeature)
shuffle(worfeature)
size = int(len(polfeature)*0.3)

train =  polfeature[size:]+worfeature[size:]
test = polfeature[:size]+worfeature[:size]

data,tag = zip(*test) #把x與y拆開

In [7]:
def score(ml):
    classifier = SklearnClassifier(ml) 
    classifier.train(train) #訓練
    pred = classifier.classify_many(data) #預測結果
    n = 0
    s = len(pred)
    for i in range(0,s):
        if pred[i]==tag[i]:
            n = n+1
    return n/s #準確度

In [0]:
# test 資料精準度 [分類正確率]  -- gridsearch還需轉換，這邊自己簡單試
print('--------naive_bayes-------')

for alpha in [0.1,1,10,100]: #測試alpha平滑參數
    print('樸素貝葉斯-伯努利      {alpha=%s}, accuracy is %f'  %(alpha,score(BernoulliNB(alpha=alpha))))
    print('樸素貝葉斯-Multinomial {alpha=%s}, accuracy is %f'  %(alpha,score(MultinomialNB(alpha=alpha))))

print('\n')
print('--------Logistic regression-------')

for solver in ['newton-cg','lbfgs','sag','liblinear']: #損失函數優化方式
    print('Logistic {solver=%s}, accuracy is  %f' %(solver,score(LogisticRegression(solver=solver))))

print('\n')
print('--------SVM-------')

for C in [0.001,0.1,1,10]: #懲罰項;調和誤差跟分錯 -> 太大容易過擬和 (default 1)
    for gamma in ['scale','auto',0.0001,0.001,0.1]: #gamma值大 資料影響的範圍越小 -> 可能過擬合 (default scale)
        print('SVM {懲罰項=%s, gamma=%s}, accuracy is %f'  %(C,gamma,score(SVC(C=C,gamma=gamma,random_state=666))))

--------naive_bayes-------
樸素貝葉斯-伯努利      {alpha=0.1}, accuracy is 0.982456
樸素貝葉斯-Multinomial {alpha=0.1}, accuracy is 0.991228
樸素貝葉斯-伯努利      {alpha=1}, accuracy is 0.982456
樸素貝葉斯-Multinomial {alpha=1}, accuracy is 0.986842
樸素貝葉斯-伯努利      {alpha=10}, accuracy is 0.960526
樸素貝葉斯-Multinomial {alpha=10}, accuracy is 0.986842
樸素貝葉斯-伯努利      {alpha=100}, accuracy is 0.820175
樸素貝葉斯-Multinomial {alpha=100}, accuracy is 0.973684


--------Logistic regression-------
Logistic {solver=newton-cg}, accuracy is  0.991228
Logistic {solver=lbfgs}, accuracy is  0.991228
Logistic {solver=sag}, accuracy is  0.991228
Logistic {solver=liblinear}, accuracy is  0.991228


--------SVM-------
SVM {懲罰項=0.001, gamma=scale}, accuracy is 0.929825




SVM {懲罰項=0.001, gamma=auto}, accuracy is 0.899123
SVM {懲罰項=0.001, gamma=0.0001}, accuracy is 0.899123
SVM {懲罰項=0.001, gamma=0.001}, accuracy is 0.899123
SVM {懲罰項=0.001, gamma=0.1}, accuracy is 0.767544
SVM {懲罰項=0.1, gamma=scale}, accuracy is 0.956140
SVM {懲罰項=0.1, gamma=auto}, accuracy is 0.899123
SVM {懲罰項=0.1, gamma=0.0001}, accuracy is 0.899123
SVM {懲罰項=0.1, gamma=0.001}, accuracy is 0.899123
SVM {懲罰項=0.1, gamma=0.1}, accuracy is 0.828947
SVM {懲罰項=1, gamma=scale}, accuracy is 0.995614
SVM {懲罰項=1, gamma=auto}, accuracy is 0.978070
SVM {懲罰項=1, gamma=0.0001}, accuracy is 0.899123
SVM {懲罰項=1, gamma=0.001}, accuracy is 0.960526
SVM {懲罰項=1, gamma=0.1}, accuracy is 0.982456
SVM {懲罰項=10, gamma=scale}, accuracy is 0.991228
SVM {懲罰項=10, gamma=auto}, accuracy is 0.991228
SVM {懲罰項=10, gamma=0.0001}, accuracy is 0.960526
SVM {懲罰項=10, gamma=0.001}, accuracy is 0.986842
SVM {懲罰項=10, gamma=0.1}, accuracy is 0.991228
