In [None]:
import glob
import os
import pandas as pd
import jieba

# Load the held-out test corpus into a DataFrame.
# The glob expects one sub-directory per category under chinese_news_test/,
# so each file's parent directory name is used as its label.
fs = glob.glob("chinese_news_test/*/*[tT][xX][tT]")
cats = []
contents = []
for fname in fs:
    with open(fname, "r", encoding = "utf8") as f:
        contents.append(f.read())
    # Parent directory name of the file = category label.
    cats.append(os.path.basename(os.path.dirname(fname)))

test_df = pd.DataFrame({"content": contents, "ans": cats})

test_df

In [None]:
# Load the training corpus into a DataFrame.
# The glob expects one sub-directory per category under chinese_news_trans/,
# so each file's parent directory name is used as its label.
fs = glob.glob("chinese_news_trans/*/*[tT][xX][tT]")
cats = []
contents = []
for fname in fs:
    with open(fname, "r", encoding = "utf8") as f:
        contents.append(f.read())
    # Parent directory name of the file = category label.
    cats.append(os.path.basename(os.path.dirname(fname)))

train_df = pd.DataFrame({"content": contents, "ans": cats})

train_df

In [None]:
# Make jieba load its word dictionary from the file dict.txt.big (path is
# relative to the working directory) instead of its bundled default.
jieba.set_dictionary('dict.txt.big')
def cut(s):
    """Segment a document with jieba and return its tokens joined by spaces.

    Args:
        s: Raw document text.

    Returns:
        One string containing the jieba tokens separated by single spaces,
        which is the whitespace-delimited form CountVectorizer can split.
    """
    # Strip both carriage returns and line feeds before segmenting. The
    # previous version removed "\r" twice and never removed "\n".
    s = s.replace("\r", "").replace("\n", "")
    return " ".join(jieba.cut(s))

# Tokenise every document: Series.map calls cut() once per row of text.
x_train = train_df["content"].map(cut)
x_test = test_df["content"].map(cut)

In [None]:
# Give each category seen in the training set a consecutive integer id
# (in order of first appearance) and keep the inverse lookup for display.
u = train_df["ans"].unique()
trans = {label:i for i, label in enumerate(u)}
# Example shape of the mapping (the actual ids depend on the order in which
# the training files were read):
#     {"交通": 0, "政治": 1, "計算機": 2, ...}
reverse_trans = {v:k for k,v in trans.items()}

# Series.map produces the integer ids directly. Series.replace(dict) only
# yields integers through silent downcasting, which pandas 2.2 deprecates;
# without it the targets stay object dtype, which is not a valid target for
# scikit-learn classifiers.
y_train = train_df["ans"].map(trans)
y_test = test_df["ans"].map(trans)

# map() turns categories that never occurred in training into NaN, so fail
# here with a clear message instead of deep inside the metric functions.
unseen = sorted(set(test_df["ans"]) - set(trans))
if unseen:
    raise ValueError("test set contains categories missing from the training set: " + ", ".join(unseen))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Bag-of-words features: plain term counts are used rather than TF-IDF weights.
# The original rationale is that Naive Bayes already discounts uninformative
# terms on its own, so TfidfVectorizer is not needed.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character Chinese words produced by
# jieba are dropped - confirm this is intended.
vec = CountVectorizer()
# Learn the vocabulary from the training documents and count their terms.
x_train_vec = vec.fit_transform(x_train)
# Count the test documents against that same vocabulary (no re-fitting).
x_test_vec = vec.transform(x_test)
# To inspect the learned token -> column-index mapping, evaluate:
# vec.vocabulary_

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Choice of Naive Bayes variant by feature type:
# MultinomialNB: feature values are discrete counts (used here, fed by CountVectorizer).
# GaussianNB: feature values are continuous.
# BernoulliNB: feature values are binary.
# alpha = 1 is the additive smoothing strength passed to MultinomialNB.
clf = MultinomialNB(alpha = 1)
clf.fit(x_train_vec, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict a class id for every test document, then show the fraction that
# matches the true labels.
pre = clf.predict(x_test_vec)
accuracy_score(y_test, pre)

In [None]:
from sklearn.metrics import confusion_matrix
# Derive the row/column labels from the label encoding itself. A hard-coded
# category list only lines up with the matrix if it happens to match the ids
# stored in trans, which depend on the order the training files were read.
label_ids = sorted(reverse_trans)
# Passing labels= fixes both the order and the size of the matrix, so classes
# that are absent from y_test and pre still get a (zero) row and column.
mat = confusion_matrix(y_test, pre, labels = label_ids)
ls = [reverse_trans[k] for k in label_ids]
i = [n + "(真實)" for n in ls]
c = [n + "(預測)" for n in ls]
pd.DataFrame(mat,
            columns = c,
            index = i)

In [None]:
# Keep prompting for news articles until the sentinel "0" is entered; the
# sentinel itself is not stored.
p_list = list(iter(lambda: input("請輸入新聞："), "0"))

# Segment each article the same way as the training data, then count its terms
# with the already-fitted vocabulary. The result is kept as the sparse matrix
# returned by transform (no .toarray()), which is what clf.predict is given.
p = vec.transform([cut(article) for article in p_list])

In [None]:
# Classify the user-supplied articles and print each predicted category name
# between two separator lines.
pre = clf.predict(p)
banner = "*" * 30
for label_id in pre:
    print(banner)
    print("這個新聞是：", reverse_trans[label_id])
    print(banner)

In [None]:
# Per-class probabilities for every user-supplied article.
proba_list = clf.predict_proba(p)

# For each article (numbered from 1) list the categories from most to least
# probable. zip pairs the k-th key of trans with the k-th probability column;
# NOTE(review): this assumes trans' insertion order matches clf.classes_ - confirm.
for idx, probs in enumerate(proba_list, start = 1):
    ranked = sorted(zip(trans, probs), key = lambda pair: pair[1], reverse = True)
    for l, pn in ranked:
        print("第",idx,"則新聞的", "類別：",l, "，機率：", pn)
    print("*" * 30)