## 中文文本分类


In [1]:
import os
import jieba

# 保存至文件
def savefile(savepath, content):
    fp = open(savepath, "w", encoding="utf-8")
    fp.write(content)

# 读取文件
def readfile(path):
    fp = open(path, "r", encoding="utf-8")
    content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    # 获取corpus_path下的所有子目录
    catelist = os.listdir(corpus_path) 
    # 获取每个目录（类别）下所有的文件
    for mydir in catelist:
        # 拼出分类子目录的路径如：train_corpus/art/
        class_path = corpus_path + mydir + "/"  
        # 拼出分词后存贮的对应目录路径如：train_corpus_seg/art/
        seg_dir = seg_path + mydir + "/"  
 
        if not os.path.exists(seg_dir):  
            # 是否存在分词目录，如果没有则创建该目录
            os.makedirs(seg_dir)
 
        # 获取未分词语料库中某一类别中的所有文本
        file_list = os.listdir(class_path)  
        for file_path in file_list:  
            # 遍历类别目录下的所有文件
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace("[\n0-9]", "")
            content = content.replace(" ", "")
            content_seg = jieba.cut(content)
            savefile(seg_dir + file_path, " ".join(content_seg))
    print ("中文语料分词结束！！！")
 
if __name__=="__main__":
    #对训练集进行分词
    # 未分词分类语料库路径
    corpus_path = "./train_corpus/"  
    # 分词后分类语料库路径
    seg_path = "./train_corpus_seg/"  
    corpus_segment(corpus_path,seg_path)

    #对测试集进行分词
    # 未分词分类语料库路径
    corpus_path = "./test_corpus/"  
    # 分词后分类语料库路径
    seg_path = "./test_corpus_seg/"  
    corpus_segment(corpus_path,seg_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.905 seconds.
Prefix dict has been built successfully.


中文语料分词结束！！！
中文语料分词结束！！！


转化为Bunch类型

In [2]:
import os
import pickle

from sklearn.utils import Bunch
 
def _readfile(path):
    with open(path, "r", encoding="utf8") as fp:
        #with as句法前面的代码已经多次介绍过，今后不再注释
        content = fp.read()
    return content
 
def corpus2Bunch(wordbag_path,seg_path):
    # 获取seg_path下的所有子目录，也就是分类信息
    catelist = os.listdir(seg_path)
    #创建一个Bunch实例
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    # 获取每个目录下所有的文件
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  
        file_list = os.listdir(class_path)  
        for file_path in file_list:  
            fullname = class_path + file_path  
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname)) 
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("构建文本对象结束！！！")
 
if __name__ == "__main__":
    #对训练集进行Bunch化操作：
    wordbag_path = "train_word_bag/train_set.dat"
    seg_path = "train_corpus_seg/" 
    corpus2Bunch(wordbag_path, seg_path)
 
    # 对测试集进行Bunch化操作：
    wordbag_path = "test_word_bag/test_set.dat"
    seg_path = "test_corpus_seg/" 
    corpus2Bunch(wordbag_path, seg_path)

构建文本对象结束！！！
构建文本对象结束！！！


## 权重策略--TF-IDF


In [3]:
# 引入Bunch类
from sklearn.utils import Bunch
import pickle#之前已经说过，不再赘述
from sklearn.feature_extraction.text import TfidfVectorizer
 
# 读取文件
def _readfile(path):
    with open(path, "r", encoding="utf-8") as fp:
        content = fp.read()
    return content
 
# 读取bunch对象
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
 
# 写入bunch对象
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
 
#这个函数用于创建TF-IDF词向量空间
def vector_space(stopword_path,bunch_path,space_path):
 
    # 读取停用词
    stpwrdlst = _readfile(stopword_path).splitlines()
    #导入分词后的词向量bunch对象
    bunch = _readbunchobj(bunch_path)

    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    
    #此时tdm里面存储的就是if-idf权值矩阵
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
 
    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功")
 
if __name__ == '__main__':
    # 停用词表的路径
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    # 导入训练集Bunch的路径
    bunch_path = "train_word_bag/train_set.dat"  
    # 词向量空间保存路径
    space_path = "train_word_bag/tfdifspace.dat"  
    vector_space(stopword_path,bunch_path,space_path)

if-idf词向量空间实例创建成功




## 分类器

In [4]:
# 引入Bunch类
from sklearn.utils import Bunch
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
 
def _readfile(path):
    with open(path, "r", encoding="utf-8") as fp:
        content = fp.read()
    return content
 
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
 
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
 
def vector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
 
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
 
    #导入训练集的TF-IDF词向量空间
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
 
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,vocabulary=trainbunch.vocabulary)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print ("if-idf词向量空间实例创建成功！！！")
 
if __name__ == '__main__':
    # 停用词表的路径
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    # 词向量空间保存路径
    bunch_path = "test_word_bag/test_set.dat"   
    # TF-IDF词向量空间保存路径
    space_path = "test_word_bag/testspace.dat"   
    train_tfidf_path="train_word_bag/tfdifspace.dat"
    vector_space(stopword_path,bunch_path,space_path,train_tfidf_path)

if-idf词向量空间实例创建成功！！！


In [5]:
from sklearn.utils import Bunch
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
 
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content
 
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
 
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
 
def vector_space(stopword_path,bunch_path,space_path,train_tfidf_path=None):
 
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
 
    if train_tfidf_path is not None:
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
 
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
 
    _writebunchobj(space_path, tfidfspace)
    print ("if-idf词向量空间实例创建成功！！！")
 
if __name__ == '__main__':
 
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    bunch_path = "train_word_bag/train_set.dat"
    space_path = "train_word_bag/tfdifspace.dat"
    vector_space(stopword_path,bunch_path,space_path)
 
    bunch_path = "test_word_bag/test_set.dat"
    space_path = "test_word_bag/testspace.dat"
    train_tfidf_path="train_word_bag/tfdifspace.dat"
    vector_space(stopword_path,bunch_path,space_path,train_tfidf_path)

if-idf词向量空间实例创建成功！！！
if-idf词向量空间实例创建成功！！！


In [6]:
import pickle
from sklearn.naive_bayes import MultinomialNB  # 导入多项式贝叶斯算法

# 读取bunch对象
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# 导入训练集
trainpath = "train_word_bag/tfdifspace.dat"
train_set = _readbunchobj(trainpath)
 
# 导入测试集
testpath = "test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# 训练分类器：输入词袋向量和分类标签，alpha:0.001 alpha越小，迭代次数越多，精度越高
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

# 预测分类结果
predicted = clf.predict(test_set.tdm)

for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):
    if flabel != expct_cate:
        print (file_name,": 实际类别:",flabel," -->预测类别:",expct_cate)
 
print ("预测完毕!!!")

# 计算分类精度：
from sklearn import metrics
def metrics_result(actual, predict):
    print ('精度:{0:.3f}'.format(metrics.precision_score(actual, predict,average='weighted')))
    print ('召回:{0:0.3f}'.format(metrics.recall_score(actual, predict,average='weighted')))
    print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual, predict,average='weighted')))
 
metrics_result(test_set.label, predicted)

test_corpus_seg/biography/2.txt : 实际类别: biography  -->预测类别: novels
test_corpus_seg/literature/1.txt : 实际类别: literature  -->预测类别: novels
test_corpus_seg/success/1.txt : 实际类别: success  -->预测类别: psychology
test_corpus_seg/success/2.txt : 实际类别: success  -->预测类别: child
预测完毕!!!
精度:0.696
召回:0.714
f1-score:0.660


  _warn_prf(average, modifier, msg_start, len(result))
