## 原代码

In [1]:
import pandas as pd
# Load the corpus spreadsheet; the worksheet is selected by name.
data = pd.read_excel(
    './data/复旦大学中文文本分类语料.xlsx',
    sheet_name='sheet1',
)

### 分词

In [2]:
import jieba
# Enable jieba's parallel segmentation mode (64 is the value passed to jieba).
jieba.enable_parallel(64)
# Segment every document in '正文' and store the tokens as one space-separated string.
data['文本分词'] = data['正文'].apply(lambda doc: ' '.join(jieba.cut(doc)))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built succesfully.


### 文本标签转为数字

In [3]:
from sklearn.preprocessing import LabelEncoder
# Encode the category names from the '分类' column as integer class ids.
lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(data['分类'].values)

### 划分训练集和测试集

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the segmented documents for validation, stratified on the labels.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data['文本分词'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

### TF-IDF提取文本特征

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
def number_normalizer(tokens):
    """Map every token that starts with a digit to one shared placeholder.

    Tokens beginning with a digit are rarely informative individually, but
    their presence can still be relevant. Collapsing all of them into a single
    symbol keeps that signal while reducing the number of distinct features.

    :param tokens: iterable of string tokens
    :return: generator yielding "#NUMBER" for tokens whose first character is
        a digit and the unchanged token otherwise
    """
    # token[:1] (instead of token[0]) lets an empty token pass through
    # unchanged rather than raising IndexError.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokenizer output is passed through number_normalizer."""

    def build_tokenizer(self):
        """Return a callable that tokenizes a document and normalizes its tokens."""
        base_tokenize = super().build_tokenizer()

        def tokenize_and_normalize(doc):
            # Materialize the generator from number_normalizer as a list.
            return list(number_normalizer(base_tokenize(doc)))

        return tokenize_and_normalize

# Load the stop-word list, one entry per line. The file is read inside a
# `with` block so the handle is closed as soon as the list has been built.
with open('data/停用词汇总.txt', 'r', encoding='utf-8') as stopword_file:
    stwlist = [line.strip() for line in stopword_file]

# TF-IDF over unigrams and bigrams; terms seen in fewer than 3 documents or in
# more than half of all documents are dropped.
tfv = NumberNormalizingVectorizer(min_df=3,
                                  max_df=0.5,
                                  max_features=None,
                                  ngram_range=(1, 2),
                                  use_idf=True,
                                  smooth_idf=True,
                                  stop_words=stwlist)

# Fit the vectorizer on the training and validation texts together (the author
# describes this as semi-supervised learning), then vectorize each split.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

  'stop_words.' % sorted(inconsistent))


In [52]:
# Dump the feature names to disk as the string form of the list.
# Built-in open() is used so this cell does not depend on `codecs`, which is
# only imported further down the notebook. write() replaces writelines(),
# which would iterate over the string one character at a time.
with open('output/tfv.txt', 'w', encoding='utf-8') as f:
    f.write(str(tfv.get_feature_names()))

In [6]:
# Display the fitted vocabulary (the output below shows a term -> integer mapping).
tfv.vocabulary_

{'文献号': 393545,
 '原文': 206991,
 '出处': 169668,
 '宣传': 286287,
 '导报': 291703,
 '原刊': 205690,
 '地名': 245503,
 '长春': 657668,
 '期号': 416678,
 'f13': 27683,
 '经济': 545375,
 '理论': 485965,
 '实践': 282935,
 '复印': 257442,
 '邓小平': 648579,
 '特征': 474073,
 '作者简介': 130262,
 '社会科学': 514302,
 '邓小平理论': 648801,
 '当代': 321988,
 '马克思主义': 679466,
 '组成部分': 544023,
 '政治经济学': 379251,
 '马克': 679250,
 '主义': 89038,
 '一脉相承': 66966,
 '时代特点': 403695,
 '特色': 475119,
 '纵观': 542837,
 '五个': 98260,
 '求实': 448425,
 '实际': 283668,
 '出发': 168787,
 '中国式': 84974,
 '道路': 647677,
 '实事求是': 280598,
 '马克思列宁主义': 679695,
 '毛泽东思想': 442501,
 '精髓': 537434,
 '指出': 357024,
 '二十年': 96725,
 '历史教训': 204705,
 '最重': 412517,
 '原则': 205738,
 '辩证唯物主义': 633158,
 '历史唯物主义': 204560,
 '毛泽东': 442286,
 '同志': 223389,
 '概括': 432949,
 '长期': 657682,
 '初级阶段': 179550,
 '初级': 179482,
 '阶段': 660590,
 '出发点': 169067,
 '根本任务': 429625,
 '生产力': 488357,
 '市场经济': 304371,
 '实行': 282073,
 '公有制': 148941,
 '主体': 89479,
 '多种': 260962,
 '所有制': 343835,
 '按劳分配': 358973,
 '分配'

In [7]:
# Display the feature names of the fitted vectorizer.
tfv.get_feature_names()

['#NUMBER a0',
 '#NUMBER a1',
 '#NUMBER a10',
 '#NUMBER a11',
 '#NUMBER a12',
 '#NUMBER a13',
 '#NUMBER a144',
 '#NUMBER a16',
 '#NUMBER a17',
 '#NUMBER a18',
 '#NUMBER a2',
 '#NUMBER a21',
 '#NUMBER a23',
 '#NUMBER a25',
 '#NUMBER a27',
 '#NUMBER a28',
 '#NUMBER a3',
 '#NUMBER a31',
 '#NUMBER a4',
 '#NUMBER a5',
 '#NUMBER a6',
 '#NUMBER a7',
 '#NUMBER a9',
 '#NUMBER aaai',
 '#NUMBER ab0',
 '#NUMBER ab1',
 '#NUMBER ab2',
 '#NUMBER ab3',
 '#NUMBER abbott',
 '#NUMBER abc4',
 '#NUMBER abel',
 '#NUMBER abs',
 '#NUMBER abstract',
 '#NUMBER abstracta',
 '#NUMBER abstractan',
 '#NUMBER abstractbased',
 '#NUMBER abstractcollision',
 '#NUMBER abstractin',
 '#NUMBER abstractinformation',
 '#NUMBER abstracts',
 '#NUMBER abstractthe',
 '#NUMBER abstractthis',
 '#NUMBER abstractusing',
 '#NUMBER abstractwith',
 '#NUMBER abu',
 '#NUMBER ac',
 '#NUMBER ac0',
 '#NUMBER academy',
 '#NUMBER accepted',
 '#NUMBER access',
 '#NUMBER acct',
 '#NUMBER acid',
 '#NUMBER acm',
 '#NUMBER ada',
 '#NUMBER adams',


## 模型

In [8]:
import numpy as np
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: array-like of true classes, either a 1-D sequence of
        integer class indices or a 2-D one-hot matrix of shape
        (n_samples, n_classes)
    :param predicted: array-like of shape (n_samples, n_classes) holding one
        predicted probability per class
    :param eps: probabilities are clipped to [eps, 1 - eps] before the log so
        that log(0) is never evaluated
    :return: mean negative log-likelihood over the samples
    """
    # Accept plain lists as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert class indices to a one-hot matrix if 'actual' is 1-D.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * log_likelihood

In [9]:
# Fit a simple logistic regression on the extracted TF-IDF features.

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# Report the multi-class log loss on the validation split.
valid_logloss = multiclass_logloss(yvalid, predictions)
print("logloss: %0.3f " % valid_logloss)

logloss: 0.607 




## 先分词的代码

### 导入数据

In [10]:
import codecs 

labels = []
text = []
with codecs.open('output/data_pro.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()

# Each line is expected to look like "<label>\t\ufeff<segmented text>".
for document in document_split:
    if not document.strip():
        # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
        continue
    # maxsplit=1 keeps the complete text even if the separator recurs inside it.
    # A line without the separator raises ValueError here.
    label, body = document.split('\t\ufeff', 1)
    labels.append(label)
    text.append(body.strip())

### 标签转换为数字

In [11]:
from sklearn.preprocessing import LabelEncoder
# Encode the label strings read from the file as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

### TF-IDF提取文本特征

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv1 = TfidfVectorizer()

# Learn the vocabulary and IDF weights on the full corpus and vectorize it in
# one pass. fit_transform avoids tokenizing every document twice, which
# fit() followed by transform() on the same texts would do.
# NOTE(review): this happens before the train/validation split, so the
# validation documents also contribute to the IDF statistics.
features = tfv1.fit_transform(text)

In [37]:
# Display the fitted vocabulary (the output below shows a term -> integer mapping).
tfv1.vocabulary_

{'文献号': 236536,
 '2432': 12199,
 '原文': 157593,
 '出处': 142888,
 '出版发行': 143126,
 '研究': 293727,
 '原刊': 157360,
 '地名': 176445,
 '期号': 247286,
 '199601': 8979,
 'z1': 95718,
 '出版': 143121,
 '图书': 175150,
 '评介': 331739,
 '作者': 127598,
 '王益': 282271,
 '复印': 180431,
 '199604': 9051,
 '标题': 255135,
 '美国': 310269,
 '出版社': 143134,
 '经营': 307043,
 '管理': 302434,
 '介绍': 119577,
 '艺术': 318775,
 '专业书籍': 104032,
 '概论': 257822,
 '好几本': 186739,
 '专讲': 104309,
 '并不多': 206161,
 'the': 87887,
 'art': 34558,
 'andscienceo': 33664,
 'book': 37591,
 'pblishing': 73871,
 '推荐': 231013,
 '一本': 97621,
 '赫伯特': 338588,
 '贝利': 335830,
 'herbert': 55756,
 'bailey': 35832,
 'jr': 60552,
 '大学': 182928,
 '文学系': 236326,
 '毕业': 262017,
 '1946': 8332,
 '普林': 243710,
 '斯顿': 237497,
 '1954': 8365,
 '出任': 142731,
 '社长': 295164,
 '直至': 290900,
 '1986': 8604,
 '退休': 346524,
 '1970': 8439,
 '1980': 8503,
 '再版': 139563,
 '199': 8682,
 '年三版': 205773,
 '地被': 176978,
 '课程': 334355,
 '教材': 234845,
 '工作人员': 201833,
 '选作': 347040,
 '参考

In [48]:
# Display the feature names of the fitted vectorizer.
tfv1.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000000',
 '00000000',
 '0000000000',
 '00000000000000',
 '00000000000433',
 '00000000158',
 '0000000158',
 '0000001',
 '00000011',
 '0000002',
 '000001',
 '0000011',
 '00000174',
 '000002',
 '00000219224981',
 '000003',
 '0000060',
 '0000063',
 '00000897',
 '0000090',
 '00001',
 '0000110',
 '00001311',
 '0000158',
 '000016',
 '000017',
 '0000171',
 '0000174',
 '000019',
 '00002',
 '0000250',
 '00003',
 '0000303',
 '00004',
 '0000420',
 '000042x3',
 '0000440',
 '0000444',
 '00005',
 '000058',
 '0000626',
 '000067',
 '00007',
 '000073',
 '00008',
 '00008883',
 '000099',
 '0000e881f551',
 '0000h',
 '0001',
 '00010',
 '000100100010010001001000',
 '0001032',
 '0001091',
 '0001095',
 '0001099',
 '00011',
 '0001111',
 '00012',
 '000125',
 '00013',
 '0001329',
 '000138',
 '00014',
 '000140',
 '000143',
 '000147',
 '000148',
 '0001489',
 '000149',
 '0001498',
 '00015',
 '000150',
 '0001513',
 '000158',
 '0001598',
 '00016',
 '0001603',
 '0001615',

In [49]:
# Dump the feature names to disk as the string form of the list.
# write() replaces writelines(), which would iterate over the string one
# character at a time.
with open('output/tfv1.txt', 'w', encoding='utf-8') as f:
    f.write(str(tfv1.get_feature_names()))

### 切分数据

In [50]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the TF-IDF rows for validation, stratified on the labels.
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

### 模型

In [51]:
# Fit a simple logistic regression on the extracted TF-IDF features.

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

# Report the multi-class log loss on the validation split.
valid_logloss = multiclass_logloss(y_valid, predictions)
print("logloss: %0.3f " % valid_logloss)

logloss: 0.599 


