## 1 未去除数字和字母的情况

In [1]:
import pandas as pd
# Load the corpus workbook; the worksheet to read is named explicitly by keyword.
data = pd.read_excel('./data/复旦大学中文文本分类语料.xlsx', sheet_name='sheet1')

### 1.2 分词

In [2]:
import jieba
# Turn on jieba's parallel tokenization mode (64 is the value passed to jieba).
jieba.enable_parallel(64)
# Tokenize every document in the '正文' column and store the tokens as a single
# space-separated string in the '文本分词' column.
data['文本分词'] = data['正文'].apply(lambda doc: ' '.join(jieba.cut(doc)))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built succesfully.


### 1.3 文本标签转为数字

In [3]:
from sklearn.preprocessing import LabelEncoder
# Encode the category names in the '分类' column as integer class ids.
lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(data['分类'].values)

### 1.4 划分训练集和测试集

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the tokenized documents for validation. The split is
# stratified on the labels and seeded so that it is reproducible.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data['文本分词'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

### 1.5 TF-IDF提取文本特征

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
def number_normalizer(tokens):
    """Map every token that starts with a digit to one shared placeholder.

    Tokens that begin with a digit are rarely useful individually in practice,
    yet their presence can still be informative. Representing all of them with
    the same symbol reduces the dimensionality of the feature space.

    Empty tokens are passed through unchanged rather than raising IndexError.

    :param tokens: iterable of string tokens
    :return: a generator yielding "#NUMBER" for each token whose first
        character is a digit, and the token itself otherwise
    """
    # Slice (token[:1]) instead of indexing (token[0]) so "" does not raise.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose tokenizer passes its tokens through number_normalizer."""

    def build_tokenizer(self):
        """Return a callable that tokenizes a document and normalizes its tokens.

        The tokenizer built by the parent class is wrapped so that its output
        is run through number_normalizer and returned as a list.
        """
        base_tokenize = super().build_tokenizer()

        def tokenize_and_normalize(doc):
            return list(number_normalizer(base_tokenize(doc)))

        return tokenize_and_normalize

# Read the stop-word list: one entry per line, surrounding whitespace stripped.
# A context manager is used so the file handle is closed as soon as the list is built.
with open('data/停用词汇总.txt', 'r', encoding='utf-8') as stopword_file:
    stwlist = [line.strip() for line in stopword_file]

# Unigram + bigram TF-IDF features. min_df=3 is an absolute document count and
# max_df=0.5 a proportion of documents; terms outside those bounds are dropped,
# and the loaded stop words are filtered out.
tfv = NumberNormalizingVectorizer(
    stop_words=stwlist,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.5,
    max_features=None,
    use_idf=True,
    smooth_idf=True,
)

# Learn the vocabulary and IDF weights from the training and validation documents
# together (the notebook treats this as semi-supervised learning), then vectorize
# each split separately.
tfv.fit([*xtrain, *xvalid])
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

  'stop_words.' % sorted(inconsistent))


In [6]:
# 把特征词保存到本地
# import codecs
# with codecs.open('output/tfv.txt','w',encoding='utf-8') as f:
#         f.writelines(str(tfv.get_feature_names()))

In [7]:
# Number of features = size of the fitted vocabulary. Reading vocabulary_ avoids
# get_feature_names(), which was removed in scikit-learn 1.2, and does not build
# the full list of terms just to count it.
len(tfv.vocabulary_)

685742

In [8]:
# tfv.vocabulary_

In [9]:
# tfv.get_feature_names()

### 1.6 使用模型分类

In [10]:
import numpy as np
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: the true classes, either a 1-D array-like of integer class
        indices or a 2-D one-hot (indicator) array-like with one row per sample
    :param predicted: 2-D array-like of predicted probabilities, one row per
        sample and one column per class
    :param eps: probabilities are clipped to [eps, 1 - eps] before the log is
        taken, so log(0) never occurs
    :return: the negative log-likelihood averaged over the samples
    """
    # Coerce the inputs so plain Python lists work as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert class indices to a one-hot indicator matrix if not already 2-D.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # One vectorized assignment: row i gets a 1 in column actual[i].
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -1.0 / rows * np.sum(actual * np.log(clipped))

In [11]:
#利用提取的TFIDF特征来fit一个简单的Logistic Regression 

from sklearn.linear_model import LogisticRegression

# Fit a multinomial logistic regression on the training TF-IDF features and
# report the log loss of its predicted probabilities on the validation split.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.607 




## 2 去除字母和数字的情况

### 2.1 导入数据

In [12]:
import codecs 

# Each line of the cleaned corpus file is split on tabs: the first field is the
# label and the second field (whitespace-stripped) is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    for document in f.readlines():
        fields = document.split('\t')
        labels.append(fields[0])
        text.append(fields[1].strip())

### 2.2 标签转换为数字

In [13]:
from sklearn.preprocessing import LabelEncoder
# Encode the string labels read from the corpus file as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)  # rebinds y, replacing the labels computed in section 1

### 2.3 TF-IDF提取文本特征

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=4 is an absolute document count and max_df=0.6 a proportion of
# documents; terms outside those bounds are dropped from the vocabulary.
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)

# Learn the vocabulary/IDF weights and vectorize the whole corpus in one pass.
# Note that this fits on all documents, before the train/validation split.
# fit_transform is equivalent to fit followed by transform, but tokenizes each
# document only once.
features = tfv1.fit_transform(text)

In [15]:
# tfv1.vocabulary_

In [16]:
# tfv1.get_feature_names()

In [17]:
# 把特征词保存到本地
# import codecs
# with codecs.open('output/tfv1.txt','w',encoding='utf-8') as f:
#         f.writelines(str(tfv1.get_feature_names()))

In [18]:
# Number of features = size of the fitted vocabulary. Reading vocabulary_ avoids
# get_feature_names(), which was removed in scikit-learn 1.2, and does not build
# the full list of terms just to count it.
len(tfv1.vocabulary_)

84412

### 2.4 切分数据

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the TF-IDF rows for validation. The split is stratified on
# the labels and seeded so that it is reproducible.
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

### 2.5 使用模型分类

In [20]:
#利用提取的TFIDF特征来fit一个简单的Logistic Regression 

from sklearn.linear_model import LogisticRegression

# Fit a multinomial logistic regression on the training TF-IDF features and
# report the log loss of its predicted probabilities on the validation split.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1.0)
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.564 




## 参数统计
下面这三个参数的默认值分别就是 None、True、True：
```
max_features=None,                 
use_idf=True,
smooth_idf=True
```
### 1
```
tfv1 = TfidfVectorizer()
```
logloss: 0.594 

### 2
```
tfv1 = TfidfVectorizer(min_df=3,  
                      max_df=0.5,
                      max_features=None,                 
                      ngram_range=(1, 2), 
                      use_idf=True,
                      smooth_idf=True)
```
logloss: 0.605 

### 3
```
tfv1 = TfidfVectorizer(min_df=3,  
                      max_df=0.5,
                      max_features=None,                 
                      use_idf=True,
                      smooth_idf=True)
```
logloss: 0.571 

### 4
```
tfv1 = TfidfVectorizer(min_df=3,  
                       max_df=0.8)
```                     
logloss: 0.571 


### 5
```
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.8)
```
logloss: 0.564 


### 6
```
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.5)
```
logloss: 0.565 

### 7
```
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)
```
logloss: 0.564 