# 文本分类

使用循环神经网络对文本进行分类, 多对一问题

In [1]:


import sys
import os
import jieba # pip install jieba
import urllib.request as ur

# Remote archive that contains the three raw cnews splits.
data_file_url = 'https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/cnews_data.zip'
# Local working directory for every raw and derived file.
base_dir = './dataset/cnews/'

# Raw input splits.
train_file, val_file, test_file = (
    os.path.join(base_dir, 'cnews.%s.txt' % split)
    for split in ('train', 'val', 'test'))

# Segmented (space-separated tokens) versions of the splits.
seg_train_file, seg_val_file, seg_test_file = (
    os.path.join(base_dir, 'cnews.%s.seg.txt' % split)
    for split in ('train', 'val', 'test'))

# Derived lookup files.
vocab_file = os.path.join(base_dir, 'cnews.vocab.txt')
category_file = os.path.join(base_dir, 'cnews.category.txt')

### 准备数据
数据格式：每行一条样本，由 `分类` 和 `文本详细内容` 两个字段组成，字段之间以制表符（Tab）分隔。下面示例中的 `-->` 仅用来标示分隔位置。
>体育    -->  黄蜂vs湖人首发：科比带伤战保罗 加索尔救赎之......

>娱乐    -->  深湖巨兽 导演想来中国拍“水怪”晚报讯 由华夏电影公司发行的电影.....

>家居    -->  橱柜的商品性质十分模糊(二)橱柜到底是什么性质的商品十分模糊：.....



In [2]:
%%time
!pwd
if not os.path.exists(train_file):
    !mkdir ./datasets
    !mkdir ./datasets/cnews
    print('{}不存在, 开始下载文件'.format(train_file))
    ur.urlretrieve(data_file_url, "cnews_data.zip")
    !unzip ./cnews_data.zip
    !rm ./cnews_data.zip
    !mkdir ./datasets/cnews 
    !mv cnews.train.txt ./datasets/cnews/
    !mv cnews.test.txt ./datasets/cnews/
    !mv cnews.val.txt ./datasets/cnews/
    !rm -fr __MACOSX
else:
    print('文件已经存在')

/mnt/sdf/workspace/git_hub_demo/learn-rnn
mkdir: cannot create directory ‘./datasets’: File exists
./dataset/cnews/cnews.train.txt不存在, 开始下载文件
Archive:  ./cnews_data.zip
  inflating: cnews.test.txt          
   creating: __MACOSX/
  inflating: __MACOSX/._cnews.test.txt  
  inflating: cnews.val.txt           
  inflating: __MACOSX/._cnews.val.txt  
  inflating: cnews.train.txt         
  inflating: __MACOSX/._cnews.train.txt  
mkdir: cannot create directory ‘./datasets/cnews’: File exists
CPU times: user 409 ms, sys: 253 ms, total: 662 ms
Wall time: 3.88 s


### 使用jieba分词

如果没有安装jieba 请安装， 命令如下： 

```
conda info --envs

source activate tensorflow_p36

pip install jieba

```
然后重新启动kernel



In [14]:
# Demonstrate jieba segmentation on the first record of the validation split.
with open(val_file, mode='r', encoding='utf-8') as f:
    lines = f.readlines()

# Each record is "label<TAB>text"; only the line terminator is stripped.
first_record = lines[0].strip('\r\n')
label, content = first_record.split('\t')
# jieba.cut returns a lazy iterator of tokens; it is consumed by the join below.
word_iter = jieba.cut(content)

print('label', label)
print(content)
print('/ '.join(word_iter))

label 体育
黄蜂vs湖人首发：科比带伤战保罗 加索尔救赎之战 新浪体育讯北京时间4月27日，NBA季后赛首轮洛杉矶湖人主场迎战新奥尔良黄蜂，此前的比赛中，双方战成2-2平，因此本场比赛对于两支球队来说都非常重要，赛前双方也公布了首发阵容：湖人队：费舍尔、科比、阿泰斯特、加索尔、拜纳姆黄蜂队：保罗、贝里内利、阿里扎、兰德里、奥卡福[新浪NBA官方微博][新浪NBA湖人新闻动态微博][新浪NBA专题][黄蜂vs湖人图文直播室](新浪体育)
黄蜂/ vs/ 湖人/ 首发/ ：/ 科比/ 带伤/ 战/ 保罗/  / 加索尔/ 救赎/ 之战/  / 新浪/ 体育讯/ 北京/ 时间/ 4/ 月/ 27/ 日/ ，/ NBA/ 季后赛/ 首轮/ 洛杉矶/ 湖人/ 主场/ 迎战/ 新奥尔良/ 黄蜂/ ，/ 此前/ 的/ 比赛/ 中/ ，/ 双方/ 战成/ 2/ -/ 2/ 平/ ，/ 因此/ 本场/ 比赛/ 对于/ 两支/ 球队/ 来说/ 都/ 非常/ 重要/ ，/ 赛前/ 双方/ 也/ 公布/ 了/ 首发/ 阵容/ ：/ 湖人队/ ：/ 费舍尔/ 、/ 科比/ 、/ 阿泰斯特/ 、/ 加索尔/ 、/ 拜纳姆/ 黄蜂队/ ：/ 保罗/ 、/ 贝里/ 内利/ 、/ 阿里/ 扎/ 、/ 兰德/ 里/ 、/ 奥卡福/ [/ 新浪/ NBA/ 官方/ 微博/ ]/ [/ 新浪/ NBA/ 湖人/ 新闻动态/ 微博/ ]/ [/ 新浪/ NBA/ 专题/ ]/ [/ 黄蜂/ vs/ 湖人/ 图文/ 直播室/ ]/ (/ 新浪/ 体育/ )


###  将样本文件分词

分词后文件格式

>体育-->黄蜂 vs 湖人 首发 ： 科比 带伤 战 保罗 加索尔 救赎 之战

>体育-->广东 半场 狂飙 狂胜 山东 陈 江华 关键 闪光 稳住 全队 


In [15]:
%%time
def generate_seg_file(input_file, output_seg_file):
    """Segment the text of every record in input_file with jieba.

    Each input line is "label<TAB>text". The output holds one line per
    record in the form "label<TAB>tok1 tok2 ...", with tokens joined by a
    single space and whitespace-only tokens dropped.

    Args:
        input_file: path of the raw UTF-8 file to read.
        output_seg_file: path of the segmented UTF-8 file to write.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    # Write to a temporary file and rename at the end, so an interrupted run
    # never leaves a partial output that an "already exists" check would
    # mistake for a finished one.
    tmp_file = output_seg_file + '.tmp'
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
                open(tmp_file, 'w', encoding='utf-8') as fout:
            for line in fin:
                line = line.strip('\r\n')
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                # maxsplit=1: only the first tab separates label from text.
                label, content = line.split('\t', 1)
                # Strip all whitespace so stray tabs inside the text cannot
                # leak into the tab-separated output.
                words = (word.strip() for word in jieba.cut(content))
                fout.write('%s\t%s\n' % (label, ' '.join(w for w in words if w)))
        os.replace(tmp_file, output_seg_file)
    except BaseException:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise
    print('{} 文件分割完成, 输出路径{} .'.format(input_file, output_seg_file))
        
# Segment each split once; an existing output file is treated as a cache hit.
for _raw_path, _seg_path in (
        (train_file, seg_train_file),
        (val_file, seg_val_file),
        (test_file, seg_test_file)):
    if os.path.exists(_seg_path):
        continue
    generate_seg_file(_raw_path, _seg_path)

### 生成词汇表


| 词语 | 出现的数量 |
| :-----| ----: |
| 生活 | 13141 |
| 能够 | 12911 |
| 不会 | 12898 |


In [18]:
%%time
def generate_vocab_file(input_seg_file, output_vocab_file):
    """Write a frequency-sorted vocabulary for a segmented file.

    Every line of input_seg_file is "label<TAB>tok1 tok2 ...". The output
    starts with the line "<UNK>\t10000000" and is followed by one
    "token<TAB>count" line per distinct token, highest count first; tokens
    with equal counts keep their first-seen order.
    """
    with open(input_seg_file, 'r', encoding='utf-8') as seg_f:
        records = seg_f.readlines()

    counts = {}
    for record in records:
        _label, text = record.strip('\r\n').split('\t')
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1

    # Ascending sort on the negated count is a stable descending sort.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])

    rows = ['<UNK>\t10000000\n']
    rows.extend('%s\t%d\n' % (token, freq) for token, freq in ranked)
    with open(output_vocab_file, 'w', encoding='utf-8') as vocab_f:
        vocab_f.writelines(rows)

# Build the vocabulary from the segmented training split; this rewrites vocab_file on every run.
generate_vocab_file(seg_train_file, vocab_file)

CPU times: user 6.63 s, sys: 76.2 ms, total: 6.7 s
Wall time: 6.7 s


vocab file 格式： 
词语 和 出现的数量

```
生活	13141
能够	12911
不会	12898
不同	12871
获得	12870
城市	12825
学校	12775
一定	12736
一直	12606
上海	12574
```

### 对应的label 标签


| 分类标签 | 出现的数量 |
| :-----| ----: |
| 体育 | 5000 |
| 娱乐 | 5000 |
| 家居 | 5000 |
| 房产 | 5000 |
| 教育 | 5000 |
| 时尚 | 5000 |
| 时政 | 5000 |
| 游戏 | 5000 |
| 科技 | 5000 |
| 财经 | 5000 |


In [19]:
%%time
def generate_category_dict(input_file, category_file):
    """Collect the distinct labels of input_file and write them to category_file.

    Every line of input_file is "label<TAB>text". Labels are written one per
    line in first-seen order, and "label<TAB>count" is printed for each.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        records = src.readlines()

    tally = {}
    for record in records:
        label, _content = record.strip('\r\n').split('\t')
        tally[label] = tally.get(label, 0) + 1

    with open(category_file, 'w', encoding='utf-8') as dst:
        for label, seen in tally.items():
            print('%s\t%d' % (label, seen))
            dst.write('%s\n' % label)
            
# Labels are read from the raw (unsegmented) training split; this rewrites category_file on every run.
generate_category_dict(train_file, category_file)
            

体育	5000
娱乐	5000
家居	5000
房产	5000
教育	5000
时尚	5000
时政	5000
游戏	5000
科技	5000
财经	5000
CPU times: user 348 ms, sys: 29.6 ms, total: 378 ms
Wall time: 377 ms


### 构建词表与类别映射（词 → id，类别 → id）

In [20]:
%%time
# Minimum frequency a word needs in the vocab file to be kept by Vocab.
num_word_threshold = 10
# Fixed sequence length: TextDataSet truncates or pads every sample to this many ids.
num_timesteps = 50
class Vocab:
    """Word-to-id lookup built from a "word<TAB>frequency" vocabulary file.

    Words whose frequency is below num_word_threshold are skipped. Ids are
    assigned in file order, starting at 0. The id of '<UNK>' (or -1 when the
    file has no such entry) is returned for every unknown word.
    """

    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill the lookup from filename, keeping sufficiently frequent words."""
        with open(filename, 'r', encoding='utf-8') as vocab_f:
            rows = vocab_f.readlines()
        for row in rows:
            word, frequency = row.strip('\r\n').split('\t')
            if int(frequency) >= self._num_word_threshold:
                next_id = len(self._word_to_id)
                if word == '<UNK>':
                    self._unk = next_id
                self._word_to_id[word] = next_id

    def word_to_id(self, word):
        """Return the id of word, or the unknown-word id if it is not stored."""
        try:
            return self._word_to_id[word]
        except KeyError:
            return self._unk

    def get_word_dict(self):
        """Return the underlying word -> id dictionary (not a copy)."""
        return self._word_to_id

    @property
    def unk(self):
        """Id used for unknown words."""
        return self._unk

    def size(self):
        """Number of words stored in the lookup."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return list(map(self.word_to_id, sentence.split()))


class CategoryDict:
    """Map category names to consecutive integer ids.

    The file is read as one category name per line; ids follow line order,
    starting at 0.
    """

    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                category = line.strip('\r\n')
                # Skip blank lines so they do not become a bogus '' class, and
                # keep the first id of a repeated name so ids stay dense
                # (0 .. size() - 1).
                if not category or category in self._category_to_id:
                    continue
                self._category_to_id[category] = len(self._category_to_id)

    def size(self):
        """Number of distinct categories."""
        return len(self._category_to_id)

    def category_to_id(self, category):
        """Return the id of category.

        Raises:
            ValueError: if category is not in the loaded list.
        """
        if category not in self._category_to_id:
            raise ValueError(
                "%s is not in our category list" % category)
        return self._category_to_id[category]
        
# Word lookup, filtered by the frequency threshold.
vocab = Vocab(vocab_file, num_word_threshold)
vocab_size = vocab.size()
print('vocab_size: %d' % vocab_size)

# Category lookup; its size is the number of output classes.
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
print('num_classes: %d' % num_classes)

# Sanity check: resolve one known label.
test_str = '时尚'
test_id = category_vocab.category_to_id(test_str)
print('label: %s, id: %d' % (test_str, test_id))

vocab_size: 77323
num_classes: 10
label: 时尚, id: 5
CPU times: user 218 ms, sys: 10.2 ms, total: 228 ms
Wall time: 227 ms


### 生成词的概率分布

In [21]:
%%time
import random
import numpy as np
# NOTE(review): get_word_dict() returns Vocab's word -> id mapping, not corpus
# counts, so every `count` below is a word id. Ids are assigned in vocab-file
# order and that file is written most-frequent-first, so the id acts as a
# frequency rank, not a frequency.
word_counts = vocab.get_word_dict()
# Constant in the numerator of the p_drop formula below.
threshold = 1e-5
print(len(word_counts))
# This is the vocabulary size (number of entries), not a total token count.
total_count = len(word_counts)
print('total_count', total_count)
# Each value is therefore id / vocab_size: 0 for the first entry, close to 1 for the last.
freqs = {word: count/total_count for word, count in word_counts.items()}
# Raises KeyError if '儿童' is not in the vocabulary.
print(freqs['儿童'])
# Earlier variant keyed by word instead of by id:
# p_drop = {word: 1 - np.sqrt(threshold/(freqs[word] + 1e-10)) for word in word_counts}
# Keyed by word id. The value grows with the id and is strongly negative for id 0
# (freq 0, only the 1e-10 term in the denominator).
# NOTE(review): TextDataSet._parse_file keeps a token when
# random.random() > 1 - p_drop[id], so low-id words are the ones removed most
# often. Changing either this formula or that comparison alone changes which
# words survive — review both together.
p_drop = {vocab.word_to_id(word): 1 - np.sqrt(threshold/(freqs[word] + 1e-10)) for word in word_counts}
print(p_drop[100]) 

77323
total_count 77323
0.03590134888713578
0.9120665058006028
CPU times: user 140 ms, sys: 2.86 ms, total: 143 ms
Wall time: 142 ms


In [22]:
%%time
import random
class TextDataSet:
    """In-memory dataset of fixed-length word-id sequences with integer labels.

    The file is parsed once at construction. Each "label<TAB>tok1 tok2 ..."
    line becomes one row of exactly num_timesteps word ids (truncated, or
    padded with the vocabulary's unknown-word id) plus one category id. The
    samples are shuffled after loading.

    NOTE: _parse_file reads the module-level `p_drop` mapping (word id ->
    value), which must be defined before an instance is created.
    """

    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # After parsing: int32 array with one row of word ids per sample.
        self._inputs = []
        # After parsing: int32 array with one category id per sample.
        self._outputs = []
        # Start offset of the next batch returned by next_batch().
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self, filename):
        """Load, subsample, truncate/pad and shuffle every record of filename."""
        print('Loading data from %s' % filename)
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                label, content = line.strip('\r\n').split('\t')
                id_label = self._category_vocab.category_to_id(label)

                id_words = self._vocab.sentence_to_id(content)
                # Random subsampling: a token is kept only when the draw
                # exceeds 1 - p_drop[id] (p_drop is the module-level mapping).
                id_words = [word for word in id_words
                            if random.random() > (1 - p_drop[word])]

                # Force every sample to exactly num_timesteps ids.
                id_words = id_words[0: self._num_timesteps]
                padding_num = self._num_timesteps - len(id_words)
                id_words = id_words + [self._vocab.unk] * padding_num
                self._inputs.append(id_words)
                self._outputs.append(id_label)
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        self._num_examples = len(self._inputs)

    def _random_shuffle(self):
        """Apply one shared random permutation to inputs and outputs."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def num_examples(self):
        """Number of samples loaded from the file."""
        return self._num_examples

    def next_batch(self, batch_size):
        """Return the next (inputs, outputs) slice of batch_size samples.

        When fewer than batch_size samples remain, the data is reshuffled and
        reading restarts from the beginning.

        Raises:
            ValueError: if batch_size exceeds the number of samples.
        """
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise ValueError("batch_size: %d is too large" % batch_size)

        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs

    def get_data(self):
        """Return the full (inputs, outputs) arrays (not copies)."""
        return self._inputs, self._outputs
        
            
train_dataset = TextDataSet(
    seg_train_file, vocab, category_vocab,num_timesteps) 
val_dataset = TextDataSet(
    seg_val_file, vocab, category_vocab, num_timesteps)
test_dataset = TextDataSet(
    seg_test_file, vocab, category_vocab, num_timesteps)



Loading data from %s ./dataset/cnews/cnews.train.seg.txt
Loading data from %s ./dataset/cnews/cnews.val.seg.txt
Loading data from %s ./dataset/cnews/cnews.test.seg.txt
CPU times: user 23.7 s, sys: 81.6 ms, total: 23.8 s
Wall time: 23.8 s


### 分割数据

In [23]:
%%time
from keras.utils import to_categorical
print(train_dataset.num_examples())
print(val_dataset.num_examples())
print(test_dataset.num_examples())

x_train, y_train = train_dataset.get_data()
x_val, y_val = val_dataset.get_data()
x_test, y_test = test_dataset.get_data()
# Pass num_classes explicitly: without it the one-hot width is inferred from the
# largest label present in each split, so a split that lacks the last class
# would get a narrower matrix than the model's output layer.
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)
print('y_train.shape', y_train.shape)
print(x_train[5:8])
print(type(vocab))


50000
5000
10000
(50000, 50)
(5000, 50)
(10000, 50)
y_train.shape (50000, 10)
[[  467    11 48363  6409  6162 10443 44435  8019    87  3833  2320    11
     49  2335   310 17921   251  2637 35252 25975  4605    50 18271     2
    194    91 41216 74378 13858 58863     2 31817 48363  6409  1500     7
     31    73   104   376  2528     8  2177   313     3  2598 24072  1808
   1335  2923]
 [ 9433 25390   672 23055 14845 71763   690   301     4   660  7860     2
   1791  9433 25390   672 32826   133    32 23055  6816   404   949  2488
      6  1237 18224 61989 18167  4238   334   174    44  9433 25390   672
  23055 11260    84     6 59349     8    25    38  2244   385 23055     2
   4685   469]
 [ 2358 54606   746   709   648   297  8575   462   709    11  1465   648
    962  3267    28  1241   684    59   495   193   746  1465   648 11032
    255     6  4808  5930    28   193    86  1728     2  2225    43  1330
     13  1161 12374   677     7  1874   962  1496  1733  1176     5  3033
    

### 进行训练

In [None]:
%%time
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
# Sequence length fed to the model, taken from the training inputs.
maxlen = x_train.shape[1]
_epochs = 10
model = Sequential()
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs
model.add(Embedding(vocab_size, 32, input_length=maxlen))
# After the Embedding layer,
# our activations have shape `(samples, maxlen, 32)`.

# We flatten the 3D tensor of embeddings
# into a 2D tensor of shape `(samples, maxlen * 32)`
model.add(Flatten())
# We add the classifier on top
model.add(Dropout(0.4))
# One output unit per category, so the layer follows the category file
# instead of a hard-coded 10.
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=_epochs,
                    batch_size=64,
                    validation_data=(x_val, y_val))

# The held-out test split is used only here, after training.
print(model.evaluate(x_test, y_test))

In [None]:
import matplotlib.pyplot as plt
val_loss = history.history['val_loss']
# Build the x-axis from the recorded history so it always matches the number
# of epochs that actually ran.
epochs = range(1, len(val_loss) + 1)
# 'r+' = red plus markers; the colour is given once, in the format string.
plt.plot(epochs, val_loss, 'r+', label='Model')
plt.xlabel('Epochs')
# The curve is the validation loss, so label it as such.
plt.ylabel('Validation loss')
plt.legend()

plt.show()

In [None]:
from keras.layers import LSTM
# Recurrent variant: embedding -> LSTM -> dropout -> softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.5))
# One output unit per category instead of a hard-coded 10.
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Validate on the validation split, like the first model; monitoring on the
# test split would leak it into model selection.
history = model.fit(x_train, y_train,
                    epochs=_epochs,
                    batch_size=64,
                    validation_data=(x_val, y_val))