# 文本分类

In [None]:
# 分词
# 词语 -> id
#   matrix -> [|V|, embed_size]
#   词语A -> id(5)
#   词表

# label -> id

import os
import sys
import urllib
import urllib.request
import zipfile
from collections import Counter

import jieba # pip install jieba
# Directory that holds every corpus file used by this notebook.
base_dir = './data/cnews/'

# Raw corpus splits, one "label<TAB>content" sample per line.
train_file = os.path.join(base_dir, 'cnews.train.txt')
val_file = os.path.join(base_dir, 'cnews.val.txt')
test_file = os.path.join(base_dir, 'cnews.test.txt')

# Word-segmented copies of the three splits, written by this notebook.
seg_train_file = os.path.join(base_dir, 'cnews.train.seg.txt')
seg_val_file = os.path.join(base_dir, 'cnews.val.seg.txt')
seg_test_file = os.path.join(base_dir, 'cnews.test.seg.txt')

# Lookup files written by this notebook: word frequencies and category names.
vocab_file = os.path.join(base_dir, 'cnews.vocab.txt')
category_file = os.path.join(base_dir, 'cnews.category.txt')

In [None]:
%%time
# Show the working directory, because every data path in this notebook is
# relative to it.
print(os.getcwd())
if not os.path.exists(train_file):
    print('{}不存在, 开始下载文件'.format(train_file))
    archive_path = 'cnews_data.zip'
    urllib.request.urlretrieve(
        'https://s3.amazonaws.com/dikers.nwcd/data-set/cnews_data.zip',
        archive_path)
    # makedirs also creates the missing parent (./data) and does not fail when
    # the directory is already there, unlike a plain `mkdir ./data/cnews`.
    os.makedirs(base_dir, exist_ok=True)
    # Unpack only the three corpus splits, straight into the data directory.
    with zipfile.ZipFile(archive_path) as archive:
        for member in ('cnews.train.txt', 'cnews.test.txt', 'cnews.val.txt'):
            archive.extract(member, base_dir)
    os.remove(archive_path)
else:
    print('文件已经存在')

### 使用jieba分词


In [None]:
# Peek at the first validation sample to see what jieba's segmentation looks
# like. Only one line is needed, so the rest of the file is not read.
with open(val_file, 'r', encoding='utf-8') as f:
    first_line = f.readline()

# Every sample is "label<TAB>content".
label, content = first_line.strip('\r\n').split('\t')
word_iter = jieba.cut(content)

print('label', label)
print(content)
print('/ '.join(word_iter))

###  将样本文件分词

In [None]:
def generate_seg_file(input_file, output_seg_file):
    """Write a word-segmented copy of ``input_file`` to ``output_seg_file``.

    Every non-blank input line must look like ``label<TAB>content``. The
    content is segmented with jieba and written back as
    ``label<TAB>word word word`` with single spaces between words.

    Args:
        input_file: path of the raw UTF-8 sample file.
        output_seg_file: path of the segmented file to create.

    Raises:
        ValueError: if a non-blank line contains no tab.
    """
    # Write to a temporary name first: callers skip segmentation when the
    # output file exists, so a half-written file must never carry that name.
    tmp_file = output_seg_file + '.tmp'
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(tmp_file, 'w', encoding='utf-8') as dst:
        for line in src:
            line = line.strip('\r\n')
            if not line:
                continue
            # Split on the first tab only, so a tab inside the text does not
            # break the unpacking.
            label, content = line.split('\t', 1)
            # Drop whitespace-only tokens (spaces, stray tabs) so the output
            # keeps exactly one tab per line.
            words = (word.strip() for word in jieba.cut(content))
            dst.write('%s\t%s\n' % (label, ' '.join(w for w in words if w)))
    os.replace(tmp_file, output_seg_file)
    print('{} 文件分割完成, 输出路径{} .'.format(input_file, output_seg_file))
        
# Segment each split once; an existing output file is treated as already done.
for _raw_path, _seg_path in (
        (train_file, seg_train_file),
        (val_file, seg_val_file),
        (test_file, seg_test_file)):
    if os.path.exists(_seg_path):
        continue
    generate_seg_file(_raw_path, _seg_path)

### 生成词汇表

In [None]:
%%time
def generate_vocab_file(input_seg_file, output_vocab_file):
    """Count the words of a segmented sample file and write a vocabulary file.

    The output starts with the placeholder line ``<UNK>\\t10000000`` and then
    lists ``word<TAB>count`` for every word, most frequent first. Words with
    equal counts keep the order in which they first appeared.

    Args:
        input_seg_file: UTF-8 file of ``label<TAB>word word ...`` lines.
        output_vocab_file: path of the vocabulary file to create.

    Raises:
        ValueError: if a non-blank line contains no tab.
    """
    word_counts = Counter()
    with open(input_seg_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip('\r\n')
            if not line:
                continue
            _label, sep, content = line.partition('\t')
            if not sep:
                raise ValueError(
                    '%s:%d: expected "label<TAB>content"'
                    % (input_seg_file, line_no))
            word_counts.update(content.split())
    with open(output_vocab_file, 'w', encoding='utf-8') as f:
        # The huge count keeps <UNK> above any frequency threshold applied
        # when the file is read back.
        f.write('<UNK>\t10000000\n')
        for word, count in word_counts.most_common():
            f.write('%s\t%d\n' % (word, count))

# The vocabulary is built from the training split only.
generate_vocab_file(
    input_seg_file=seg_train_file,
    output_vocab_file=vocab_file)

vocab file 格式：每行一个词语及其出现次数，以制表符（Tab）分隔，按出现次数从高到低排列；文件第一行是占位符 `<UNK>`，用来表示词表之外的词语。示例：

```
生活	13141
能够	12911
不会	12898
不同	12871
获得	12870
城市	12825
学校	12775
一定	12736
一直	12606
上海	12574
```

In [None]:
### 对应的label 标签

In [None]:
%%time
def generate_category_dict(input_file, category_file):
    """Collect the labels of a sample file and write them one per line.

    Labels are written in order of first appearance. Each label and its
    sample count is also printed as ``label<TAB>count``.

    Args:
        input_file: UTF-8 file of ``label<TAB>content`` lines.
        category_file: path of the category list to create.

    Raises:
        ValueError: if a non-blank line contains no tab.
    """
    category_counts = Counter()
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip('\r\n')
            if not line:
                continue
            label, sep, _content = line.partition('\t')
            if not sep:
                raise ValueError(
                    '%s:%d: expected "label<TAB>content"'
                    % (input_file, line_no))
            category_counts[label] += 1
    with open(category_file, 'w', encoding='utf-8') as f:
        for category, count in category_counts.items():
            print('%s\t%d' % (category, count))
            f.write('%s\n' % category)
            
# Labels are identical in the raw and segmented files, so the raw training
# split is read directly.
generate_category_dict(
    input_file=train_file,
    category_file=category_file)
            

### 生成词语和类别的 id 映射（词向量层的输入）

In [None]:
%%time
# Vocabulary entries whose frequency is below this value are skipped when the
# vocab file is loaded by Vocab.
num_word_threshold = 10
# Number of word ids per sample: TextDataSet truncates longer texts and pads
# shorter ones to this length.
num_timesteps = 50
class Vocab:
    """Word-to-id lookup table loaded from a ``word<TAB>frequency`` file.

    Ids are assigned in file order, starting at 0, to every word whose
    frequency reaches ``num_word_threshold``. Words that are not in the table
    map to the id of the ``<UNK>`` entry.

    Args:
        filename: path of the UTF-8 vocabulary file.
        num_word_threshold: minimum frequency a word needs to get an id.
    """

    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        # Stays -1 when the file has no '<UNK>' entry above the threshold.
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill the lookup table from ``filename``."""
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    continue
                word, frequency = line.split('\t')
                if int(frequency) < self._num_word_threshold:
                    continue
                # Keep the first id of a repeated word. Overwriting it would
                # leave the table size unchanged, and the next new word would
                # then be handed the same id.
                if word in self._word_to_id:
                    continue
                idx = len(self._word_to_id)
                if word == '<UNK>':
                    self._unk = idx
                self._word_to_id[word] = idx

    def word_to_id(self, word):
        """Return the id of ``word``, or the unknown-word id if it is absent."""
        return self._word_to_id.get(word, self._unk)

    def get_word_dict(self):
        """Return the underlying word -> id dictionary (not a copy)."""
        return self._word_to_id

    @property
    def unk(self):
        """Id used for words that are not in the table."""
        return self._unk

    def size(self):
        """Return the number of words in the table."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return [self.word_to_id(cur_word) for cur_word in sentence.split()]


class CategoryDict:
    """Category-name-to-id lookup table loaded from a one-name-per-line file.

    Ids are assigned in file order, starting at 0.

    Args:
        filename: path of the UTF-8 category file.
    """

    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                category = line.strip('\r\n')
                # Blank lines are not categories, and a repeated name keeps
                # its first id so that ids stay unique.
                if not category or category in self._category_to_id:
                    continue
                self._category_to_id[category] = len(self._category_to_id)

    def size(self):
        """Return the number of known categories."""
        return len(self._category_to_id)

    def category_to_id(self, category):
        """Return the id of ``category``.

        Raises:
            ValueError: if ``category`` is not in the table.
        """
        if category not in self._category_to_id:
            raise ValueError(
                "%s is not in our category list" % category)
        return self._category_to_id[category]
        
# Load the word table from the generated vocab file and report its size.
vocab = Vocab(vocab_file, num_word_threshold)
vocab_size = vocab.size()
print('vocab_size: %d' % vocab_size)

# Load the category table and report how many classes it holds.
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
print('num_classes: %d' % num_classes)

# Spot check: look up the id of one category.
test_str = '时尚'
print('label: %s, id: %d' % (test_str, category_vocab.category_to_id(test_str)))

### 生成词的概率分布

In [None]:
%%time
import random
import numpy as np
# NOTE(review): despite its name, `word_counts` is the word -> id mapping
# returned by Vocab.get_word_dict(). Every "count" below is therefore a
# vocabulary id (the word's position among the loaded vocab entries), not an
# occurrence count. Confirm whether real frequencies were intended.
word_counts = vocab.get_word_dict()
threshold = 1e-5
# Prints the id assigned to '的'.
print(word_counts['的'])
# Number of vocabulary entries (not a total number of word occurrences).
total_count = len(word_counts)
print('total_count', total_count)
# id / vocabulary size for each word: 0 for id 0 and just under 1 for the last id.
freqs = {word: count/total_count for word, count in word_counts.items()}
print(freqs['儿童'])
# Earlier variant, keyed by word instead of by id:
# p_drop = {word: 1 - np.sqrt(threshold/(freqs[word] + 1e-10)) for word in word_counts}
# Keyed by word id. TextDataSet keeps a word when
# random.random() > 1 - p_drop[id], i.e. it uses these values as KEEP
# probabilities. Low ids get small or negative values and are mostly dropped
# (id 0 gives about 1 - sqrt(1e5), so it is never kept), while high ids get
# values close to 1.
# NOTE(review): the name suggests a drop probability; verify the intent before
# changing this cell or the comparison in TextDataSet, since they must agree.
p_drop = {vocab.word_to_id(word): 1 - np.sqrt(threshold/(freqs[word] + 1e-10)) for word in word_counts}
print(p_drop[100]) 

In [None]:
%%time
import random
class TextDataSet:
    """In-memory dataset of fixed-length word-id rows and their label ids.

    The file is parsed once at construction time. Every non-blank
    ``label<TAB>content`` line becomes one row of ``num_timesteps`` word ids
    plus one label id, and the rows are shuffled before first use.

    Args:
        filename: path of a segmented UTF-8 sample file, one sample per line.
        vocab: object providing ``sentence_to_id(text)`` and an ``unk`` id.
        category_vocab: object providing ``category_to_id(label)``.
        num_timesteps: length every row is truncated or padded to.
        keep_probs: optional mapping from word id to the probability of
            keeping that word. When omitted, the notebook-level ``p_drop``
            table is used, which is what this class has always read.
    """

    def __init__(self, filename, vocab, category_vocab, num_timesteps,
                 keep_probs=None):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        if keep_probs is None:
            # Fall back to the table built in the preceding notebook cell.
            keep_probs = p_drop
        self._keep_probs = keep_probs
        # Matrix of word ids, one row per sample.
        self._inputs = []
        # Vector of label ids, one entry per sample.
        self._outputs = []
        # Start position of the next batch.
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self, filename):
        """Read ``filename`` and build the input matrix and label vector."""
        print('Loading data from %s' % filename)
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    continue
                # Split on the first tab only; the text itself is split on
                # whitespace afterwards.
                label, content = line.split('\t', 1)
                id_label = self._category_vocab.category_to_id(label)

                id_words = self._vocab.sentence_to_id(content)
                # Random subsampling: a word survives with probability
                # keep_probs[id] (values <= 0 never survive, >= 1 always do).
                id_words = [
                    word for word in id_words
                    if random.random() > (1 - self._keep_probs[word])]

                id_words = id_words[0: self._num_timesteps]
                # There is no dedicated padding id, so short rows are filled
                # with the unknown-word id.
                padding_num = self._num_timesteps - len(id_words)
                id_words = id_words + [self._vocab.unk] * padding_num
                self._inputs.append(id_words)
                self._outputs.append(id_label)
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        self._num_examples = len(self._inputs)

    def _random_shuffle(self):
        """Shuffle inputs and labels with the same permutation."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def num_examples(self):
        """Return the number of samples."""
        return self._num_examples

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` samples as ``(inputs, labels)``.

        When fewer than ``batch_size`` samples remain, the data is reshuffled
        and the batch is taken from the start again.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of samples.
        """
        if batch_size > len(self._inputs):
            raise ValueError("batch_size: %d is too large" % batch_size)
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size

        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs

    def get_data(self):
        """Return the full ``(inputs, labels)`` arrays."""
        return self._inputs, self._outputs
        
            
# One dataset per split, all sharing the same lookup tables and row length.
train_dataset = TextDataSet(seg_train_file, vocab, category_vocab, num_timesteps)
val_dataset = TextDataSet(seg_val_file, vocab, category_vocab, num_timesteps)
test_dataset = TextDataSet(seg_test_file, vocab, category_vocab, num_timesteps)



In [None]:
%%time
from keras.utils import to_categorical

# Number of samples in each split.
print(train_dataset.num_examples())
print(val_dataset.num_examples())
print(test_dataset.num_examples())

x_train, y_train = train_dataset.get_data()
x_val, y_val = val_dataset.get_data()
x_test, y_test = test_dataset.get_data()

# Pass num_classes explicitly. Without it the one-hot width is inferred from
# the largest label present in each array, so a split that lacks the last
# category would get a narrower matrix than the model's output layer.
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)
print('y_train.shape', y_train.shape)
print(x_train[5:8])
print(type(vocab))

In [None]:
%%time
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout

# All rows were padded or truncated to the same length, so any split gives it.
maxlen = x_test.shape[1]
_epochs = 10
model = Sequential()
# The input length is fixed on the Embedding layer so that the embedded
# sequence can be flattened below.
model.add(Embedding(vocab_size, 32, input_length=maxlen))
# After the Embedding layer the activations have shape (samples, maxlen, 32).

# Flatten the 3D tensor of embeddings into a 2D tensor of shape
# (samples, maxlen * 32).
model.add(Flatten())
# Classifier on top.
model.add(Dropout(0.4))
# One softmax unit per category, taken from the category table rather than
# hard-coded, so it always matches the width of the one-hot labels.
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=_epochs,
                    batch_size=64,
                    validation_data=(x_val, y_val))

# Final score on the held-out test split: [loss, accuracy].
print(model.evaluate(x_test, y_test))

In [None]:
import matplotlib.pyplot as plt
# Plot the validation loss recorded by the most recent model.fit call.
val_loss = history.history['val_loss']
# Take the epoch axis from the history itself so that it always has the same
# length as the data, whatever number of epochs was trained.
epochs = range(1, len(val_loss) + 1)
# 'r+' draws red plus markers; the colour is given once, in the format string.
plt.plot(epochs, val_loss, 'r+', label='Model')
plt.xlabel('Epochs')
plt.ylabel('Validation loss')
plt.legend()

plt.show()

In [None]:
from keras.layers import LSTM

# Same task with a recurrent model: embeddings -> LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.5))
# One softmax unit per category, matching the width of the one-hot labels.
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Validate on the validation split, as the dense model does. The test split
# stays untouched during training so it remains a fair final measure.
history = model.fit(x_train, y_train,
                    epochs=_epochs,
                    batch_size=64,
                    validation_data=(x_val, y_val))