In [22]:
"""
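Data preprocessing task list:
[1] Word segmentation.
[2] Word -> id. Example: "好": [1,2,3,4,1,2,3,4]
    matrix = [|V|, embedding_size]
    Word id. Example: "好": 1748.
    Vocabulary: which words occur in the text of the whole training set.
[3] Label -> id. Example: "体育": 2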
"""

import os
import jieba
import sys

# Base directory holding the cnews data files. Set the CNEWS_DATA_DIR
# environment variable to run on another machine; when it is unset the
# original hard-coded location is used, so existing behaviour is unchanged.
data_dir = os.environ.get(
    "CNEWS_DATA_DIR", "/Users/michaelwang/Desktop/Text_Classification"
)

# input files (each non-blank line is "label<TAB>content")
train_file = os.path.join(data_dir, "cnews.train.txt")
val_file = os.path.join(data_dir, "cnews.val.txt")
test_file = os.path.join(data_dir, "cnews.test.txt")

# output files: word-segmented copies of the inputs
seg_train_file = os.path.join(data_dir, "cnews.train.seg.txt")
seg_val_file = os.path.join(data_dir, "cnews.val.seg.txt")
seg_test_file = os.path.join(data_dir, "cnews.test.seg.txt")

# output files: vocabulary and category list built from the training set
vocab_file = os.path.join(data_dir, "cnews.vocab.txt")
category_file = os.path.join(data_dir, "cnews.category.txt")

In [25]:
def generate_seg_file(input_file, seg_file):
    """
    Segment a labelled corpus into words and write the result to a new file.

    Every non-blank line of ``input_file`` must have the form
    ``label<TAB>content``. The content is tokenised with ``jieba.cut``;
    tokens that consist only of whitespace are dropped, every remaining
    token is followed by one space, and the line is written to ``seg_file``
    as ``label<TAB>tokens``. Blank lines are skipped.

    Args:
        input_file: Path of the source file, read as UTF-8.
        seg_file: Path of the output file, written as UTF-8 and overwritten
            if it already exists.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # which is not reliable for Chinese text.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open(seg_file, 'w', encoding='utf-8') as f2:
        for line_no, line in enumerate(lines, 1):
            line = line.strip('\r\n')
            if not line:
                continue
            # Split on the first tab only, so a tab inside the content does
            # not break the two-field unpacking.
            label, sep, content = line.partition('\t')
            if not sep:
                raise ValueError(
                    "%s:%d: expected 'label<TAB>content'" % (input_file, line_no)
                )
            # Strip all whitespace (not just ' ') so that a stray tab token
            # can never add a second tab to the output line.
            words = (word.strip() for word in jieba.cut(content))
            word_content = ''.join(word + ' ' for word in words if word)
            f2.write("%s\t%s\n" % (label, word_content))
            
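# Uncomment to (re)generate the segmented files. The cells below read
# seg_train_file, so it must exist before they are run.
# generate_seg_file(val_file, seg_val_file)
# generate_seg_file(test_file, seg_test_file)
# generate_seg_file(train_file, seg_train_file)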

In [28]:
def generate_vocab_file(seg_file, vocab_file):
    """
    Build a frequency-sorted vocabulary from a segmented file.

    Counts every whitespace-separated word in the content field of
    ``seg_file`` (lines of the form ``label<TAB>word word ...``) and writes
    ``vocab_file`` with one ``word<TAB>count`` entry per line, most frequent
    first. Words with equal counts keep the order in which they were first
    seen. The first line is always the ``<UNK>`` placeholder with a count of
    1000000. Blank lines in ``seg_file`` are skipped.

    Args:
        seg_file: Path of the segmented input file, read as UTF-8.
        vocab_file: Path of the vocabulary file to write (UTF-8, overwritten).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    word_dict = {}
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(seg_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip('\r\n')
            if not line:
                continue
            # Only the content field is needed; split on the first tab only.
            _, content = line.split('\t', 1)
            for word in content.split():
                word_dict[word] = word_dict.get(word, 0) + 1
    # sorted() is stable, so ties stay in first-seen order.
    sorted_word_dict = sorted(word_dict.items(), key=lambda d: d[1], reverse=True)
    with open(vocab_file, 'w', encoding='utf-8') as f2:
        # The header must use the same "word<TAB>count\n" layout as every
        # other entry; without the newline it fuses with the first real word.
        f2.write("<UNK>\t1000000\n")
        for word, count in sorted_word_dict:
            f2.write("%s\t%d\n" % (word, count))
            
# Build the vocabulary from the segmented training set only;
# seg_train_file must already exist.
generate_vocab_file(seg_train_file, vocab_file)

In [29]:
def generate_category_file(seg_file, category_file):
    """
    Write the list of distinct category labels found in a segmented file.

    Reads the label field of every non-blank ``label<TAB>content`` line in
    ``seg_file`` and writes each distinct label once to ``category_file``,
    one per line. On Python 3.7+ labels are written in first-seen order,
    because dicts preserve insertion order there.

    Args:
        seg_file: Path of the segmented input file, read as UTF-8.
        category_file: Path of the category file to write (UTF-8, overwritten).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    # A dict is used as an ordered set; the per-label counts that used to be
    # kept here were never written out.
    category_dict = {}
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(seg_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip("\r\n")
            if not line:
                continue
            # Only the label is needed; split on the first tab only.
            label, _ = line.split('\t', 1)
            category_dict.setdefault(label, None)
    with open(category_file, 'w', encoding='utf-8') as f2:
        for category in category_dict:
            f2.write("%s\n" % category)
            
# Collect the category labels from the segmented training set;
# seg_train_file must already exist.
generate_category_file(seg_train_file, category_file)