# 文本预处理

##### 文本是一类序列数据，一篇文章可以看作是字符或单词的序列，本节将介绍文本数据的常见预处理步骤，预处理通常包括四个步骤：

#####  1.读入文本
#####  2.分词
#####  3.建立字典，将每个词映射到一个唯一的索引（index）
#####  4.将文本从词的序列转换为索引的序列，方便输入模型

# 1.1 读入文本

In [26]:
import re



def read_time_machine():
    with open('/Users/CYJ/Desktop/timemachine2.txt', 'r') as f:
        lines = []
        for line in f:
            lines.append( re.sub('[^a-z]+', ' ',line.strip().lower()) )
    return lines

In [27]:
lines = read_time_machine()
print(lines)
print('# sentences %d' % len(lines))

['the time machine', '', '', 'an invention', '', '', '', '', 'by h g wells', '', '', '', '', 'contents', '', '', 'i introduction', 'ii the machine']
# sentences 18


## 1.2 分词

In [28]:
def tokenize(sentences, token='word'):
    'Split sentence into word or char tokens'
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('Error: unknow token type' + token)

In [29]:
tokens = tokenize(lines)
print(tokens)

[['the', 'time', 'machine'], [''], [''], ['an', 'invention'], [''], [''], [''], [''], ['by', 'h', 'g', 'wells'], [''], [''], [''], [''], ['contents'], [''], [''], ['i', 'introduction'], ['ii', 'the', 'machine']]


## 1.3 建立词典

#### 先构建一个字典（vocabulary），将每个词映射到一个唯一的索引编号

In [52]:
import collections

def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens) #返回词典，记录每个词出现频数


class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokems=False):
        counter = count_corpus(tokens)#每个词出现的频数，字典类型
        self.token_freqs = list(counter.items())
        
        self.idx_to_token = [] #单词集合，无重复元素
        if use_special_tokems: # ??
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['', '', '', '']
        else:
            self.unk = 0
            self.idx_to_token += ['']                
        #去重 & 去低频
        self.idx_to_token = [token for token, freq in self.token_freqs
                          if freq > 0 and token not in self.idx_to_token]            
        self.token_to_idx = dict() #创建唯一索引的词典      
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx
            
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    

In [53]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:])

[('the', 0), ('time', 1), ('machine', 2), ('an', 3), ('invention', 4), ('by', 5), ('h', 6), ('g', 7), ('wells', 8), ('contents', 9), ('i', 10), ('introduction', 11), ('ii', 12)]


## 1.4 将词转为索引

In [59]:
for i in range(1,10):
    print('words: ', tokens[i])
    print('indices:', vocab[tokens[i]])

words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['an', 'invention']
indices: [3, 4]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['by', 'h', 'g', 'wells']
indices: [5, 6, 7, 8]
words: %s ['']
indices: [0]


# 2.语言模型数据集

In [64]:
with open('../../datasets/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[100: 120])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]

63282
爱女人
透明的让我感动的可爱女人
坏坏的
想要有直升机 想要和


## 2.1 建立字符索引

In [67]:
idx_to_char = list(set(corpus_chars)) #去重

char_to_idx = {char: i for i, char in enumerate(idx_to_char)} # 字符到索引的映射
vocab_size = len(char_to_idx)
print(vocab_size)



1027


KeyError: 98