# 文本预处理

##### 文本是一类序列数据，一篇文章可以看作是字符或单词的序列，本节将介绍文本数据的常见预处理步骤，预处理通常包括四个步骤：

#####  1.读入文本
#####  2.分词
#####  3.建立字典，将每个词映射到一个唯一的索引（index）
#####  4.将文本从词的序列转换为索引的序列，方便输入模型

# 1.1 读入文本

In [26]:
import torch.nn as nn
import time
import math
import sys
sys.path.append("/home/kesci/input")
import d2l_jay9460 as d2l
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
lines = read_time_machine()
print(lines)
print('# sentences %d' % len(lines))

['the time machine', '', '', 'an invention', '', '', '', '', 'by h g wells', '', '', '', '', 'contents', '', '', 'i introduction', 'ii the machine']
# sentences 18


## 1.2 分词

In [28]:
def tokenize(sentences, token='word'):
    'Split sentence into word or char tokens'
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('Error: unknow token type' + token)

In [29]:
tokens = tokenize(lines)
print(tokens)

[['the', 'time', 'machine'], [''], [''], ['an', 'invention'], [''], [''], [''], [''], ['by', 'h', 'g', 'wells'], [''], [''], [''], [''], ['contents'], [''], [''], ['i', 'introduction'], ['ii', 'the', 'machine']]


## 1.3 建立词典

#### 先构建一个字典（vocabulary），将每个词映射到一个唯一的索引编号

In [52]:
import collections

def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens) #返回词典，记录每个词出现频数


class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokems=False):
        counter = count_corpus(tokens)#每个词出现的频数，字典类型
        self.token_freqs = list(counter.items())
        
        self.idx_to_token = [] #单词集合，无重复元素
        if use_special_tokems: # ??
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['', '', '', '']
        else:
            self.unk = 0
            self.idx_to_token += ['']                
        #去重 & 去低频
        self.idx_to_token = [token for token, freq in self.token_freqs
                          if freq > 0 and token not in self.idx_to_token]            
        self.token_to_idx = dict() #创建唯一索引的词典      
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx
            
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    

In [53]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:])

[('the', 0), ('time', 1), ('machine', 2), ('an', 3), ('invention', 4), ('by', 5), ('h', 6), ('g', 7), ('wells', 8), ('contents', 9), ('i', 10), ('introduction', 11), ('ii', 12)]


## 1.4 将词转为索引

In [59]:
for i in range(1,10):
    print('words: ', tokens[i])
    print('indices:', vocab[tokens[i]])

words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['an', 'invention']
indices: [3, 4]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['']
indices: [0]
words: %s ['by', 'h', 'g', 'wells']
indices: [5, 6, 7, 8]
words: %s ['']
indices: [0]


# 2.语言模型数据集

In [72]:
with open('../../datasets/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[100: 120])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]

63282
爱女人
透明的让我感动的可爱女人
坏坏的


## 2.1 建立字符索引

In [69]:
idx_to_char = list(set(corpus_chars)) #去重

char_to_idx = {char: i for i, char in enumerate(idx_to_char)} # 字符到索引的映射
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]  # 将每个字符转化为索引，得到一个索引的序列
sample = corpus_indices[: 20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

1027
chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [269, 17, 505, 106, 89, 131, 579, 269, 17, 745, 837, 143, 871, 51, 468, 973, 579, 269, 17, 745]


## 2.2 采样

### 2.2.1 随机采样

In [3]:
import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # 减1是因为对于长度为n的序列，X最多只有包含其中的前n - 1个字符
    num_example = (len(corpus_indices) - 1) // num_steps # 下取整
    # 每个样本的第一个字符在corpus_indices中的下标
    example_indices = [i * num_steps for i in range(num_example)]
    
    #
    random.shuffle(example_indices)
    
    def _data(i):
        return corpus_indices[i:i+num_steps]
    
    for i in range(0, num_example, batch_size):
        # python 生成器，返回迭代器
        batch_indices = example_indices
        X = [_data(j) for j in batch_indices]
        Y = [_data(j+1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)


### 2.2.2  相邻采样

## 3 循环神经网络

In [8]:
import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
#print( d2l.load_data_jay_lyrics() )

## 3.1 one-hot 编码

In [6]:
def one_hot(x, n_class, dtype=torch.float32):
    # X shape: (batch), output shape: (batch, n_class)
    x = x.long()
    res = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    res.scatter_(1, x.view(-1, 1), 1)
    return res

x = torch.tensor([0,2])
y = one_hot(x,vocab_size)
print(y)
print(y.shape)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])
torch.Size([2, 1027])


## 3.2 初始化参数模型

In [29]:
print(vocab_size)
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size


import torch.nn as nn
def get_params():
    def _one(shape):
        ts = torch.tensor(np.random.normal(0, 0.01, size = shape))
        return torch.nn.Parameter(ts, requires_grad=True)
    
    #隐藏层
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens)) 
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
    
    #输出层
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs,requires_grad=True))
    return nn.ParameterList([W_xh, W_hh, b_h, W_hq, b_q])

1027


## 3.3 定义模型

In [37]:
def init_rnn_state(batch_size, num_hiddens):
    return (torch.zeros((batch_size, num_hiddens)), )

def rnn(inputs, state, params):
    # inputs和outputs皆为num_steps个形状为(batch_size, vocab_size)的矩阵
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)




def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # 将上一时间步的输出作为当前时间步的输入
        X = one_hot(torch.tensor([[output[-1]]]), vocab_size)
        # 计算输出和更新隐藏状态
        (Y, state) = rnn(X, state, params)
        # 下一个时间步的输入是prefix里的字符或者当前的最佳预测字符
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(dim=1).item()))
    return ''.join([idx_to_char[i] for i in output])