In [1]:
corpus_name = 'cornell movie-dialogs corpus'
corpus = os.path.join('data',corpus_name)
def printLines(file,n=10):
    with open(file,'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)
# printLines(os.path.join(corpus,'movie_lines.txt'))

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script,trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv,random,re,os,unicodedata
import codecs
from io import open
import itertools
import math

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [3]:
def loadLines(fileName,fields):
    lines = {}
    with open(fileName,'r',encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            lineObj = {}
            for i,field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines
# 传入conversation file,lines,和字段们
def loadConversations(fileName,lines,fields):
    conversations = []
    with open(fileName,'r',encoding="iso-8859-1") as f:
        for line in f:
            values = line.split(" +++$+++ ")
            convObj = {}
            for i,field in enumerate(fields):
                convObj[field] = values[i]
            utterance_id_pattern = re.compile('L[0-9]+')
            lineIds = utterance_id_pattern.findall(convObj["utteranceIDs"])
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
        return conversations

def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        for i in range(len(conversation["lines"]) - 1):
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i + 1]["text"].strip()
            if inputLine and targetLine:
                qa_pairs.append([inputLine,targetLine])
    return qa_pairs

In [4]:
# # Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

# delimiter = '\t'
# # Unescape the delimiter
# delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# # Initialize lines dict, conversations list, and field ids
# lines = {}
# conversations = []
# MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
# MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# # Load lines and process conversations
# print("\nProcessing corpus...")
# lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
# print("\nLoading conversations...")
# conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
#                                   lines, MOVIE_CONVERSATIONS_FIELDS)

# # Write new csv file
# print("\nWriting newly formatted file...")
# with open(datafile, 'w', encoding='utf-8') as outputfile:
#     writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
#     for pair in extractSentencePairs(conversations):
#         writer.writerow(pair)

# # Print a sample of lines
# print("\nSample lines from file:")
# printLines(datafile)

In [5]:
PAD_token = 0
SOS_token = 1 # start of sentence
EOS_token = 2

class Voc:
    def __init__(self,name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD",SOS_token:"SOS",EOS_token:"EOS"}
        self.num_words = 3
    def addSentence(self,sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    def addWord(self,word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [6]:
MAX_LENGTH = 10
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
    )
def normalizeString(s):
    # print(s)
    s = unicodeToAscii(s.lower().strip())
    # print(s)
    s = re.sub(r"([.!?])", r" \1", s)
    # print(s)
    s = re.sub(r"[^a-zA-Z\.!\?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s
def readVocs(datafile,corpus_name):
    print("Reading lines...")
    lines = open(datafile,encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    # 创建一个新的voc和一组对话pair
    return voc,pairs
def fileterPair(p):
    # 根据单词长度滤
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
def filterPairs(pairs):
    return [pair for pair in pairs if fileterPair(pair)]

# 整合
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


In [7]:
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [8]:
MIN_COUNT = 3
def trimRareWords(voc,pairs,MIN_COUNT):
    voc.trim(MIN_COUNT)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_outpus = True
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_outpus = False
                break
        if keep_input and keep_outpus:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs

In [9]:
pairs = trimRareWords(voc,pairs,MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total


In [10]:
def indexesFromSentence(voc,sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
def zeroPadding(l,fillvalue=PAD_token):
    return list(itertools.zip_longest(*l,fillvalue=fillvalue))
def binaryMatrix(l,value=PAD_token):
    m = []
    for i,seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [11]:
def inputVar(l,voc):
    indexes_batch = [indexesFromSentence(voc,sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar,lengths
def outputVar(l,voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len
# 整合
def batch2TrainData(voc,pair_batch):
    pair_batch.sort(key = lambda x:len(x[0].split(" ")),reverse=True)# 按单词数降序
    input_batch,output_batch = [],[]
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp,lengths = inputVar(input_batch,voc)
    output,mask,max_target_len = outputVar(output_batch,voc)
    return inp,lengths,output,mask,max_target_len


In [12]:
small_batch_size = 5
batches = batch2TrainData(voc,[random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)

In [13]:
class EncoderRNN(nn.Module):
    def __init__(self,hidden_size,embedding,n_layers=1,dropout=0):
        super(EncoderRNN,self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size,hidden_size,n_layers,dropout = (0 if n_layers == 1 else dropout),bidirectional=True)
    def forward(self,input_seq,input_lengths,hidden_size):
        embedded = self.embedding(input_seq)
        # 把padding pack掉,输入三维矩阵和长度
        packed = nn.utils.rnn.pack_padded_sequence(embedded,input_lengths)
        outputs,hidden = self.gru(packed)
        # 只需要处理output,因为output包含时序
        outputs,_ = nn.utils.rnn.pad_packed_sequence(outputs)
        # 将双向相加,内置的处理是dim=2拼接
        outputs = outputs[:,:,:self.hidden_size] + outputs[:,:,self.hidden_size:]
        return outputs,hidden

![scores](https://pytorch.org/tutorials/_images/scores.png)

In [14]:
# luong
class Attn(nn.Module):
    # method: dot,general,concat
    def __init__(self,method,hidden_size):
        super(Attn,self).__init__()
        self.method = method
        if self.method not in ['dot','general','concat']:
            raise ValueError(self.method," is not method!")
        self.hidden_size = hidden_size
        if self.method == 'general':
            # 内含训练参数W
            self.attn = nn.Linear(self.hidden_size,hidden_size)
        elif self.method == 'concat':
            # 内含训练参数W
            self.attn = nn.Linear(self.hidden_size * 2,hidden_size)
            # 将v加入训练参数
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))
    def dot_score(self,hidden,encoder_output):
        # 广播乘法再相加,形成内积
        return torch.sum(hidden * encoder_output,dim=2)
    def general_score(self,hidden,encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy,dim=2)
    def concat_score(self,hidden,encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0),-1,-1),encoder_output,2))).tanh()
        return torch.sum(self.v * energy,dim=2)
    def forward(self,hidden,encoder_output):
        if self.method == 'general':
            attn_energies = self.general_score(hidden,encoder_output)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden,encoder_output)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden,encoder_output)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies,dim=1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self,attn_model,embedding,hidden_size,output_size,n_layers=1,dropout=0.1):
        super(LuongAttnDecoderRNN,self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2,hidden_size)
        self.out = nn.Linear(hidden_size,output_size)
        self.attn = Attn(attn_model,hidden_size)
    def forward(self,input_step,last_hidden,encoder_outputs):
        # 先过word embedding和dropout
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # gru过一下
        rnn_output,hidden = self.gru(embedded,last_hidden)
        # 根据输出和encoder算一下attention
        attn_weights = self.attn(rnn_output,encoder_outputs)
        # bmm: batch matmul,批量的矩阵乘法,计算一批context矢量
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        # 一个时间过gru,可以挤压
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        # 拼接后进行Linear再tanh,是luong给出的方案(见计算公式)
        concat_input = torch.cat((rnn_output,context),1)
        concat_output = torch.tanh(self.concat(concat_input))
        # 过一下linear
        output = self.out(concat_output)
        # 过一下softmax
        output = F.softmax(output,dim=1)
        return output,hidden

In [None]:
def maskNLLLoss(inp,target,mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp,1,target.view(-1,1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss,nTotal.item()

In [19]:
def train(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,embedding,encoder_optimizer,decoder_optimizer,batch_size,clip,max_length=MAX_LENGTH):
    # 去掉梯度
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # packing的length参数放cpu
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []
    n_totals = 0
    # encoder登场
    encoder_outputs,encoder_hidden = encoder(input_variable,lengths)

    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        for t in range(max_target_len):
            # decoder登场
            decoder_output,decoder_hidden = decoder(
                decoder_input,decoder_hidden,encoder_outputs
            )
            # 新的input
            decoder_input = target_variable[t].view(1,-1)
            mask_loss,nTotal = maskNLLLoss(decoder_output,target_variable[t],mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output,decoder_hidden = decoder(
                decoder_input,decoder_hidden,encoder_outputs
                )
            # 取出最大概率的一组
            _,topi = decoderoutput.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder.to(device)
            mask_loss,nTotal = maskNLLLoss(decoder_output,target_variable[t],mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item * nTotal)
            n_totals += nTotal
    loss.backward()
    _ = nn.utils.clip_grad_norm_(encoder.parameters(),clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters,clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses) / n_totals

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 39)