# Part 2: seq2seq model with attention for language translation or chatbot?

## some resources
- [online tutorial](https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb) and [code](https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation) from practical pytorch
- MaximumEntropy [seq2seq-pytorch](https://github.com/MaximumEntropy/Seq2Seq-PyTorch)
- IBM [pytorch seq2seq](https://github.com/IBM/pytorch-seq2seq)
- [seq2seq.pytorch](https://github.com/eladhoffer/seq2seq.pytorch)
- [seq2seq with tensorflow tutorials](https://github.com/ematvey/tensorflow-seq2seq-tutorials)
- [seq2seq neural machine translation tutorial](https://github.com/tensorflow/nmt)
- [chatbot based on seq2seq antilm](https://github.com/Marsan-Ma/tf_chatbot_seq2seq_antilm)
- [practical seq2seq for chatbot](http://suriyadeepan.github.io/2016-12-31-practical-seq2seq/)

## datasets
- [Tab-delimited Bilingual Sentence Pairs](http://www.manythings.org/anki/)
- [chat corpus](https://github.com/Marsan-Ma/chat_corpus)

This material might be too long to fit into one notebook, so it is split across several.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import numpy as np
import random

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils import data
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## seq2seq with `attention`

### Load the language data

In [39]:
# Reserved token ids: padding, start-of-sentence, end-of-sentence, unknown word.
PAD, SOS, EOS, UNK = 0, 1, 2, 3

from collections import Counter
import re

# language vocabulary, aka w2i, i2w, and wc
class Language(object):
    """Vocabulary of one language, grown incrementally from raw sentences.

    Attributes:
        name: label given at construction time (e.g. "english").
        tokenizer: regular-expression pattern handed to ``re.findall`` to
            split a lower-cased sentence into tokens.
        w2i: dict mapping token -> integer id; pre-seeded with the four
            special tokens "PAD", "SOS", "EOS" and "UNK".
        i2w: dict mapping integer id -> token (the inverse of ``w2i``).
        wc: ``Counter`` of how many times each token has been seen.
    """

    def __init__(self, name, tokenizer):
        """Create an empty vocabulary that only knows the special tokens.

        Args:
            name: label for this language.
            tokenizer: regular-expression pattern whose matches are the tokens.
        """
        self.name = name  # previously accepted but never stored
        self.tokenizer = tokenizer # regular expression
        self.w2i = {"PAD": PAD, "SOS": SOS, "EOS": EOS, "UNK": UNK} # word to index
        self.i2w = {PAD: "PAD", SOS: "SOS", EOS: "EOS", UNK: "UNK"} # index to word
        self.wc = Counter() # word count

    def update(self, sentences):
        """Tokenize ``sentences`` and add their tokens to the vocabulary.

        Each sentence is lower-cased before tokenizing. A token seen for the
        first time receives the next free id; every occurrence is counted in
        ``wc``.

        Args:
            sentences: iterable of strings.
        """
        for sentence in sentences:
            words = re.findall(self.tokenizer, sentence.lower())
            for word in words:
                if word not in self.w2i:
                    # Use one id for both tables so they cannot drift apart.
                    index = len(self.w2i)
                    self.w2i[word] = index
                    self.i2w[index] = word
            self.wc.update(words)

    def prune(self, percent):
        """Placeholder: currently does nothing and returns None.

        TODO: not implemented yet; the intended meaning of ``percent`` is not
        defined anywhere in this notebook.
        """
        pass
    
# English tokens are runs of word characters or any single non-space symbol.
# Both backslashes are escaped so the literal contains no invalid "\S" escape;
# the resulting pattern is unchanged.
English = Language("english", u"\\w+|\\S")
# Chinese is tokenized one character at a time ("." does not match a newline).
Chinese = Language("chinese", u".")

In [49]:
# Load the tab-separated bilingual corpus: each line is "<english>\t<chinese>".
# The context manager closes the file, and the encoding is explicit so the
# Chinese text does not depend on the platform's default locale.
with open("/home/mali/ws/data/bilingual/cn2en/cmn.txt", encoding="utf-8") as corpus:
    # Drop the line terminator, keep only the first two columns, and skip
    # lines without a tab so the two-way unpacking below cannot fail on a
    # blank or malformed line.
    pairs = [line.rstrip("\r\n").split("\t")[:2] for line in corpus if "\t" in line]
english_sents, chinese_sents = zip(*pairs)

English.update(english_sents)
Chinese.update(chinese_sents)

In [50]:
# Display the word -> index mapping built from the English sentences.
English.w2i

{'PAD': 0,
 'SOS': 1,
 'EOS': 2,
 'UNK': 3,
 'hello': 4,
 'world': 5,
 'i': 6,
 'like': 7,
 'programming': 8,
 '.': 9,
 'in': 10,
 'the': 11,
 'hi': 12,
 'run': 13,
 'wait': 14,
 '!': 15,
 'try': 16,
 'won': 17,
 'oh': 18,
 'no': 19,
 'cheers': 20,
 'he': 21,
 'ran': 22,
 'hop': 23,
 'lost': 24,
 'quit': 25,
 "'": 26,
 'm': 27,
 'ok': 28,
 'listen': 29,
 'way': 30,
 'really': 31,
 '?': 32,
 'it': 33,
 'we': 34,
 'why': 35,
 'me': 36,
 'ask': 37,
 'tom': 38,
 'be': 39,
 'calm': 40,
 'fair': 41,
 'kind': 42,
 'nice': 43,
 'call': 44,
 'us': 45,
 'come': 46,
 'get': 47,
 'out': 48,
 'go': 49,
 'away': 50,
 'goodbye': 51,
 'hang': 52,
 'on': 53,
 'came': 54,
 'runs': 55,
 'help': 56,
 'hold': 57,
 'hug': 58,
 'agree': 59,
 'ill': 60,
 'old': 61,
 's': 62,
 'join': 63,
 'keep': 64,
 'kiss': 65,
 'perfect': 66,
 'see': 67,
 'you': 68,
 'shut': 69,
 'up': 70,
 'skip': 71,
 'take': 72,
 'wake': 73,
 'wash': 74,
 'know': 75,
 'welcome': 76,
 'who': 77,
 'not': 78,
 'back': 79,
 'off': 80,
 'sti

In [51]:
# Display the index -> word mapping (the inverse of w2i).
English.i2w

{0: 'PAD',
 1: 'SOS',
 2: 'EOS',
 3: 'UNK',
 4: 'hello',
 5: 'world',
 6: 'i',
 7: 'like',
 8: 'programming',
 9: '.',
 10: 'in',
 11: 'the',
 12: 'hi',
 13: 'run',
 14: 'wait',
 15: '!',
 16: 'try',
 17: 'won',
 18: 'oh',
 19: 'no',
 20: 'cheers',
 21: 'he',
 22: 'ran',
 23: 'hop',
 24: 'lost',
 25: 'quit',
 26: "'",
 27: 'm',
 28: 'ok',
 29: 'listen',
 30: 'way',
 31: 'really',
 32: '?',
 33: 'it',
 34: 'we',
 35: 'why',
 36: 'me',
 37: 'ask',
 38: 'tom',
 39: 'be',
 40: 'calm',
 41: 'fair',
 42: 'kind',
 43: 'nice',
 44: 'call',
 45: 'us',
 46: 'come',
 47: 'get',
 48: 'out',
 49: 'go',
 50: 'away',
 51: 'goodbye',
 52: 'hang',
 53: 'on',
 54: 'came',
 55: 'runs',
 56: 'help',
 57: 'hold',
 58: 'hug',
 59: 'agree',
 60: 'ill',
 61: 'old',
 62: 's',
 63: 'join',
 64: 'keep',
 65: 'kiss',
 66: 'perfect',
 67: 'see',
 68: 'you',
 69: 'shut',
 70: 'up',
 71: 'skip',
 72: 'take',
 73: 'wake',
 74: 'wash',
 75: 'know',
 76: 'welcome',
 77: 'who',
 78: 'not',
 79: 'back',
 80: 'off',
 81: 

In [52]:
# Display how many times each English token occurred in the corpus.
English.wc

Counter({'hello': 12,
         'world': 95,
         'i': 10702,
         'like': 1246,
         'programming': 2,
         '.': 32850,
         'in': 3035,
         'the': 9445,
         'hi': 10,
         'run': 64,
         'wait': 154,
         '!': 314,
         'try': 112,
         'won': 174,
         'oh': 8,
         'no': 590,
         'cheers': 2,
         'he': 4398,
         'ran': 60,
         'hop': 2,
         'lost': 134,
         'quit': 46,
         "'": 9694,
         'm': 1158,
         'ok': 28,
         'listen': 60,
         'way': 194,
         'really': 324,
         '?': 5392,
         'it': 2852,
         'we': 1778,
         'why': 356,
         'me': 2126,
         'ask': 134,
         'tom': 3058,
         'be': 1186,
         'calm': 10,
         'fair': 10,
         'kind': 80,
         'nice': 110,
         'call': 166,
         'us': 394,
         'come': 542,
         'get': 582,
         'out': 556,
         'go': 978,
         'away': 170,
        

In [54]:
# Number of entries left if only the most frequent half of the English
# tokens is kept.
len(English.wc.most_common(len(English.wc) // 2))

3027

In [55]:
# Number of distinct English tokens seen (special tokens are not counted in wc).
len(English.wc)

6055

In [58]:
# Number of distinct Chinese tokens seen; the "." tokenizer yields one token per character.
len(Chinese.wc)

3361