# Translation with a seq2seq network and attention

https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html







In [None]:
[KEY: > input, = target, < output]

> il est en train de peindre un tableau .
= he is painting a picture .
< he is painting a picture .

> pourquoi ne pas essayer ce vin delicieux ?
= why not try that delicious wine ?
< why not try that delicious wine ?

> elle n est pas poete mais romanciere .
= she is not a poet but a novelist .
< she not not a poet but a novelist .

> vous etes trop maigre .
= you re too skinny .
< you re all alone .

In [None]:
# Requirements
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the CUDA device when torch reports one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Notebook shell escape: download data.zip into the current working directory.
! wget 'https://download.pytorch.org/tutorial/data.zip'

In [None]:
# Extract data.zip with the shell `unzip` command (notebook shell escape).
# NOTE(review): `zipfile` is imported here but not used in this cell — confirm whether it is still needed.
import zipfile
!unzip data.zip
print("done")

In [None]:
# Indices reserved for the start-of-sentence and end-of-sentence markers.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Attributes:
        name: label given to this vocabulary.
        word2index: maps each known word to its integer index.
        word2count: maps each known word to how many times it was added.
        index2word: inverse of word2index, pre-seeded with the SOS/EOS markers.
        n_words: number of entries in index2word (special markers included);
            it is also the index the next new word will receive.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        # The two markers occupy indices 0 and 1, so real words start at 2.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Add every word of `sentence`, splitting on single spaces."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register `word`: assign a new index on first sight, else bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    # Backward-compatible alias: the method was originally published under the
    # misspelled name `addWork`.
    addWork = addWord


In [None]:
# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Normalize a raw sentence into space-separated lowercase tokens.

    Steps: lowercase and trim `s`, pass it through `unicodeToAscii`
    (defined elsewhere — presumably removes accents; confirm), insert a space
    before each '.', '!' or '?', replace every run of characters other than
    ASCII letters and '.', '!', '?' with a single space, then trim again.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitution above can leave a space at either end (e.g. "(hi)" ->
    # " hi "), which would yield empty tokens when the sentence is split on ' '
    # and would defeat prefix checks on the sentence start.
    return s.strip()

In [None]:
# read the data and split it 
def readLangs(lang1, lang2, reverse=False):
    """Read 'data/<lang1>-<lang2>.txt' and return two vocabularies plus the pairs.

    Each line of the file is split on tabs and every field is passed through
    normalizeString.

    Args:
        lang1: first language name; also the first part of the file name.
        lang2: second language name; also the second part of the file name.
        reverse: when true, reverse the order of the fields in every pair and
            swap which language is the input and which is the output.

    Returns:
        (input_lang, output_lang, pairs): two Lang objects created from the
        language names only (no sentences are added to them here) and a list
        of lists of normalized strings.
    """
    print('Reading lines ...')

    # Read the whole file and split it into lines; the `with` block makes sure
    # the file handle is closed instead of being left to the garbage collector.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each one.
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

In [None]:
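# Reduce the data: keep only pairs in which both sentences have fewer than MAX_LENGTH (10) words
# and the English sentence starts with one of eng_prefixes.
# p[0] is the French sentence and p[1] is the English sentence (the pairs are loaded with reverse=True).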
# Exclusive upper bound on the number of space-separated words per sentence.
MAX_LENGTH = 10

# Sentence openings accepted by the filter, one prefix per line.
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)
def filterPair(p):
    """Return True when pair `p` passes the length and prefix filters.

    Both p[0] and p[1] must split (on single spaces) into fewer than
    MAX_LENGTH words, and p[1] must start with one of eng_prefixes.
    """
    # Guard clauses are checked in order: p[0] length, p[1] length, then prefix.
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return a new list holding only the pairs for which filterPair is true."""
    return list(filter(filterPair, pairs))


## Prepare the Data 

In [None]:
def prepareData(lang1, lang2, reverse = False):
    """Load the sentence pairs, filter them and build both vocabularies.

    Args:
        lang1, lang2, reverse: forwarded unchanged to readLangs.

    Returns:
        (input_lang, output_lang, pairs): the two Lang objects filled from the
        kept pairs, and the filtered list of pairs.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print('Read %s sentence pairs'% len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    # Element 0 of each pair feeds the input vocabulary, element 1 the output one.
    print("Counting words...")
    for sentence_pair in kept_pairs:
        input_lang.addSentence(sentence_pair[0])
        output_lang.addSentence(sentence_pair[1])

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, kept_pairs


In [None]:
# Build the vocabularies and filtered pairs from data/eng-fra.txt with the pair
# order reversed, then show one randomly chosen pair as a sanity check.
input_lang, output_lang, pairs = prepareData('eng', 'fra', reverse=True)
sample_pair = random.choice(pairs)
print(sample_pair)

## Seq2Seq model

## The encoder 

In [None]:
class EncoderRNN(nn.Module):
    """Embedding lookup followed by a GRU, applied to one input index per call.

    Args:
        input_size: number of entries in the embedding table.
        hidden_size: width of the embedding and of the GRU input and hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        # The embedding width equals hidden_size so its output feeds the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, reshape it to (1, 1, -1) and run it through the GRU.

        Returns:
            The (output, hidden) tuple produced by the GRU.
        """
        gru_input = self.embedding(input).view(1, 1, -1)
        return self.gru(gru_input, hidden)

    def initHidden(self):
        """Return an all-zero tensor of shape (1, 1, hidden_size) on `device`."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)