In [1]:
import urllib3
import zipfile
import shutil
import os
import pandas as pd
import numpy as np

In [3]:
# Download the English-French sentence-pair archive and unpack it into the
# current working directory.
urllib = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

# preload_content=False lets the body be streamed to disk by copyfileobj.
with urllib.request('GET', url, preload_content=False) as r:
  # Fail fast on an HTTP error: otherwise the error page would be saved as
  # 'fra-eng.zip' and only surface later as an opaque BadZipFile.
  if r.status != 200:
    raise RuntimeError(
      'Download of {} failed with HTTP status {}'.format(url, r.status)
    )
  with open(zipfilename, 'wb') as out_file:
    shutil.copyfileobj(r, out_file)

# Extract every member of the archive next to the downloaded zip file.
with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
  zip_ref.extractall(path)

In [36]:
# Load the tab-separated corpus file; the three fields of each row are named
# 'src', 'tar' and 'CC'.
lines = pd.read_csv(
  'fra.txt',
  sep='\t',
  names=['src', 'tar', 'CC'],
)
lines.head(3)

Unnamed: 0,src,tar,CC
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Bouge !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [37]:
# Keep only the columns from 'src' through 'tar' and restrict the corpus to
# the first 10,000 sentence pairs.
lines = lines.loc[:, 'src':'tar']
# .copy() makes the subset an independent frame, so assigning to its columns
# afterwards does not operate on a slice of the full table (which triggers
# pandas' SettingWithCopyWarning).
lines = lines[0:10000].copy()
lines.sample(10)

Unnamed: 0,src,tar
287,Hang on!,Attends un peu !
6156,Was it funny?,Était-ce drôle ?
9071,We're patient.,Nous sommes patients.
6337,You amuse me.,Tu m'amuses.
5503,Let Tom know.,Informe Tom.
1290,Let it go.,Abandonne !
7975,It's terrible.,C'est effrayant.
2610,Who is Tom?,"C’est qui, Tom ?"
5834,They saw you.,Elles t'ont vu.
8382,That was fast.,Ça a été rapide.


In [38]:
# Show the target-sentence column.
lines['tar']

0                    Va !
1                 Marche.
2                 Bouge !
3                 Salut !
4                  Salut.
              ...        
9995    Il était patient.
9996      Il fut parfait.
9997    Il a été parfait.
9998    Il était parfait.
9999     Il fut paralysé.
Name: tar, Length: 10000, dtype: object

In [39]:
# Wrap every target sentence as '\t ' + sentence + ' \n', so that the tab and
# the newline act as start and end markers of the sequence.
# Only rows that do not already begin with a tab are wrapped, which makes the
# cell safe to re-run: running it twice no longer produces doubled markers.
# NOTE(review): assumes no raw sentence starts with a tab (tab is the field
# separator used when the file is read) - confirm.
needs_markers = ~lines['tar'].str.startswith('\t', na=False)
lines.loc[needs_markers, 'tar'] = '\t ' + lines.loc[needs_markers, 'tar'] + ' \n'
lines.sample(10)

Unnamed: 0,src,tar
8969,We must leave.,\t Il nous faut partir. \n
4535,Don't be mad.,\t Ne fais pas la tête. \n
160,Be fair.,\t Sois sincère. \n
7712,I'm not crazy.,\t Je ne suis pas fou. \n
3813,That's wise.,\t C'est sage. \n
1289,Let it go.,\t Laisse tomber. \n
1679,Be careful.,\t Fais attention. \n
8392,That's a copy.,\t C'est une copie. \n
9581,Cows eat grass.,\t Les vaches se nourrissent d'herbe. \n
1956,I hope not.,\t Je n'espère pas. \n


In [40]:
# Tokenize at the character level -> reduces size.
# Collect every distinct character that occurs in the source sentences.
src_vocab = {char for sentence in lines.src for char in sentence}

In [None]:
# Display the set of distinct source characters.
src_vocab

In [41]:
# Collect every distinct character that occurs in the target sentences
# (these strings include the '\t' / '\n' markers added earlier).
tar_vocab = set()
for sentence in lines.tar:
  tar_vocab.update(sentence)

In [None]:
# Display the set of distinct target characters.
tar_vocab

In [42]:
# Vocabulary sizes are the number of distinct characters plus one extra slot.
# NOTE(review): the extra slot presumably accounts for index 0 (e.g. padding),
# since the character-to-index maps in this notebook start at 1 - confirm.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)

print(src_vocab_size, tar_vocab_size, sep='\n')

72
93


In [43]:
# Turn each character set into a sorted list so that every character gets a
# stable position.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)

In [44]:
# Map each character to its 1-based position in the sorted vocabulary.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}

print(tar_to_index)

{'\t': 1, '\n': 2, ' ': 3, '!': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'Y': 44, 'a': 45, 'b': 46, 'c': 47, 'd': 48, 'e': 49, 'f': 50, 'g': 51, 'h': 52, 'i': 53, 'j': 54, 'k': 55, 'l': 56, 'm': 57, 'n': 58, 'o': 59, 'p': 60, 'q': 61, 'r': 62, 's': 63, 't': 64, 'u': 65, 'v': 66, 'w': 67, 'x': 68, 'y': 69, 'z': 70, '\xa0': 71, '«': 72, '»': 73, 'À': 74, 'Ç': 75, 'É': 76, 'Ê': 77, 'à': 78, 'â': 79, 'ç': 80, 'è': 81, 'é': 82, 'ê': 83, 'î': 84, 'ï': 85, 'ô': 86, 'ù': 87, 'û': 88, 'œ': 89, '\u2009': 90, '’': 91, '\u202f': 92}


In [None]:
# Encode every source sentence as the list of its characters' indices.
encoder_input = [
  [src_to_index[char] for char in sentence]
  for sentence in lines.src
]

encoder_input

In [None]:
# Encode every target sentence (including its '\t' and '\n' markers) as the
# list of its characters' indices.
decoder_input = [
  [tar_to_index[char] for char in sentence]
  for sentence in lines.tar
]

decoder_input

In [47]:
# Same encoding as the decoder input, but with the first character of each
# target sentence dropped, so each sequence is shifted one step to the left.
decoder_target = [
  [tar_to_index[char] for char in sentence[1:]]
  for sentence in lines.tar
]

In [48]:
# Preview the first ten encoded target sequences.
print(decoder_target[:10])

[[3, 43, 45, 3, 4, 3, 2], [3, 34, 45, 62, 47, 52, 49, 12, 3, 2], [3, 23, 59, 65, 51, 49, 3, 4, 3, 2], [3, 40, 45, 56, 65, 64, 3, 4, 3, 2], [3, 40, 45, 56, 65, 64, 12, 3, 2], [3, 24, 59, 65, 62, 63, 92, 4, 3, 2], [3, 24, 59, 65, 62, 49, 70, 92, 4, 3, 2], [3, 37, 62, 49, 58, 49, 70, 3, 66, 59, 63, 3, 54, 45, 57, 46, 49, 63, 3, 78, 3, 66, 59, 63, 3, 47, 59, 65, 63, 3, 4, 3, 2], [3, 27, 53, 56, 49, 3, 4, 3, 2], [3, 27, 53, 56, 49, 70, 3, 4, 3, 2]]
