In [2]:
import tensorflow as tf

text_file = tf.keras.utils.get_file(
    fname="fra-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
    extract=False
)
print(text_file)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
[1m3423204/3423204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
/root/.keras/datasets/fra-eng.zip


In [3]:
import zipfile, os

with zipfile.ZipFile(text_file, "r") as z:
    z.extractall(os.path.dirname(text_file))

In [4]:
from pathlib import Path
import random

text_file = Path(text_file).parent / "fra.txt"

with open(text_file, encoding="utf-8") as f:
    text_pairs = f.readlines()

for _ in range(5):
    print(random.choice(text_pairs))

Don't you think we deserve some answers?	Ne pensez-vous pas que nous méritions quelques réponses ?

Have you ever eaten at that restaurant?	Avez-vous jamais mangé dans ce restaurant ?

That hit the spot.	C'était parfait.

I slammed the door shut.	J'ai claqué la porte.

She forced him to sit down.	Elle le contraignit à s'asseoir.



In [5]:
import unicodedata, re

def normalize(line):
  line = unicodedata.normalize("NFKC",line.strip().lower())
  line = re.sub(r"^([^\w])(?!\s)",r"\1",line)
  line = re.sub(r"(\s[^\w])(?!\s)",r"\1",line)
  line = re.sub(r"(?!\s)([^\w])$",r"\1",line)
  line = re.sub(r"(?!\s)([^\w]\s)",r"\1",line)
  eng, fre = line.split("\t")
  fre = '[start]' + fre + '[end]'
  return eng, fre

In [6]:
with open(text_file) as fp :
  test_pairs = [normalize(line) for line in fp]

In [7]:
for _ in range(5):
  print(random.choice(test_pairs))

('where did you put my keys?', '[start]où avez-vous mis mes clés ?[end]')
("tom didn't tell mary why he was late.", "[start]tom n'a pas dit à mary pourquoi il était en retard.[end]")
('it was kind of fun.', "[start]c'était en quelque sorte amusant.[end]")
('have a good weekend!', '[start]passe un bon week-end ![end]')
("we didn't need to do that.", "[start]nous n'avons pas eu besoin de faire ça.[end]")


In [8]:
eng_tokens, fre_tokens = set(), set()
eng_maxlen, fre_maxlen = 0, 0
for eng, fre in test_pairs:
  eng_token, fre_token = eng.split(), fre.split()
  eng_maxlen = max(eng_maxlen, len(eng_token))
  fre_maxlen = max(fre_maxlen, len(fre_token))
  eng_tokens.update(eng_token)
  fre_tokens.update(fre_token)

print(f"Total token in english is : {len(eng_tokens)}")
print(f"Total token in french is : {len(fre_tokens)}")
print(f"Maximum length of line is : {eng_maxlen}")
print(f"Maximum length of line is : {fre_maxlen}")

Total token in english is : 25365
Total token in french is : 44581
Maximum length of line is : 47
Maximum length of line is : 54


In [9]:
import pickle
with open("test_pairs.pickle","wb") as fp:
  pickle.dump(test_pairs,fp)

In [1]:
# embedding layer
# positional encoding
# attention model

In [10]:
from tensorflow.keras.layers import TextVectorization
import pickle

with open("test_pairs.pickle",'rb') as fp:
  text_pairs = pickle.load(fp)

In [12]:
random.shuffle(text_pairs)

In [32]:
n_val = int(.15*len(text_pairs))
n_train = len(text_pairs) - 2*n_val
train_pair = text_pairs[:n_train]
test_pair = text_pairs[n_train: n_train + n_val]

In [33]:
vacab_en = 10000
vocab_fr = 20000
seq_len = 25

In [34]:
eng_vec = TextVectorization(
    max_tokens=vacab_en,
    standardize = None,
    split = 'whitespace',
    output_mode = 'int',
    output_sequence_length = seq_len
)


In [35]:
fr_vec = TextVectorization(
    max_tokens=vocab_fr,
    standardize = None,
    split = 'whitespace',
    output_mode = 'int',
    output_sequence_length = seq_len
)

In [36]:
train_eng = [pair[0] for pair in train_pair]
train_fr = [pair[1] for pair in train_pair]

In [37]:
eng_vec.adapt(train_eng)
fr_vec.adapt(train_fr)

In [38]:
with open('vectorization.pickle', 'wb') as fp:
  data = {'train': train_pair,
          'test': test_pair,
          'eng_vec': eng_vec.get_config(),
          'fr_vec': fr_vec.get_config(),
          'eng_weights':eng_vec.get_weights(),
          'fr_weights':fr_vec.get_weights()
          }
  pickle.dump(data, fp)

In [39]:
with open('vectorization.pickle', 'rb') as fp:
  data = pickle.load(fp)

In [40]:
train_pair = data['train']
test_pair = data['test']

eng_vec = TextVectorization.from_config(data['eng_vec'])
fr_vec = TextVectorization.from_config(data['fr_vec'])

eng_vec.set_weights(data['eng_weights'])
fr_vec.set_weights(data['fr_weights'])

In [46]:
def format_datasets(eng, fr):
  eng = eng_vec(eng)
  fr = fr_vec(fr)
  source = {
            'encoder_inputs': eng,
            'decoder_inputs': fr[:,:-1]
            }
  target = fr[:,1:]
  return (source, target)

def make_datasets(pairs, batchsize = 64):
  eng_text, fr_text = zip(*pairs)
  dataset = tf.data.Dataset.from_tensor_slices((eng_text), (fr_text))

  return dataset.shuffle(2048).batch(batchsize).map(format_datasets).prefetch(16).cache()

In [None]:
test_pair


In [48]:
train_ds = make_datasets(train_pair)

KeyboardInterrupt: 