In [4]:
!pip install -q tensorflow


In [5]:
import tensorflow as tf
import pathlib

In [6]:
# Fetch the English-French sentence pairs (dataset provided by Anki:
# https://www.manythings.org/anki/) with extraction enabled, then look for
# fra.txt in the directory that contains the path get_file() returns.
text_file = pathlib.Path(
    tf.keras.utils.get_file(
        fname="fra-eng.zip",
        origin="http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
        extract=True,
    )
).parent.joinpath("fra.txt")
# display the resolved location of fra.txt
print(text_file)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
[1m3423204/3423204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
/Users/goldyrana/.keras/datasets/fra.txt


### Lesson 2: Data Normalization

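French letters carry accents, and Unicode can encode the same accented letter in more than one way (for example, "é" as a single code point, or as "e" followed by a combining accent). Therefore, you will normalize each string to NFKC (compatibility decomposition followed by canonical composition), so that text that looks the same is also represented the same.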

In [7]:
import pickle
import random
import re
import unicodedata

# Re-derive the location of fra.txt from `text_file` (set by the download cell);
# if it already points at fra.txt, the resulting path is the same.
text_file = pathlib.Path(text_file).parent.joinpath("fra.txt")

def normalize(line):
    """Normalize one tab-separated sentence pair and split it in two.

    The line is stripped, lower-cased and converted to Unicode NFKC. Punctuation
    that sits at a word boundary (start/end of the line, or next to whitespace)
    is then separated from the neighbouring word by exactly one space, so that
    a later whitespace split yields punctuation as its own token. Punctuation
    inside a word (e.g. the apostrophe in "m'a") is left alone.

    Args:
        line: A string of the form "<english>\\t<french>", optionally with
            surrounding whitespace such as a trailing newline.

    Returns:
        A tuple ``(eng, fra)`` where ``fra`` is wrapped as
        ``"[start] " + fra + " [end]"``.

    Raises:
        ValueError: If the line does not contain exactly one tab character.
    """
    line = unicodedata.normalize("NFKC", line.strip().lower())
    # Leading punctuation glued to the next character: add a space after it.
    line = re.sub(r"^([^\s\w])(?!\s|$)", r"\1 ", line)
    # Punctuation that follows whitespace and is glued to the next character:
    # add a space after it (but never a trailing space at the end of the line).
    line = re.sub(r"(\s[^\s\w])(?!\s|$)", r"\1 ", line)
    # Trailing punctuation glued to the previous character: add a space before
    # it. The lookbehind skips punctuation that is already preceded by
    # whitespace, so no doubled space is produced.
    line = re.sub(r"(?<=\S)([^\s\w])$", r" \1", line)
    # Punctuation that precedes whitespace and is glued to the previous
    # character: add a space before it (same lookbehind guard as above).
    line = re.sub(r"(?<=\S)([^\s\w]\s)", r" \1", line)
    eng, fra = line.split("\t")
    fra = "[start] " + fra + " [end]"
    return eng, fra

# normalize each line and separate into English and French
# The corpus contains accented characters, so the encoding is named explicitly
# instead of relying on the platform's default text encoding.
with open(text_file, encoding="utf-8") as fp:
    text_pairs = [normalize(line) for line in fp]

# print some samples
for _ in range(5):
    print(random.choice(text_pairs))

# save the normalized (english, french) pairs to text_pairs.pickle
with open("text_pairs.pickle", "wb") as fp:
    pickle.dump(text_pairs, fp)

('just looking at her , you can tell that she likes you .', "[start] juste en la regardant , vous pouvez voir qu'elle vous aime . [end]")
('that made me cry .', "[start] ça m'a fait pleurer . [end]")
('i want to cut down on the time it takes to process records .', '[start] je veux réduire le temps que prend le traitement des dossiers . [end]')
('she can count from one to ten .', '[start] elle sait compter de un à dix . [end]')
('i just have to do something .', "[start] il me faut faire quelque chose , un point c'est tout . [end]")
