In [1]:
!pip install torchtext --upgrade
!python -m spacy download fr
!python -m spacy download en

Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.5.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/fr
You can now load the model via spacy.load('fr')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
import os
import tqdm
import random
import numpy as np

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Example, Field, Dataset

In [0]:
# --- Reproducibility -------------------------------------------------------
# One seed shared by every random number generator used in this notebook.
SEED = 781

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# --- Compute device --------------------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
if not os.path.exists('./data'):
    !mkdir './data'

!wget --no-check-certificate \
    http://www.statmt.org/europarl/v7/fr-en.tgz \
    -O './data/fr-en.tgz'

--2020-02-14 20:23:34--  http://www.statmt.org/europarl/v7/fr-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 202718517 (193M) [application/x-gzip]
Saving to: ‘./data/fr-en.tgz’


2020-02-14 20:27:12 (909 KB/s) - ‘./data/fr-en.tgz’ saved [202718517/202718517]



In [5]:
!tar -xzvf ./data/fr-en.tgz -C ./data/

europarl-v7.fr-en.en
europarl-v7.fr-en.fr


In [0]:
def read_file(filepath):
    """Read a UTF-8 text file and return its content as a list of lines.

    The whole file is read at once, leading and trailing whitespace of the
    entire content is stripped, and the result is split on the newline
    character. Blank lines in the middle of the file are preserved as empty
    strings; an empty (or whitespace-only) file yields ``['']``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of strings, one per line, without the newline characters.

    Raises:
        NotImplementedError: If ``filepath`` does not exist. The exception
            type is kept for backward compatibility; the underlying
            ``FileNotFoundError`` is attached as ``__cause__``.

    Any other failure (for example a ``UnicodeDecodeError`` or a permission
    error) propagates unchanged instead of being misreported as a missing
    file.
    """
    try:
        with open(filepath, mode='rt', encoding='utf-8') as file:
            content = file.read().strip().split('\n')
    except FileNotFoundError as err:
        # Only a genuinely missing file is reported as "doesn't exist".
        raise NotImplementedError(f'File {filepath} doesn\'t exist') from err
    return content

In [6]:
%%time
pairs = [*zip(read_file('./data/europarl-v7.fr-en.fr'),
             read_file('./data/europarl-v7.fr-en.en'))]
pairs = [*map(lambda x: {'fr': x[0], 'en': x[1]}, pairs)]
print('Number of examples:', len(pairs))
pairs = np.random.choice(pairs, size=30000, replace=False)
print('Number of examples after sampling:', len(pairs))
print('Example:', pairs[0])

Number of examples: 2007723
Number of examples after sampling: 30000
Example: {'fr': "Lors de la conférence des donateurs qui a eu lieu dans le cadre du pacte de stabilité, M. Patten a promis des projets précis et montré que la Commission pouvait être active sur place par des mesures d'aide concrètes.", 'en': 'At the donor conference on the Stability Pact, Mr Patten approved certain projects and made it clear that the Commission can provide practical measures of assistance on the ground.'}
CPU times: user 5.96 s, sys: 865 ms, total: 6.83 s
Wall time: 6.84 s


In [10]:
%%time
FR = Field(init_token='<soe>',
           eos_token='<eos>',
           pad_token='<pad>',
           unk_token='<unk>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='fr')
EN = Field(init_token='<soe>',
           eos_token='<eos>',
           pad_token='<pad>',
           unk_token='<unk>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='en')

examples = [Example.fromdict(data=pair, fields={'fr': ('src', FR),
                                                'en': ('dest', EN)})
            for pair in pairs]
data = Dataset(examples, fields={'src': FR, 'dest': EN})
train, valid, test = data.split(split_ratio=[0.7, 0.2, 0.1])
print('train size:', len(train.examples))
print('valid size:', len(valid.examples))
print('test size:', len(test.examples))
print(vars(train.examples[0]))

train size: 21000
valid size: 3000
test size: 6000
{'src': ['\xa0\xa0 ', '.', '-', 'monsieur', 'le', 'président', ',', 'l’', 'iran', 'développe', 'un', 'programme', 'nucléaire', 'dont', 'il', 'n’', 'a', 'pas', 'besoin', ',', 'puisqu’', 'il', 'est', 'assis', 'sur', 'une', 'mer', 'de', 'pétrole', '.'], 'dest': ['\xa0\xa0 ', '.', 'mr', 'president', ',', 'iran', 'is', 'developing', 'a', 'nuclear', 'programme', 'that', 'it', 'does', 'not', 'need', 'since', 'it', 'is', 'sitting', 'on', 'a', 'sea', 'of', 'oil', '.']}
CPU times: user 46.9 s, sys: 27.6 ms, total: 46.9 s
Wall time: 47 s


In [0]:
FR