## Here, we generate the input data for the neural network
* Word-level tokenize the train, dev and test sets with sacremoses
* Apply byte-pair encoding (BPE) to all files

In [2]:
import pandas as pd
import csv
import os

In [None]:
source_file_train= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_train.txt")
target_file_train= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_train.txt")
tok_source_file_train = source_file_train+".tok"
tok_target_file_train=target_file_train+".tok"

source_file_test= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_test.txt")
target_file_test= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_test.txt")
tok_source_file_test = source_file_test+".tok"
tok_target_file_test=target_file_test+".tok"

source_file_dev= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_dev.txt")
target_file_dev= os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_dev.txt")
tok_source_file_dev = source_file_dev+".tok"
tok_target_file_dev=target_file_dev+".tok"

! sacremoses -l "en" -j 8 tokenize < $source_file_train > $tok_source_file_train
! sacremoses -l "ro" -j 8 tokenize < $target_file_train > $tok_target_file_train

! sacremoses -l "en" -j 8 tokenize < $source_file_test > $tok_source_file_test
! sacremoses -l "ro" -j 8 tokenize < $target_file_test > $tok_target_file_test

! sacremoses -l "en" -j 8 tokenize < $source_file_dev > $tok_source_file_dev
! sacremoses -l "ro" -j 8 tokenize < $target_file_dev > $tok_target_file_dev

In [None]:
source_train_basic= os.path.join(os.pardir,"/data/02-preprocessed/basic_cleaned/basic_source_train.txt")
target_train_basic= os.path.join(os.pardir,"/data/02-preprocessed/basic_cleaned/basic_target_train.txt")

tok_source_train_basic = source_train_basic+".tok"
tok_target_train_basic = target_train_basic+".tok"

! sacremoses -l "en" -j 8 tokenize < $source_train_basic > $tok_source_train_basic
! sacremoses -l "ro" -j 8 tokenize < $target_train_basic > $tok_target_train_basic

## Learn and apply subword tokenization with subword-nmt, an implementation of byte-pair encoding (BPE) for subword splitting

In [3]:
#this bpe_size is recommended for small to medium sized datasets (30K-1.3M)
bpe_size=8000

tok_source_train_basic=os.path.join(os.pardir,"/data/02-preprocessed/basic_cleaned/basic_source_train.txt.tok")
tok_target_train_basic=os.path.join(os.pardir,"/data/02-preprocessed/basic_cleaned/basic_target_train.txt.tok")

tok_source_train_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_train.txt.tok")
tok_target_train_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_train.txt.tok")

tok_source_dev_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_dev.txt.tok")
tok_target_dev_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_dev.txt.tok")

tok_source_test_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_source_test.txt.tok")
tok_target_test_bicleaner=os.path.join(os.pardir,"/data/02-preprocessed/bicleaner_cleaned/bicleaner_target_test.txt.tok")

#learn the vocab from the bigger training files resulted after basic cleaning
! subword-nmt learn-joint-bpe-and-vocab --input $tok_source_train_basic $tok_target_train_basic -s $bpe_size -o bpe.codes.$bpe_size --write-vocabulary vocab.en vocab.ro

#apply BPE
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $tok_source_train_basic > tok_train_basic.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $tok_target_train_basic > tok_train_basic.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $tok_source_train_bicleaner > tok_train_bicleaner.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $tok_target_train_bicleaner > tok_train_bicleaner.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $tok_source_dev_bicleaner > tok_dev_bicleaner.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $tok_target_dev_bicleaner > tok_dev_bicleaner.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $tok_source_test_bicleaner > tok_test_bicleaner.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $tok_target_test_bicleaner > tok_test_bicleaner.bpe.ro

In [4]:
#! wget https://raw.githubusercontent.com/joeynmt/joeynmt/master/scripts/build_vocab.py

! python build_vocab.py tok_source_train_basic.bpe.en tok_source_train_basic.bpe.ro --output_path vocab.txt


In [7]:
# Sanity check: run the sacremoses English tokenizer on a single sample sentence.
from sacremoses import MosesTokenizer, MosesDetokenizer
mt = MosesTokenizer(lang='en')
# Sample sentence containing parentheses, square brackets and a long run of
# spaces before the final period.
text = u'In order to comply with the obligation to provide effective non-discriminatory access to existing infrastructures, policies or procedures within the meaning of A rticle 4 (1) (a), Member States may, if necessary, have an additional period of 10 years [from the deadline for transposition] to comply with that obligation                                                      .'

# return_str=True requests the tokenized result as one string.
tokenized_text = mt.tokenize(text, return_str=True)

In [8]:
# Display the tokenized sample sentence as the cell output.
tokenized_text

'In order to comply with the obligation to provide effective non-discriminatory access to existing infrastructures , policies or procedures within the meaning of A rticle 4 ( 1 ) ( a ) , Member States may , if necessary , have an additional period of 10 years &#91; from the deadline for transposition &#93; to comply with that obligation .'