## Here, we generate the input data for the neural network
* word-level tokenize train, dev and test set with sacremoses
* apply byte-pair encoding (BPE) to all tokenized files with subword-nmt

In [7]:
import pandas as pd
import csv
import os

### Word-level tokenize the L1, L2 and L3 splits

In [3]:
L2_train_en= "../data/DCEP/01-intermediate/L2_strong/L2_train.en"
L2_train_ro= "../data/DCEP/01-intermediate/L2_strong/L2_train.ro"
L2_train_en_tok = L2_train_en+".tok"
L2_train_ro_tok = L2_train_ro+".tok"

L2_test_en= "../data/DCEP/01-intermediate/L2_strong/L2_test.en"
L2_test_ro= "../data/DCEP/01-intermediate/L2_strong/L2_test.ro"
L2_test_en_tok = L2_test_en+".tok"
L2_test_ro_tok=L2_test_ro+".tok"

L2_dev_en= "../data/DCEP/01-intermediate/L2_strong/L2_dev.en"
L2_dev_ro= "../data/DCEP/01-intermediate/L2_strong/L2_dev.ro"
L2_dev_en_tok = L2_dev_en+".tok"
L2_dev_ro_tok = L2_dev_ro+".tok"

! sacremoses -l "en" -j 8 tokenize < $L2_train_en > $L2_train_en_tok
! sacremoses -l "ro" -j 8 tokenize < $L2_train_ro > $L2_train_ro_tok

! sacremoses -l "en" -j 8 tokenize < $L2_test_en > $L2_test_en_tok
! sacremoses -l "ro" -j 8 tokenize < $L2_test_ro > $L2_test_ro_tok

! sacremoses -l "en" -j 8 tokenize < $L2_dev_en > $L2_dev_en_tok
! sacremoses -l "ro" -j 8 tokenize < $L2_dev_ro > $L2_dev_ro_tok

In [4]:
L1_train_en= "../data/DCEP/01-intermediate/L1_basic/L1_train.en"
L1_train_ro= "../data/DCEP/01-intermediate/L1_basic/L1_train.ro"

L1_train_en_tok = L1_train_en+".tok"
L1_train_ro_tok = L1_train_ro+".tok"

! sacremoses -l "en" -j 8 tokenize < $L1_train_en > $L1_train_en_tok
! sacremoses -l "ro" -j 8 tokenize < $L1_train_ro > $L1_train_ro_tok

In [5]:
L3_train_en= "../data/DCEP/01-intermediate/L3_intermediate/L3_train.en"
L3_train_ro= "../data/DCEP/01-intermediate/L3_intermediate/L3_train.ro"

L3_train_en_tok = L3_train_en+".tok"
L3_train_ro_tok = L3_train_ro+".tok"

! sacremoses -l "en" -j 8 tokenize < $L3_train_en > $L3_train_en_tok
! sacremoses -l "ro" -j 8 tokenize < $L3_train_ro > $L3_train_ro_tok

### Learn and apply subword tokenization with subword-nmt, an implementation of byte-pair encoding (BPE) for subword splitting

In [6]:
#this bpe_size is recommended for small to medium sized datasets (30K-1.3M)
bpe_size=8000

#learn the vocab from the bigger training files resulted after basic cleaning
#the problem with this approach is that the L1 dataset contains wrong languages, which makes it possible to have wrong translations
! subword-nmt learn-joint-bpe-and-vocab --input $L1_train_en_tok $L1_train_ro_tok -s $bpe_size -o bpe.codes.$bpe_size --write-vocabulary vocab.en vocab.ro

#apply BPE
#todo:change path such that the bpe files point to 02-preprocessed!
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $L1_train_en_tok > L1_train_tok.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $L1_train_ro_tok > L1_train_tok.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $L3_train_en_tok > L3_train_tok.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $L3_train_ro_tok > L3_train_tok.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $L2_train_en_tok > L2_train_tok.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $L2_train_ro_tok > L2_train_tok.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $L2_dev_en_tok > L2_dev_tok.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $L2_dev_ro_tok > L2_dev_tok.bpe.ro

! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.en --vocabulary-threshold 50 < $L2_test_en_tok > L2_test_tok.bpe.en
! subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.ro --vocabulary-threshold 50 < $L2_test_ro_tok > L2_test_tok.bpe.ro


In [7]:
! wget https://raw.githubusercontent.com/joeynmt/joeynmt/master/scripts/build_vocab.py

! python build_vocab.py L1_train_tok.bpe.en L1_train_tok.bpe.ro --output_path vocab.txt


--2021-08-19 14:30:50--  https://raw.githubusercontent.com/joeynmt/joeynmt/master/scripts/build_vocab.py
Auflösen des Hostnamen »raw.githubusercontent.com (raw.githubusercontent.com)«... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Verbindungsaufbau zu raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... verbunden.
HTTP-Anforderung gesendet, warte auf Antwort... 200 OK
Länge: 2034 (2,0K) [text/plain]
In »»build_vocab.py«« speichern.


2021-08-19 14:30:51 (81,4 MB/s) - »»build_vocab.py«« gespeichert [2034/2034]



In [8]:
#move all files to the right destination: in DCEP/02-preporcessed/
!mv bpe.codes.8000 ../data/DCEP/02-preprocessed/
!mv vocab.en ../data/DCEP/02-preprocessed/
!mv vocab.ro ../data/DCEP/02-preprocessed/
!mv vocab.txt ../data/DCEP/02-preprocessed/
!mv build_vocab.py ../data/DCEP/02-preprocessed/

!mv L1_train_tok.bpe.en ../data/DCEP/02-preprocessed/L1_basic/
!mv L1_train_tok.bpe.ro ../data/DCEP/02-preprocessed/L1_basic/

!mv L3_train_tok.bpe.en ../data/DCEP/02-preprocessed/L3_intermediate/
!mv L3_train_tok.bpe.ro ../data/DCEP/02-preprocessed/L3_intermediate/

!mv L2_train_tok.bpe.en ../data/DCEP/02-preprocessed/L2_strong/
!mv L2_train_tok.bpe.ro ../data/DCEP/02-preprocessed/L2_strong/

!mv L2_dev_tok.bpe.en ../data/DCEP/02-preprocessed/L2_strong/
!mv L2_dev_tok.bpe.ro ../data/DCEP/02-preprocessed/L2_strong/

!mv L2_test_tok.bpe.en ../data/DCEP/02-preprocessed/L2_strong/
!mv L2_test_tok.bpe.ro ../data/DCEP/02-preprocessed/L2_strong/

mv: der Aufruf von stat für „bpe.codes“ ist nicht möglich: Datei oder Verzeichnis nicht gefunden
