In [6]:
# Put /workspace on the module search path so that the `src` package used by
# later cells (`from src.utils import ...`) can be imported.
# NOTE(review): presumably /workspace is the repository root -- confirm.
import sys
sys.path.append("/workspace")

In [66]:
import os
import re
import typing
from src.utils import iterate_lines

In [99]:
def is_python_diff(line: str) -> bool:
    """Tell whether ``line`` begins with a ``<file> NAME.py `` header.

    Args:
        line: One line of the diff file.

    Returns:
        True when the line starts with the literal ``<file> ``, followed by a
        name made of word characters, a literal ``.py`` and a space.
    """
    # The dot is escaped on purpose: an unescaped "." matches any character,
    # which would accept headers such as "<file> happy ...".
    # NOTE(review): `\w*` does not match "/", "-" or ".", so only bare file
    # names (no directories) are accepted -- confirm this is intended.
    pattern = r"^<file> \w*\.py .*$"
    return re.match(pattern, line) is not None

def is_english(line: str) -> bool:
    """Tell whether ``line`` consists solely of ASCII characters.

    Despite the name, no language detection happens: the check is purely
    "can this string be encoded as ASCII".

    Args:
        line: Text to check.

    Returns:
        True if every character is ASCII (an empty string counts as ASCII),
        False otherwise.
    """
    try:
        # Encoding straight to ASCII rejects every non-ASCII code point,
        # including lone surrogates, which a UTF-8 round trip would turn into
        # an uncaught UnicodeEncodeError instead of a False result.
        line.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True

In [23]:
def iterate_line_pairs(
    prefix_path: str,
    exts: typing.Tuple[str, str]
) -> typing.Generator[typing.Tuple[str, str], None, None]:
    """Yield aligned items from two files that share a path prefix.

    Args:
        prefix_path: Common path prefix of both files, without extension.
        exts: ``(source_extension, target_extension)``; each one is appended
            to ``prefix_path`` to build the file path.

    Yields:
        ``(source_line, target_line)`` tuples, pairing the i-th item produced
        by ``iterate_lines`` for the source file with the i-th item for the
        target file.
    """
    source_file = prefix_path + exts[0]
    target_file = prefix_path + exts[1]

    # zip() stops at the shorter input, so trailing items of the longer file
    # are dropped silently.
    # NOTE(review): presumably both files have the same number of lines --
    # confirm, since a mismatch would go unnoticed here.
    yield from zip(
        iterate_lines(file_path=source_file),
        iterate_lines(file_path=target_file),
    )

In [44]:
# Path prefixes (without extension) of the unfiltered input pair and of the
# filtered output pair; each extension in EXTS is appended to build a path.
INPUT_FILE_PREF = "/workspace/tmp/train"
OUTPUT_FILE_PREF = "/workspace/tmp/1k_py/train"
# (source extension, target extension)
EXTS = (".diff", ".msg")

In [102]:
# Copy up to MAX_SIZE (diff, message) pairs from the input files to the output
# files, keeping only pairs whose diff line is a Python file header and whose
# two lines are both pure ASCII.
dirname = os.path.dirname(OUTPUT_FILE_PREF)
# makedirs(exist_ok=True) is already a no-op for an existing directory, so no
# separate existence check is needed. An empty dirname (prefix without a
# directory part) is skipped because os.makedirs("") raises.
if dirname:
    os.makedirs(dirname, exist_ok=True)

# NOTE(review): the cap is 100000 although the output directory is named
# "1k_py" and the tokenization cell passes --max-size=10000 -- confirm which
# size is intended.
MAX_SIZE = 100000
COUNT = 0

# Both files are opened inside the `with` statement so that the first handle
# is closed even if opening the second one fails.
with open(OUTPUT_FILE_PREF + EXTS[0], mode="w") as src_file, \
        open(OUTPUT_FILE_PREF + EXTS[1], mode="w") as trg_file:
    for src, trg in iterate_line_pairs(
        prefix_path=INPUT_FILE_PREF,
        exts=EXTS
    ):
        if COUNT >= MAX_SIZE:
            break
        # Cheap substring test first; the regex only runs on candidates.
        if ".py " not in src:
            continue
        if not is_python_diff(src):
            continue
        if not (is_english(src) and is_english(trg)):
            continue
        COUNT += 1
        src_file.write(src)
        trg_file.write(trg)

In [74]:
# Run the project's `src.tokenize` module with the `train-shared` sub-command
# on the filtered diff/message files.
# NOTE(review): presumably this trains one SentencePiece model shared by both
# sides and writes /workspace/tmp/1k_py/sentencepiece.model, which the next
# cell reads -- confirm against src/tokenize.py.
# NOTE(review): the output recorded below is a ModuleNotFoundError naming
# '/workspace/src/tokenize.py', which does not match this command; it looks
# like stale output from an earlier invocation -- re-run and re-check.
!python -m src.tokenize train-shared \
        --source-input-path=/workspace/tmp/1k_py/train.diff \
        --target-input-path=/workspace/tmp/1k_py/train.msg \
        --model-name=sentencepiece \
        --vocab-size=32000

/opt/conda/bin/python: Error while finding module specification for '/workspace/src/tokenize.py' (ModuleNotFoundError: No module named '/workspace/src/tokenize')


In [None]:
# Run `src.tokenize tokenize-shared-bpe` with the shared model on the filtered
# diff/message files, writing the results under /workspace/tmp/1k_py/bpe/.
# NOTE(review): presumably --max-size=10000 caps the number of pairs written
# (the fairseq log further down reports 8000 training sentences, i.e. 80% of
# 10000) -- confirm against src/tokenize.py.
!python -m src.tokenize tokenize-shared-bpe \
        --shared-model=/workspace/tmp/1k_py/sentencepiece.model \
        --source-path=/workspace/tmp/1k_py/train.diff \
        --target-path=/workspace/tmp/1k_py/train.msg \
        --dest-source-path=/workspace/tmp/1k_py/bpe/train.diff \
        --dest-target-path=/workspace/tmp/1k_py/bpe/train.msg \
        --max-size=10000

In [None]:
# Run `src.split_dataset split` on the tokenized pair with ratios
# 0.8 / 0.15 / 0.05, writing to /workspace/tmp/1k_py/bpe_splitted.
# NOTE(review): presumably the three ratios map to train / valid / test, the
# prefixes used by the following cells -- confirm against src/split_dataset.py.
# NOTE(review): `--dataset_prefix` uses an underscore while the other flags
# use hyphens -- verify the spelling against the CLI definition.
!python -m src.split_dataset split \
            --dataset_prefix=/workspace/tmp/1k_py/bpe/train \
            --exts='.diff, .msg' \
            --split-ratio='0.8, 0.15, 0.05' \
            --dest-path=/workspace/tmp/1k_py/bpe_splitted

In [103]:
# Path prefixes (without extension) under the split output directory; they are
# passed to fairseq-preprocess as --trainpref / --validpref / --testpref.
train_pref = "/workspace/tmp/1k_py/bpe_splitted/train"
valid_pref = "/workspace/tmp/1k_py/bpe_splitted/valid"
test_pref = "/workspace/tmp/1k_py/bpe_splitted/test"
# Passed to fairseq-preprocess as --destdir.
dest_path = "/workspace/tmp/1k_py/10k.bpe.diff-msg"

In [79]:
# Install the build toolchain and then the fairseq and sentencepiece packages.
# NOTE(review): the packages are unpinned, and this cell sits after cells that
# mention sentencepiece; consider moving it to the top of the notebook, pinning
# versions, and using `%pip` so the install targets the kernel's environment.
!apt-get update \
    && apt-get -y install build-essential \
    && pip install fairseq sentencepiece

Hit:1 http://deb.debian.org/debian buster InRelease
Get:2 http://security.debian.org/debian-security buster/updates InRelease [65.4 kB]
Get:3 http://deb.debian.org/debian buster-updates InRelease [49.3 kB]
Get:4 http://security.debian.org/debian-security buster/updates/main amd64 Packages [167 kB]
Fetched 281 kB in 0s (606 kB/s)   
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  dirmngr dpkg-dev fakeroot g++ g++-8 gnupg gnupg-l10n gnupg-utils gpg
  gpg-agent gpg-wks-client gpg-wks-server gpgconf gpgsm libalgorithm-diff-perl
  libalgorithm-diff-xs-perl libalgorithm-merge-perl libassuan0 libdpkg-perl
  libfakeroot libfile-fcntllock-perl libksba8 liblocale-gettext-perl libnpth0
  libstdc++-8-dev make pinentry-curses
Suggested packages:
  dbus-user-session libpam-systemd pinentry-gnome3 tor debian-keyring
  g++-multilib g++-8-multilib gcc-8-doc libstdc++6-8-d

Selecting previously unselected package libalgorithm-diff-xs-perl.
Preparing to unpack .../25-libalgorithm-diff-xs-perl_0.04-5+b1_amd64.deb ...
Unpacking libalgorithm-diff-xs-perl (0.04-5+b1) ...
Selecting previously unselected package libalgorithm-merge-perl.
Preparing to unpack .../26-libalgorithm-merge-perl_0.08-3_all.deb ...
Unpacking libalgorithm-merge-perl (0.08-3) ...
Selecting previously unselected package libfile-fcntllock-perl.
Preparing to unpack .../27-libfile-fcntllock-perl_0.22-3+b5_amd64.deb ...
Unpacking libfile-fcntllock-perl (0.22-3+b5) ...
Setting up libksba8:amd64 (1.3.5-2) ...
Setting up libfile-fcntllock-perl (0.22-3+b5) ...
Setting up libalgorithm-diff-perl (1.19.03-2) ...
Setting up libnpth0:amd64 (1.6-1) ...
Setting up libassuan0:amd64 (2.5.2-1) ...
Setting up libfakeroot:amd64 (1.23-1) ...
Setting up fakeroot (1.23-1) ...
update-alternatives: using /usr/bin/fakeroot-sysv to provide /usr/bin/fakeroot (fakeroot) in auto mode
Setting up make (4.2.1-1.2) ...
Setti

In [104]:
# Run fairseq-preprocess on the train / valid / test splits for the
# diff -> msg language pair; `$name` is expanded by IPython from the Python
# variables defined in the earlier path cell.
# --nwordssrc / --nwordstgt use the same 32000 as the tokenizer's --vocab-size.
# NOTE(review): --workers 60 is hard-coded -- confirm it suits the machine
# this notebook runs on.
!fairseq-preprocess \
    --source-lang diff --target-lang msg \
    --trainpref $train_pref \
    --validpref $valid_pref \
    --testpref $test_pref \
    --destdir $dest_path \
    --nwordssrc 32000 --nwordstgt 32000 \
    --joined-dictionary \
    --bpe sentencepiece \
    --workers 60

Namespace(align_suffix=None, alignfile=None, bpe='sentencepiece', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/workspace/tmp/1k_py/10k.bpe.diff-msg', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=True, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=32000, nwordstgt=32000, only_source=False, optimizer='nag', padding_factor=8, seed=1, source_lang='diff', srcdict=None, target_lang='msg', task='translation', tensorboard_logdir='', testpref='/workspace/tmp/1k_py/bpe_splitted/test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='/workspace/tmp/1k_py/bpe_splitted/train', user_dir=None, validpref='/workspace/tmp/1k_py/bpe_splitted/valid', workers=60)
| [diff] Dictionary: 12895 types
| [diff] /workspace/tmp/1k_py/bpe_splitted/train.diff: 8000 sents, 1