# Code2AST dataset preparation

In [1]:
import sys
# Add /workspace (the directory the `src` package is run from in the cells below)
# to this kernel's module search path.
sys.path.append("/workspace")

In [2]:
import os

## Steps to prepare a parallel dataset for code2ast:

1. Compile a list of repositories to clone
2. Clone selected repositories
3. Parse every .py file (returning a pair of .src and .ast files) for every cloned repository
4. Merge parsed pairs into two large files (train.src, train.ast)
5. Train a BPE tokenizer model on both files (model_src, model_ast)
6. Tokenize all lines in both files and filter out those longer than the threshold (512 tokens).
(This produces two tokenized files in which no line is longer than the threshold.)
7. Detokenize files using trained BPE models and write results to updated files
8. Train new BPE tokenization models on updated files
9. Tokenize updated files using new BPE models
10. Split tokenized files into train/valid/test subsets
11. Preprocess prepared subsets using fairseq-preprocess utils

### TODO

Filter out duplicates in detokenized files

### Step 1: 
Compile a list of repositories to clone

In [3]:
# TODO: compile the list of repositories to clone (not implemented yet)

### Step 2: 
Clone selected repositories

In [4]:
# TODO: clone the selected repositories (not implemented yet)

### Step 3: 
Parse every .py file (returning a pair of .src and .ast files) for every cloned repository

In [5]:
# TODO: parse every .py file of every cloned repository into .src/.ast files (not implemented yet)

### Step 4: 
Merge parsed pairs into two large files (train.src, train.ast)

In [6]:
# TODO: merge the parsed pairs into train.src and train.ast (not implemented yet)

### Step 5: 
Train a BPE tokenizer model on both files (model_src, model_ast)

In [24]:
# File extensions of the parallel corpus: source code first, AST second.
extensions = (".src", ".ast")
merged_dataset_prefix = "/workspace/tmp/ast_test/code2ast_medium/train"

# Merged corpus files used as training input for the first pair of BPE models.
source_input_path = f"{merged_dataset_prefix}{extensions[0]}"
target_input_path = f"{merged_dataset_prefix}{extensions[1]}"

# Both sides get the same vocabulary budget.
source_vocab_size = target_vocab_size = 32_000

# Model names and the matching .model files under /workspace.
source_model_name, target_model_name = "src_model", "ast_model"
source_model_path = os.path.join("/workspace", f"{source_model_name}.model")
target_model_path = os.path.join("/workspace", f"{target_model_name}.model")

In [25]:
# Train the source and target BPE models with the project's `src.tokenize train`
# command, run from /workspace. The `$name` placeholders are expanded by IPython
# from the notebook variables before the shell sees the command.
!cd /workspace && python -m src.tokenize train \
        --source-input-path=$source_input_path \
        --source-model-name=$source_model_name \
        --source-vocab-size=$source_vocab_size \
        --target-input-path=$target_input_path \
        --target-model-name=$target_model_name \
        --target-vocab-size=$target_vocab_size

sentencepiece_trainer.cc(116) LOG(INFO) Running command: --input=/workspace/tmp/ast_test/code2ast_medium/train.src             --user_defined_symbols=<file>,<chunk>,<nl>,<add>,<del>,<url>,<num>,<ref>,<sha>             --model_prefix=src_model             --pad_id=3             --pad_piece=<pad>             --vocab_size=32000             --hard_vocab_limit=False             --input_sentence_size=1000             --model_type=bpe
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with : 
TrainerSpec {
  input: /workspace/tmp/ast_test/code2ast_medium/train.src
  input_format: 
  model_prefix: src_model
  model_type: BPE
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 1000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  t

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=9 size=1820 all=10312 active=1025 piece=tern
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=9 size=1840 all=10335 active=1048 piece=▁Make
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=9 size=1860 all=10339 active=1052 piece=RYPTOGR
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=9 size=1880 all=10336 active=1049 piece=▁initial
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=1900 all=10324 active=1037 piece=1:
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=8 min_freq=5
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=1920 all=10373 active=1045 piece=64-
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=1940 all=10408 active=1080 piece=tun
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=1960 all=10434 active=1106 piece=link
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=1980 all=10452 active=1124 piece=▁ter
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=8 size=2000 all=10464 active

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=4020 all=10976 active=984 piece=membership
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=4040 all=10960 active=968 piece=parametrize
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=4060 all=10940 active=948 piece=▁exponential
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4080 all=10926 active=933 piece=0:
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4100 all=10952 active=959 piece=DL
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=2 min_freq=2
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4120 all=10973 active=1021 piece=OW
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4140 all=10981 active=1029 piece=ci
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4160 all=10995 active=1043 piece=")}
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4180 all=10999 active=1047 piece=535
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=4200 all=10999 acti

bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=5920 all=10544 active=994 piece=uldn
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=5940 all=10538 active=988 piece=ittle
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=5960 all=10536 active=986 piece=publi
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=5980 all=10526 active=976 piece=▁'50-
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=6000 all=10513 active=963 piece=▁Deal
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=6020 all=10506 active=994 piece=ICALI
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=6040 all=10501 active=989 piece=▁Matt
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=6060 all=10489 active=977 piece=▁cnos
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=6080 all=10471 active=959 piece=▁wav

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7860 all=9607 active=956 piece=▁stage
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7880 all=9598 active=947 piece=junction
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7900 all=9582 active=931 piece=returned
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7920 all=9565 active=984 piece=slic
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7940 all=9552 active=971 piece=▁mand
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7960 all=9538 active=957 piece=stopped
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=7980 all=9523 active=942 piece=starting
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=8000 all=9509 active=928 piece=▁1004]],
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=8020 all=9490 active=982 piec

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9780 all=7789 active=921 piece=accent
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9800 all=7769 active=901 piece=dedKey
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9820 all=7749 active=981 piece=subs
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9840 all=7729 active=961 piece=nsure
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9860 all=7709 active=941 piece=dsTemp
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9880 all=7689 active=921 piece=hether
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9900 all=7669 active=901 piece=mageId
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9920 all=7649 active=981 piece=aim
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=9940 all=7629 active=961 piece=pars
bpe_m

bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12820 all=4748 active=981 piece=dm
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12840 all=4728 active=961 piece=agw
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12860 all=4708 active=941 piece=WORK
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12880 all=4688 active=921 piece=portS
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12900 all=4668 active=901 piece=ription
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12920 all=4648 active=981 piece=dS
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12940 all=4627 active=960 piece=htl
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12960 all=4607 active=940 piece=▁[5
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=12980 all=4587 active=920 piece=▁SER

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14740 all=2825 active=961 piece=anis
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14760 all=2805 active=941 piece=unds
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14780 all=2785 active=921 piece=▁setU
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14800 all=2765 active=901 piece=▁proces
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14820 all=2745 active=981 piece=yte
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14840 all=2725 active=961 piece=rebo
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14860 all=2705 active=941 piece=lters
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14880 all=2685 active=921 piece=ission
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=14900 all=2665 active=901 piece=▁shuffl
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=

bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16320 all=1241 active=981 piece=TAD
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16340 all=1221 active=961 piece=▁sy
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16360 all=1201 active=941 piece=▁tex
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16380 all=1181 active=921 piece=▁nati
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16400 all=1161 active=901 piece=comparis
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16420 all=1141 active=981 piece=lor
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16440 all=1121 active=961 piece=bero
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16460 all=1101 active=941 piece=nstan
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=16480 all=1081 active=921 piece=ublicF
b

trainer_interface.cc(317) LOG(INFO) Sampled 1000 sentences from 308257 sentences.
trainer_interface.cc(321) LOG(INFO) Skipped 10769 too long sentences.
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <pad>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <file>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <chunk>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <nl>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <add>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <del>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <url>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <num>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <ref>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <sha>
trainer_interface.cc(335) LOG(INFO) Normalizing sentence

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1640 all=3914 active=1015 piece=boun
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1660 all=3910 active=1011 piece=intf
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1680 all=3913 active=1014 piece=visi
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1700 all=3923 active=1024 piece=bigiq
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=2 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1720 all=3910 active=988 piece=ority
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1740 all=3904 active=982 piece=Parser
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1760 all=3899 active=977 piece=former
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1780 all=3891 active=969 piece=search
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=2 size=1800 all=3879 active=957 piece=collect
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=2 min_freq=1


bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3740 all=2569 active=961 piece=anF
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3760 all=2549 active=941 piece=atri
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3780 all=2529 active=921 piece=comb
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3800 all=2509 active=901 piece=etBo
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3820 all=2489 active=981 piece=eD
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3840 all=2469 active=961 piece=ilu
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3860 all=2449 active=941 piece=cCli
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3880 all=2429 active=921 piece=grif
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=3900 all=2409 active=901 piece=inse
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.c

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5500 all=809 active=758 piece=cessfu
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5520 all=789 active=738 piece=datase
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5540 all=769 active=718 piece=exList
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5560 all=749 active=698 piece=iction
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5580 all=729 active=678 piece=ithout
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5600 all=709 active=658 piece=mobile
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5620 all=689 active=638 piece=onents
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5640 all=669 active=618 piece=patter
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=5660 all=649 active=598 piece=rError
bpe_mo

### Step 6: 
Tokenize all lines in both files and filter out those longer than the threshold (512 tokens)

In [26]:
# Output files of the first tokenization pass (source, target).
dest_source_path, dest_target_path = (
    "/workspace/tmp/ast_test/code2ast_tokenized/train.src",
    "/workspace/tmp/ast_test/code2ast_tokenized/train.ast",
)

In [27]:
# Encode the merged corpus files with the Step 5 models and write the results
# to dest_source_path / dest_target_path.
# NOTE(review): no length threshold is passed on the command line; presumably the
# 512-token filter is applied inside `src.tokenize tokenize-bpe` — confirm.
!cd /workspace && python -m src.tokenize tokenize-bpe \
        --task=code2ast \
        --source-model=$source_model_path \
        --source-path=$source_input_path \
        --target-model=$target_model_path \
        --target-path=$target_input_path \
        --dest-source-path=$dest_source_path \
        --dest-target-path=$dest_target_path

### Step 7: 
Detokenize files using trained BPE models and write results to updated files

In [28]:
# Output files of the detokenization pass (source, target).
detokenized_source_path, detokenized_target_path = (
    "/workspace/tmp/ast_test/code2ast_detokenized/train.src",
    "/workspace/tmp/ast_test/code2ast_detokenized/train.ast",
)

In [29]:
# Decode the tokenized files from Step 6 back to text with the same Step 5 models
# and write the results to detokenized_source_path / detokenized_target_path.
!cd /workspace && python -m src.tokenize detokenize-bpe \
        --source-model=$source_model_path \
        --source-path=$dest_source_path \
        --target-model=$target_model_path \
        --target-path=$dest_target_path \
        --dest-source-path=$detokenized_source_path \
        --dest-target-path=$detokenized_target_path

### Step 8: 
Train new BPE tokenization models on updated files

In [30]:
# Vocabulary budget for the models retrained on the detokenized files.
detokenized_source_vocab_size = detokenized_target_vocab_size = 32_000

# Names of the retrained models and the matching .model files under /workspace.
detokenized_source_model_name, detokenized_target_model_name = (
    "detokenized_src_model",
    "detokenized_ast_model",
)
detokenized_source_model_path = os.path.join(
    "/workspace", f"{detokenized_source_model_name}.model"
)
detokenized_target_model_path = os.path.join(
    "/workspace", f"{detokenized_target_model_name}.model"
)

In [31]:
# Train a second pair of BPE models, this time on the detokenized files
# produced in Step 7.
!cd /workspace && python -m src.tokenize train \
        --source-input-path=$detokenized_source_path \
        --source-model-name=$detokenized_source_model_name \
        --source-vocab-size=$detokenized_source_vocab_size \
        --target-input-path=$detokenized_target_path \
        --target-model-name=$detokenized_target_model_name \
        --target-vocab-size=$detokenized_target_vocab_size

sentencepiece_trainer.cc(116) LOG(INFO) Running command: --input=/workspace/tmp/ast_test/code2ast_detokenized/train.src             --user_defined_symbols=<file>,<chunk>,<nl>,<add>,<del>,<url>,<num>,<ref>,<sha>             --model_prefix=detokenized_src_model             --pad_id=3             --pad_piece=<pad>             --vocab_size=32000             --hard_vocab_limit=False             --input_sentence_size=1000             --model_type=bpe
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with : 
TrainerSpec {
  input: /workspace/tmp/ast_test/code2ast_detokenized/train.src
  input_format: 
  model_prefix: detokenized_src_model
  model_type: BPE
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 1000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_numb

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2180 all=6734 active=1016 piece=lator
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2200 all=6738 active=1020 piece=▁eval
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=3 min_freq=2
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2220 all=6730 active=992 piece=hidden
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2240 all=6725 active=987 piece=verify
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2260 all=6716 active=978 piece=▁{0}".
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2280 all=6714 active=976 piece=softmax
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2300 all=6707 active=969 piece=▁tenant
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=3 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2320 all=6710 active=1004 piece=▁dataset
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=3 size=2340 all=6698 active=992 piece

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4260 all=6135 active=967 piece=skipif
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4280 all=6127 active=959 piece=wanopt
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4300 all=6109 active=941 piece=▁after
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4320 all=6098 active=990 piece=then
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4340 all=6082 active=974 piece=='2.2.
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4360 all=6071 active=963 piece=▁blade
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4380 all=6053 active=945 piece=▁encap
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4400 all=6041 active=933 piece=▁munoz
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=1
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=4420 all=6028 active=988 piece=▁'-'
b

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6080 all=4504 active=921 piece=▁cust
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6100 all=4484 active=901 piece=▁lear
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6120 all=4464 active=981 piece=▁yi
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6140 all=4444 active=961 piece=pyth
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6160 all=4424 active=941 piece=irele
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6180 all=4404 active=921 piece=▁mari
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6200 all=4384 active=901 piece=▁tech
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6220 all=4364 active=981 piece=het
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=6240 all=4344 active=961 piece=vin
bpe_model_tra

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8720 all=1862 active=981 piece=did
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8740 all=1842 active=961 piece=▁ev
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8760 all=1822 active=941 piece=redi
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8780 all=1802 active=921 piece=ollin
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8800 all=1782 active=901 piece=asharra
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8820 all=1761 active=980 piece=unk
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8840 all=1741 active=960 piece=mera
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8860 all=1721 active=940 piece=vcen
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8880 all=1701 active=920 piece=dedata
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=8900 all=1681 active=900 piece=autopop
b

trainer_interface.cc(531) LOG(INFO) Saving vocabs: detokenized_src_model.vocab
sentencepiece_trainer.cc(116) LOG(INFO) Running command: --input=/workspace/tmp/ast_test/code2ast_detokenized/train.ast             --user_defined_symbols=<file>,<chunk>,<nl>,<add>,<del>,<url>,<num>,<ref>,<sha>             --model_prefix=detokenized_ast_model             --pad_id=3             --pad_piece=<pad>             --vocab_size=32000             --hard_vocab_limit=False             --input_sentence_size=1000             --model_type=bpe
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with : 
TrainerSpec {
  input: /workspace/tmp/ast_test/code2ast_detokenized/train.ast
  input_format: 
  model_prefix: detokenized_ast_model
  model_type: BPE
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 1000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1680 all=3234 active=1000 piece=ves
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1700 all=3228 active=994 piece=cach
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1720 all=3225 active=997 piece=gpus
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1740 all=3220 active=992 piece=modu
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1760 all=3215 active=987 piece=ping
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1780 all=3213 active=985 piece=soli
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1800 all=3208 active=980 piece=wire
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=1 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1820 all=3200 active=992 piece=clien
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=1 size=1840 all=3196 active=988 piece=freqs
bpe_model_tra

bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4060 all=1143 active=941 piece=pcon
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4080 all=1123 active=921 piece=pound
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4100 all=1103 active=901 piece=rited
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4120 all=1083 active=981 piece=pai
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4140 all=1063 active=961 piece=mbda
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4160 all=1043 active=941 piece=dwith
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4180 all=1023 active=921 piece=scali
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4200 all=1003 active=901 piece=splay
bpe_model_trainer.cc(166) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(257) LOG(INFO) Added: freq=0 size=4220 all=983 active=951 piece=maz
bpe_model_tra

### Step 9: 
Tokenize updated files using new BPE models

In [32]:
# Output files of the final tokenization pass (source, target).
prepared_source_path, prepared_target_path = (
    "/workspace/tmp/ast_test/code2ast_prepared/train.src",
    "/workspace/tmp/ast_test/code2ast_prepared/train.ast",
)

In [33]:
# Encode the detokenized files with the retrained models and write the results
# to prepared_source_path / prepared_target_path.
!cd /workspace && python -m src.tokenize tokenize-bpe \
        --task=code2ast \
        --source-model=$detokenized_source_model_path \
        --source-path=$detokenized_source_path \
        --target-model=$detokenized_target_model_path \
        --target-path=$detokenized_target_path \
        --dest-source-path=$prepared_source_path \
        --dest-target-path=$prepared_target_path

### Step 10: 
Split tokenized files into train/valid/test subsets

In [34]:
# Destination directory for the train/valid/test split.
splitted_dataset_path = "/workspace/tmp/ast_test/code2ast_medium_splitted"
# Prepared source path with its extension dropped.
prepared_dataset_prefix = os.path.splitext(prepared_source_path)[0]

In [35]:
# Split the prepared files into subsets with ratio 0.8 / 0.15 / 0.05
# (presumably train / valid / test in that order — confirm against src.split_dataset).
# IPython expands {extensions[0]} and {extensions[1]} before the shell runs, so
# --exts receives ".src, .ast".
# NOTE(review): --dataset_prefix is spelled with an underscore while the other flags
# use hyphens — confirm the CLI accepts both spellings.
!cd /workspace && python -m src.split_dataset split \
            --dataset_prefix=$prepared_dataset_prefix \
            --exts="{extensions[0]}, {extensions[1]}" \
            --split-ratio='0.8, 0.15, 0.05' \
            --dest-path=$splitted_dataset_path

### Step 11: 
Preprocess prepared subsets using fairseq-preprocess utils

In [36]:
# Install build-essential, then fairseq and sentencepiece, which provide the
# fairseq-preprocess command used below.
# NOTE(review): package versions are not pinned, so a re-run may install a different
# fairseq release than the one that produced the saved output — consider pinning.
!apt-get update \
    && apt-get -y install build-essential \
    && pip install fairseq sentencepiece

Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 http://security.debian.org/debian-security buster/updates InRelease
Hit:3 http://deb.debian.org/debian buster-updates InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.6).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.


In [37]:
# Per-subset file prefixes inside the split directory, in train/valid/test order.
train_pref, valid_pref, test_pref = (
    os.path.join(splitted_dataset_path, subset)
    for subset in ("train", "valid", "test")
)
# Directory that receives the fairseq-preprocess output.
preprocessed_path = "/workspace/tmp/ast_test/code2ast_medium_splitted.src-ast"

In [38]:
# Binarize the train/valid/test subsets with fairseq-preprocess into preprocessed_path.
# The previous output directory is deleted first so a re-run starts from a clean state.
# The dictionary size limits reuse the vocabulary sizes of the retrained BPE models
# (Step 8) instead of repeating the literal 32000, so the two cannot drift apart.
# NOTE(review): --workers 60 is machine-specific — adjust to the available cores.
!rm -rf $preprocessed_path && fairseq-preprocess \
    --source-lang src --target-lang ast \
    --trainpref $train_pref \
    --validpref $valid_pref \
    --testpref $test_pref \
    --destdir $preprocessed_path \
    --nwordssrc $detokenized_source_vocab_size --nwordstgt $detokenized_target_vocab_size \
    --bpe sentencepiece \
    --workers 60

Namespace(align_suffix=None, alignfile=None, bpe='sentencepiece', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/workspace/tmp/ast_test/code2ast_medium_splitted.src-ast', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=32000, nwordstgt=32000, only_source=False, optimizer='nag', padding_factor=8, seed=1, source_lang='src', srcdict=None, target_lang='ast', task='translation', tensorboard_logdir='', testpref='/workspace/tmp/ast_test/code2ast_medium_splitted/test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='/workspace/tmp/ast_test/code2ast_medium_splitted/train', user_dir=None, validpref='/workspace/tmp/ast_test/code2ast_medium_splitted/valid', workers=60)
| [src] Dictionary: 8287 types
| [src

## Clean up temporary paths and BPE models

In [22]:
# Delete the intermediate dataset directories: the parent directories of the
# tokenized, detokenized and prepared files, plus the split directory.
# preprocessed_path (the fairseq output) is not in the list, so it is kept.
!rm -rf {os.path.dirname(dest_source_path)} \
    {os.path.dirname(detokenized_source_path)} \
    {os.path.dirname(prepared_source_path)} \
    {splitted_dataset_path}

In [23]:
# Delete the files of all four BPE models. Each argument strips the ".model"
# extension and appends ".*", giving a shell glob such as /workspace/src_model.*
# that matches every file sharing the model's prefix.
!rm -rf {os.path.splitext(source_model_path)[0] + ".*"} \
    {os.path.splitext(target_model_path)[0] + ".*"} \
    {os.path.splitext(detokenized_source_model_path)[0] + ".*"} \
    {os.path.splitext(detokenized_target_model_path)[0] + ".*"}