# Code2AST dataset preparation

In [None]:
import sys
# Add /workspace to the kernel's module search path.
sys.path.append("/workspace")

In [None]:
import os
import pandas as pd

In [None]:
def name_to_url(name: str) -> str:
    """Build the HTTPS clone URL of a GitHub repository.

    Args:
        name: Repository identifier placed into the URL path
            (the notebook passes the ``full_name`` column values).

    Returns:
        The string ``https://github.com/<name>.git``.
    """
    clone_url = f"https://github.com/{name}.git"
    return clone_url

## Steps to prepare a parallel dataset for code2ast:

1. Compile a list of repositories to clone
2. Clone selected repositories
3. Parse every .py file (returning a pair of .src and .ast files) for every cloned repository
4. Merge parsed pairs into two large files (all.src, all.ast)
5. Remove duplicate lines in .src file along with aligned lines in .ast file
6. Train a BPE tokenizer model on each of the two files (src_model, ast_model)
7. Tokenize every line of both files and filter out the lines that are longer than the threshold value (512 tokens).
(This produces two tokenized files in which no line is longer than the threshold value.)
8. Detokenize files using trained BPE models and write results to updated files
9. Train new BPE tokenization models on updated files
10. Tokenize updated files using new BPE models
11. Split tokenized files into train/valid/test subsets
12. Preprocess prepared subsets using fairseq-preprocess utils

In [None]:
# Experiment name; it is interpolated into the /workspace/tmp/... paths of the pipeline.
exp_name = "/code2ast/code2ast_pretraining_mlm"

# Values forwarded to the parsing step (--language, --language-ext, --library-path).
language, language_ext = "python", "py"
library_path = "/workspace/data/langs.so"

# File suffixes of the parallel corpus: source side first, target side second.
extensions = ("src", "ast")

### Step 1: 
Compile a list of repositories to clone

In [None]:
# Where the URL list is written and where the repositories will be cloned.
repo_filepath = "/workspace/data/repo_list.txt"
repo_output = "/workspace/tmp/repositories"

# Load the repository table and keep a single row per "full_name".
repositories = pd.read_json(
    "/workspace/data/repositories/top_18k.jsonl",
    lines=True,
)
unique_repositories = repositories.drop_duplicates("full_name")

# One clone URL per line, without a trailing newline.
urls = "\n".join(map(name_to_url, unique_repositories["full_name"]))

with open(repo_filepath, mode="w") as file:
    file.write(urls)

### Step 2: 
Clone selected repositories

In [None]:
# Run the project's src.clone_repository module from /workspace, passing the URL list
# (repo_filepath) and the clone destination (repo_output).
# NOTE(review): "--clear_before 1" presumably empties the destination first — confirm
# against the module before running on a directory you want to keep.
!cd /workspace && python -m src.clone_repository \
    --repo_file $repo_filepath \
    --output $repo_output \
    --clear_before 1

### Step 3: 
Parse every .py file (returning a pair of .src and .ast files) for every cloned repository

In [None]:
# Output directory passed to the parsing command as --output-path.
# NOTE(review): the suffix is spelled "datset" here; later steps use "dataset".
parsed_dataset_path = f"/workspace/tmp/{exp_name}_datset_parsed"

In [None]:
!cd /workspace && python -m src.ast_dataset_prepare parse-nodes --rule-all \
    --library-path=$library_path \
    --language=$language \
    --language-ext=$language_ext \
    --root-input-path=$repo_output \
    --output-path=$parsed_dataset_path\
    --extensions="{extensions[0]}, {extensions[1]}"

### Step 4: 
Merge parsed pairs into two large files (all.src, all.ast)

In [None]:
# Path prefix of the merged corpus; later cells address the merged files as
# <prefix>.src and <prefix>.ast.
# NOTE(review): the directory suffix is spelled "datset" here; later steps use "dataset".
merged_dataset_path_prefix = f"/workspace/tmp/{exp_name}_datset_merged/all"

In [None]:
# Run src.merge_files (merge-pairs) from /workspace: input is parsed_dataset_path,
# output is written under merged_dataset_path_prefix; --extensions expands to "src, ast".
# NOTE(review): "--remove-files" presumably deletes the per-file inputs once merged — confirm.
!cd /workspace && python -m src.merge_files merge-pairs \
    --input-path=$parsed_dataset_path \
    --output-prefix=$merged_dataset_path_prefix \
    --extensions="{extensions[0]}, {extensions[1]}" \
    --remove-files

### Step 5:
Remove duplicate lines in .src file along with aligned lines in .ast file

In [None]:
# Destination directory passed to the deduplication command as --destination-path.
deduplicated_dataset_path =  f"/workspace/tmp/{exp_name}_dataset_dedup"

In [None]:
# Run src.remove_duplicates from /workspace with the merged .src file as the reference
# and the merged .ast file as the aligned file; results go to deduplicated_dataset_path.
!cd /workspace && python -m src.remove_duplicates \
    --reference-filepath={merged_dataset_path_prefix + ".src"} \
    --aligned-filepath={merged_dataset_path_prefix + ".ast"} \
    --destination-path=$deduplicated_dataset_path

Clean up temporary directories and files

In [None]:
!rm -rf {os.path.dirname(merged_dataset_path_prefix)} \
    {parsed_dataset_path} \

### Step 6: 
Train a BPE tokenizer model on each of the two files (src_model, ast_model)

In [None]:
# Locate the deduplicated corpus: same base name as the merged files, inside the dedup
# directory (assumes src.remove_duplicates keeps the input base name — TODO confirm).
merged_prefix = os.path.join(
    deduplicated_dataset_path,
    os.path.basename(merged_dataset_path_prefix),
)
source_input_path = ".".join((merged_prefix, extensions[0]))
target_input_path = ".".join((merged_prefix, extensions[1]))

# First-pass BPE settings for the source side.
source_model_name = "src_model"
source_vocab_size = 32_000
source_model_path = os.path.join("/workspace", source_model_name + ".model")

# First-pass BPE settings for the target side.
target_model_name = "ast_model"
target_vocab_size = 32_000
target_model_path = os.path.join("/workspace", target_model_name + ".model")

In [None]:
# Run src.tokenize (train) from /workspace with the deduplicated source/target files,
# the first-pass model names and the first-pass vocabulary sizes.
# NOTE(review): later cells expect the models at /workspace/<model name>.model — confirm
# that this is where the command writes them.
!cd /workspace && python -m src.tokenize train \
        --source-input-path=$source_input_path \
        --source-model-name=$source_model_name \
        --source-vocab-size=$source_vocab_size \
        --target-input-path=$target_input_path \
        --target-model-name=$target_model_name \
        --target-vocab-size=$target_vocab_size

### Step 7: 
Apply tokenization for all lines in the files and filter out ones which are longer than the threshold value (512 tokens)

In [None]:
# Output files of the first tokenization pass (source side, target side).
dest_source_path, dest_target_path = (
    f"/workspace/tmp/{exp_name}_dataset_tokenized/all.src",
    f"/workspace/tmp/{exp_name}_dataset_tokenized/all.ast",
)

In [None]:
# Run src.tokenize (tokenize-bpe, task code2ast) from /workspace with the first-pass
# models over the deduplicated files; results go to dest_source_path / dest_target_path.
# NOTE(review): no length threshold is passed here, so the 512-token filter described
# for this step presumably comes from a default inside src.tokenize — confirm.
!cd /workspace && python -m src.tokenize tokenize-bpe \
        --task=code2ast \
        --source-model=$source_model_path \
        --source-path=$source_input_path \
        --target-model=$target_model_path \
        --target-path=$target_input_path \
        --dest-source-path=$dest_source_path \
        --dest-target-path=$dest_target_path

### Step 8: 
Detokenize files using trained BPE models and write results to updated files

In [None]:
# Output files of the detokenization step (source side, target side).
detokenized_source_path, detokenized_target_path = (
    f"/workspace/tmp/{exp_name}_dataset_detokenized/all.src",
    f"/workspace/tmp/{exp_name}_dataset_detokenized/all.ast",
)

In [None]:
# Run src.tokenize (detokenize-bpe) from /workspace: the first-pass models are applied to
# the tokenized files and the results are written to the detokenized_* paths.
!cd /workspace && python -m src.tokenize detokenize-bpe \
        --source-model=$source_model_path \
        --source-path=$dest_source_path \
        --target-model=$target_model_path \
        --target-path=$dest_target_path \
        --dest-source-path=$detokenized_source_path \
        --dest-target-path=$detokenized_target_path

### Step 9: 
Train new BPE tokenization models on updated files

In [None]:
# Second-pass BPE settings for the source side (trained on the detokenized files).
detokenized_source_model_name = "detokenized_src_model"
detokenized_source_vocab_size = 32_000
detokenized_source_model_path = os.path.join(
    "/workspace",
    detokenized_source_model_name + ".model",
)

# Second-pass BPE settings for the target side.
detokenized_target_model_name = "detokenized_ast_model"
detokenized_target_vocab_size = 32_000
detokenized_target_model_path = os.path.join(
    "/workspace",
    detokenized_target_model_name + ".model",
)

In [None]:
# Run src.tokenize (train) from /workspace again, this time on the detokenized files and
# with the second-pass model names and vocabulary sizes.
!cd /workspace && python -m src.tokenize train \
        --source-input-path=$detokenized_source_path \
        --source-model-name=$detokenized_source_model_name \
        --source-vocab-size=$detokenized_source_vocab_size \
        --target-input-path=$detokenized_target_path \
        --target-model-name=$detokenized_target_model_name \
        --target-vocab-size=$detokenized_target_vocab_size

### Step 10: 
Tokenize updated files using new BPE models

In [None]:
# Output files of the second tokenization pass (source side, target side).
prepared_source_path, prepared_target_path = (
    f"/workspace/tmp/{exp_name}_dataset_prepared/all.src",
    f"/workspace/tmp/{exp_name}_dataset_prepared/all.ast",
)

In [None]:
# Run src.tokenize (tokenize-bpe, task code2ast) from /workspace with the second-pass
# models over the detokenized files; results go to the prepared_* paths.
!cd /workspace && python -m src.tokenize tokenize-bpe \
        --task=code2ast \
        --source-model=$detokenized_source_model_path \
        --source-path=$detokenized_source_path \
        --target-model=$detokenized_target_model_path \
        --target-path=$detokenized_target_path \
        --dest-source-path=$prepared_source_path \
        --dest-target-path=$prepared_target_path

### Step 11: 
Split tokenized files into train/valid/test subsets

In [None]:
# Directory that receives the train/valid/test subsets.
splitted_dataset_path = f"/workspace/tmp/{exp_name}_dataset_splitted"

# Prepared corpus path with its extension stripped, shared by the .src and .ast files.
prepared_dataset_prefix = os.path.splitext(prepared_source_path)[0]

In [None]:
# Run src.split_dataset (split) from /workspace on the prepared corpus prefix.
# --exts expands to ".src, .ast"; the split ratio passed is 0.8 / 0.15 / 0.05
# (presumably train / valid / test, in that order — confirm against the module).
!cd /workspace && python -m src.split_dataset split \
            --dataset_prefix=$prepared_dataset_prefix \
            --exts=".{extensions[0]}, .{extensions[1]}" \
            --split-ratio='0.8, 0.15, 0.05' \
            --dest-path=$splitted_dataset_path

### Step 12: 
Preprocess the prepared subsets using the fairseq-preprocess utility

In [None]:
# Install build-essential via apt-get, then fairseq and sentencepiece via pip.
# Each step runs only if the previous one succeeded (&& chaining).
!apt-get update \
    && apt-get -y install build-essential \
    && pip install fairseq sentencepiece

In [None]:
# Destination directory of the fairseq-preprocess output.
preprocessed_path = f"/workspace/tmp/{exp_name}_dataset_splitted.src-ast"

# File prefixes of the three subsets inside the split directory.
train_pref, valid_pref, test_pref = (
    os.path.join(splitted_dataset_path, subset)
    for subset in ("train", "valid", "test")
)

In [None]:
# Remove any previous output directory, then run fairseq-preprocess on the train/valid/test
# prefixes with "src" as source language and "ast" as target language.
# The command passes 32000 for --nwordssrc/--nwordstgt and 60 for --workers; these are
# literals here, not the *_vocab_size variables defined earlier.
!rm -rf $preprocessed_path && fairseq-preprocess \
    --source-lang src --target-lang ast \
    --trainpref $train_pref \
    --validpref $valid_pref \
    --testpref $test_pref \
    --destdir $preprocessed_path \
    --nwordssrc 32000 --nwordstgt 32000 \
    --bpe sentencepiece \
    --workers 60

## Clear temp paths and BPE models

In [None]:
# Delete the intermediate directories of steps 5-11: tokenized, detokenized, prepared,
# split and deduplicated data. preprocessed_path is not in the list, so the
# fairseq-preprocess output is kept.
!rm -rf {os.path.dirname(dest_source_path)} \
    {os.path.dirname(detokenized_source_path)} \
    {os.path.dirname(prepared_source_path)} \
    {splitted_dataset_path} \
    {deduplicated_dataset_path}

In [None]:
# Delete every file matching /workspace/<model name>.* for the four BPE models.
# Each argument is the model path with ".model" replaced by ".*"; the shell expands the glob.
!rm -rf {os.path.splitext(source_model_path)[0] + ".*"} \
    {os.path.splitext(target_model_path)[0] + ".*"} \
    {os.path.splitext(detokenized_source_model_path)[0] + ".*"} \
    {os.path.splitext(detokenized_target_model_path)[0] + ".*"}