In [20]:
import sys
sys.path.append("/workspace")

# Test AST parser utils

In [21]:
from src.ast_parser import ASTParser, LanguageRepr

In [22]:
# Describe the grammar to use: a shared library at a fixed workspace path
# and the language name "python".
lang_repr = LanguageRepr(
    library_path="/workspace/tmp/ast_test/my-languages.so", lang="python"
)

# Build the parser under test from that language description.
parser = ASTParser(lang_repr)

In [27]:
# Sample source used to exercise the parser: a decorated class whose only
# member is a decorated method. The backslashes right after the opening quotes
# and after `pass` keep the literal from starting or ending with a newline;
# every line of the sample carries four extra leading spaces.
program = """\
    @test(test)
    class A(abc):
        @diffable
        def __init__():
            pass\
"""

In [28]:
# Display what the parser extracts from the sample; the output recorded below
# is a list of (source text, s-expression) pairs with newlines shown as "<nl>".
parser.parse_root_children(program)

[('@test(test)<nl>    class A(abc):<nl>        @diffable<nl>        def __init__():<nl>            pass',
  '(decorated_definition (decorator (dotted_name (test)) arguments: (argument_list (test))) definition: (class_definition name: (A) superclasses: (argument_list (abc)) body: (block (decorated_definition (decorator (dotted_name (diffable))) definition: (function_definition name: (__init__) parameters: (parameters) body: (block (pass_statement)))))))'),
 ('@test(test)',
  '(decorator (dotted_name (test)) arguments: (argument_list (test)))'),
 ('class A(abc):',
  '(class_definition name: (A) superclasses: (argument_list (abc)))'),
 ('class A(abc):<nl>        @diffable<nl>        def __init__():<nl>            pass',
  '(class_definition name: (A) superclasses: (argument_list (abc)) body: (block (decorated_definition (decorator (dotted_name (diffable))) definition: (function_definition name: (__init__) parameters: (parameters) body: (block (pass_statement))))))'),
 ('@diffable<nl>     

In [None]:
# Parse the sample with the wrapped low-level parser, passing the source as
# UTF-8 bytes, and keep the root node of the resulting tree for inspection.
tree = parser.parser.parse(bytes(program, "utf8"))
root_node = tree.root_node

In [None]:
# Direct children of the root node of the parsed sample.
root_node.children

In [None]:
# Children of the first top-level node.
root_node.children[0].children

In [None]:
# Result of `sexp()` for the third child of the first top-level node
# (presumably its s-expression string, as in the parser output above — confirm).
root_node.children[0].children[2].sexp()

# Utils to train a BPE tokenization model

In [None]:
import sentencepiece as spm
import os

In [1]:
def train_bpe_model(
    input_path: str,
    model_name: str,
    vocab_size: int,
    input_sentence_size: int = 1000
):
    """Train a SentencePiece BPE model through the trainer's flag-string API.

    Args:
        input_path: Value passed as ``--input``.
        model_name: Value passed as ``--model_prefix``.
        vocab_size: Value passed as ``--vocab_size``; ``--hard_vocab_limit``
            is always passed as ``False``.
        input_sentence_size: Value passed as ``--input_sentence_size``.

    The symbols ``<nl> " ( ) . { }`` are passed as user-defined symbols, and
    the pad piece ``<pad>`` is given id 3.
    """
    # Comma-separated list handed to --user_defined_symbols.
    reserved_symbols = ",".join(["<nl>", '"', "(", ")", ".", "{", "}"])

    flags = [
        f"--input={input_path}",
        f"--user_defined_symbols={reserved_symbols}",
        f"--model_prefix={model_name}",
        "--pad_id=3",
        "--pad_piece=<pad>",
        f"--vocab_size={vocab_size}",
        "--hard_vocab_limit=False",
        f"--input_sentence_size={input_sentence_size}",
        "--model_type=bpe",
    ]

    # Consecutive flags are separated by a run of thirteen spaces.
    flag_separator = " " * 13
    spm.SentencePieceTrainer.Train(flag_separator.join(flags))

In [None]:
# Train the "ast_model" BPE model on the temporary training split with
# vocab_size=16_000 (input_sentence_size keeps its default).
train_bpe_model(
    input_path="/workspace/tmp/data_test/dataset_tmp.train", 
    model_name="ast_model", 
    vocab_size=16_000
)

# Utils to decode word pieces into words

In [14]:
def decode_pieces(model_path, input_path, output_path):
    """Decode a file of space-separated word pieces back into text.

    Each line of ``input_path`` is split on single spaces into pieces, decoded
    with the SentencePiece model loaded from ``model_path``, and written as one
    line of ``output_path``.

    Args:
        model_path: Path of the SentencePiece model file to load.
        input_path: UTF-8 text file holding one piece sequence per line.
        output_path: Destination file, overwritten if present. Missing parent
            directories are created.

    Returns:
        None. If the model fails to load, nothing is created or written.
    """
    proc = spm.SentencePieceProcessor()
    if not proc.load(model_path):
        return

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the output path actually has a directory component.
    dirname = os.path.dirname(output_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    # Both files are opened by one `with`, so the input handle is released even
    # if opening the output fails. The encoding is explicit because pieces
    # contain non-ASCII characters (the U+2581 word-boundary marker).
    with open(input_path, mode="r", encoding="utf-8") as input_file, \
            open(output_path, mode="w", encoding="utf-8") as output_file:
        # Stream line by line rather than loading the whole file.
        for line in input_file:
            # Drop the line terminator before splitting so it is not glued onto
            # the last piece, and skip empty pieces left by repeated spaces.
            pieces = [piece for piece in line.rstrip("\n").split(" ") if piece]
            output_file.write(proc.decode_pieces(pieces) + "\n")

In [19]:
# Decode the tokenized source-side file with the source model and write the
# result under code2ast_detokenized/.
decode_pieces(
    "/workspace/src_model.model", 
    "/workspace/tmp/ast_test/code2ast_tokenized/train2.src", 
    "/workspace/tmp/ast_test/code2ast_detokenized/train.src"
)

In [13]:
# Decode the BPE-encoded AST-side file with the AST model and write the result
# under code2ast_bpe_decoded/.
decode_pieces(
    "/workspace/ast_model.model", 
    "/workspace/tmp/ast_test/code2ast_bpe/train.ast", 
    "/workspace/tmp/ast_test/code2ast_bpe_decoded/train.ast"
)

# Build a list of repositories to clone

In [9]:
import pandas as pd

In [63]:
# Load the repository metadata table.
# NOTE(review): the file has a .jsonl extension but is read with lines=False,
# i.e. as one JSON document — confirm this matches the file's actual layout.
df = pd.read_json("/workspace/data/repositories/top1000_page1.jsonl", lines=False)

In [21]:
# Preview the 50 rows with the largest "size" value; `df` itself is not modified.
df.sort_values("size", ascending=False).head(50)

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
55,odoo/odoo,Python,131512,15608,15608,11183,2741739,False,False
705,googleapis/google-cloud-python,Python,8689,2874,2874,1209,1778249,False,False
887,RasaHQ/rasa_core,Python,4698,2291,2291,1048,1059972,True,False
335,frappe/erpnext,Python,28974,5306,5306,2322,804911,False,False
650,librosa/librosa,Python,2936,3106,3106,543,760281,False,False
359,ytisf/theZoo,Python,200,5066,5066,1474,688056,False,False
244,Yorko/mlcourse.ai,Python,4980,6348,6348,4164,644447,False,False
737,iGhibli/iOS-DeviceSupport,Python,66,2798,2798,413,554436,False,False
332,samuelclay/NewsBlur,Python,12738,5333,5333,918,525458,False,False
3,tensorflow/models,Python,3936,58983,58983,37248,523487,False,False


In [53]:
# Hand-picked row identifiers of the repositories to clone.
# NOTE(review): these look like index labels read off the sorted table shown
# above — confirm how they are meant to be looked up.
selected_list_idx = [705, 887, 244, 3, 214, 146, 218, 12, 21, 215, 30, 61, 366]

In [62]:
# Build one clone URL per selected repository and save them, one per line
# (no trailing newline), for a later cloning step.
# NOTE(review): `.iloc` is positional, while the selected ids look like index
# labels from the displayed table; the two agree only if the index is 0..n-1.
# Confirm, and switch to `.loc` if the ids are labels.
selected_urls = [
    f"https://github.com/{full_name}.git"
    for full_name in df["full_name"].iloc[selected_list_idx]
]
selected_urls_path = "/workspace/tmp/ast_test/repo_list.txt"
with open(selected_urls_path, mode="w") as file:
    # `write`, not `writelines`: the joined text is a single string, which
    # `writelines` would iterate over and emit one character at a time.
    file.write("\n".join(selected_urls))