In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import os
import pwd
# Look up the current user's entry in the password database and derive the
# home directory path under /home from the login name.
_pw_entry = pwd.getpwuid(os.getuid())
username = _pw_entry.pw_name
home_path = "/home/" + username

In [None]:
# Switch the working directory to the CodeXGLUE code folder under home_path;
# the relative paths used by the later cells are resolved from here.
code_dir = os.path.join(home_path, "CodeXGLUE/code")
os.chdir(code_dir)

In [None]:
# --- Model ---
model_type = "roberta"
pretrained_model = "microsoft/codebert-base"

# --- Data and output locations (relative to the working directory) ---
lang = "python"  # programming language
data_dir = "../dataset"
output_dir = "model/" + lang
train_file = "/".join([data_dir, lang, "train.jsonl"])
dev_file = "/".join([data_dir, lang, "valid.jsonl"])

# --- Maximum source / target lengths handed to the scripts ---
source_length, target_length = 256, 128

# --- Optimisation and decoding settings ---
lr = 5e-5
decay = 0.01
warmup = 500
epochs = 500
batch_size = 256  # increased batch size for better GPU utilization
beam_size = 10

In [None]:
# Run run_earlystopping.py in train + eval mode with the settings defined in
# the configuration cell; IPython replaces each $name with the value of the
# Python variable of that name before the shell sees the command.
!python run_earlystopping.py \
  --do_train \
  --do_eval \
  --model_type $model_type \
  --model_name_or_path $pretrained_model \
  --train_filename $train_file \
  --dev_filename $dev_file \
  --output_dir $output_dir \
  --max_source_length $source_length \
  --max_target_length $target_length \
  --beam_size $beam_size \
  --train_batch_size $batch_size \
  --eval_batch_size $batch_size \
  --learning_rate $lr \
  --weight_decay $decay \
  --warmup_steps $warmup \
  --num_train_epochs $epochs

05/02/2023 08:47:03 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='model/python', load_model_path=None, train_filename='../dataset/python/train.jsonl', dev_filename='../dataset/python/valid.jsonl', test_filename=None, config_name='', tokenizer_name='', max_source_length=256, max_target_length=128, do_train=True, do_eval=True, do_test=False, do_lower_case=False, no_cuda=False, train_batch_size=64, eval_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.01, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=500, local_rank=-1, seed=42)
Downloading (…)lve/main/config.json: 100% 498/498 [00:00<00:00, 3.09MB/s]
Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 6.20MB/s]
Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 10.2MB/s]
Downloading (…)cial_tokens_map.json: 100% 150/150 [00:00<0

In [None]:
# Checkpoint to evaluate and the test split it is evaluated on.
test_model = output_dir + "/checkpoint-best-bleu/pytorch_model.bin"  # checkpoint for test
test_file = "/".join((data_dir, lang, "test.jsonl"))

In [None]:
# Run run.py in test mode, loading the checkpoint at $test_model and passing
# both the dev and the test file; results are written under $output_dir.
!python run.py \
  --do_test \
  --model_type $model_type \
  --model_name_or_path $pretrained_model \
  --load_model_path $test_model \
  --dev_filename $dev_file \
  --test_filename $test_file \
  --output_dir $output_dir \
  --max_source_length $source_length \
  --max_target_length $target_length \
  --beam_size $beam_size \
  --eval_batch_size $batch_size

05/02/2023 13:49:00 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='model/python', load_model_path='model/python/checkpoint-best-bleu/pytorch_model.bin', train_filename=None, dev_filename='../dataset/python/valid.jsonl', test_filename='../dataset/python/test.jsonl', config_name='', tokenizer_name='', max_source_length=256, max_target_length=128, do_train=False, do_eval=False, do_test=True, do_lower_case=False, no_cuda=False, train_batch_size=8, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
05/02/2023 13:49:02 - INFO - __main__ -   reload model from model/python/checkpoint-best-bleu/pytorch_model.bin
05/02/2023 13:49:13 - INFO - __main__ -   Test file: ../dataset/python/valid.jsonl
100% 109/109 [32:31<00:00, 17.90s/it]
Total

In [None]:
!python ../evaluator/evaluator.py model/$lang/test_1.gold < model/$lang/test_1.output

Total: 14918
17.91415264096644


In [None]:
import torch

import torch.nn as nn

from transformers import AutoTokenizer
from model import Seq2Seq
from transformers import RobertaConfig, RobertaModel

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Rebuild the encoder/decoder pair and wrap it in Seq2Seq so that the saved
# state dict can be loaded into a module with matching parameters.
config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config=config)
decoder_layer = nn.TransformerDecoderLayer(
    d_model=config.hidden_size, nhead=config.num_attention_heads
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    config=config,
    beam_size=beam_size,
    max_length=target_length,
    sos_id=tokenizer.cls_token_id,
    eos_id=tokenizer.sep_token_id,
)

# map_location lets a checkpoint that was saved from GPU tensors be loaded on
# a machine without CUDA.
model.load_state_dict(torch.load(test_model, map_location=device))
model.to(device)

# This notebook only runs inference from here on: eval() switches off dropout
# (the freshly built nn.TransformerDecoder starts in training mode). eval()
# returns the module itself, so the cell still displays the model summary.
model.eval()

Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [None]:
import pandas as pd
import json

def jsonl_to_dataframe(jsonl_file):
    """Load a JSON Lines file into a DataFrame of code/docstring pairs.

    Each non-blank line must be a JSON object with a ``code_tokens`` list and
    a ``docstring_tokens`` list. The tokens of each list are joined with
    single spaces and all whitespace runs are collapsed to one space.

    Args:
        jsonl_file: Path of the ``.jsonl`` file to read (decoded as UTF-8).

    Returns:
        A ``pandas.DataFrame`` with the string columns ``code`` and
        ``docstring``, one row per non-blank input line. An input with no
        records yields an empty frame with those two columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``code_tokens`` or ``docstring_tokens``.
    """
    records = []

    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            json_data = json.loads(line)

            code = ' '.join(json_data['code_tokens']).replace('\n', ' ')
            code = ' '.join(code.strip().split())

            # Unlike the code above, newlines inside the docstring are deleted
            # rather than replaced by a space; this difference is intentional
            # to keep the existing output unchanged.
            docstring = ' '.join(json_data['docstring_tokens']).replace('\n', '')
            docstring = ' '.join(docstring.strip().split())

            records.append({'code': code, 'docstring': docstring})

    return pd.DataFrame(records, columns=['code', 'docstring'])

In [None]:
# Read the train, dev and test splits into DataFrames, in that order.
df_train, df_dev, df_test = (
    jsonl_to_dataframe(path) for path in (train_file, dev_file, test_file)
)

# Number of examples in each split: (train, dev, test).
tuple(len(split) for split in (df_train, df_dev, df_test))

(251820, 13914, 14918)

In [None]:
# Preview the first five rows of the test split.
df_test.head(n=5)

Unnamed: 0,code,docstring
0,def sina_xml_to_url_list ( xml_data ) : rawurl...,str - > list Convert XML to URL List . From Bi...
1,"def dailymotion_download ( url , output_dir = ...",Downloads Dailymotion videos by URL .
2,"def sina_download ( url , output_dir = '.' , m...",Downloads Sina videos by URL .
3,"def sprint ( text , * colors ) : return ""\33[{...",Format text with color or other effects into A...
4,"def print_log ( text , * colors ) : sys . stde...",Print a log message to standard error .


In [None]:
from run import convert_examples_to_features, Example
from tqdm.auto import tqdm

class Args:
    """Small settings holder passed to convert_examples_to_features.

    Carries the two length limits from the configuration cell as class
    attributes.
    """

    max_source_length, max_target_length = source_length, target_length


args = Args()

def get_preds(df: pd.DataFrame):
    """Generate docstring predictions for every row of ``df``.

    Each row is converted to model features on its own (batch size 1), run
    through the global ``model`` and decoded with the global ``tokenizer``.

    Args:
        df: Frame with a ``code`` column (model input) and a ``docstring``
            column (passed along as the example's target).

    Returns:
        A list of decoded strings, appended in row order — one entry per
        prediction the model returns for each row.
    """
    # Send inputs to whichever device the model lives on instead of assuming
    # that a CUDA device is present.
    device = next(model.parameters()).device

    # Inference must run with dropout disabled. Remember the current mode so
    # it can be restored afterwards and the caller's model state is untouched.
    was_training = model.training
    model.eval()

    ps = []
    try:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            examples = [
                Example(idx, source=row.code, target=row.docstring)
            ]
            eval_features = convert_examples_to_features(
                examples, tokenizer, args, stage='test'
            )
            features = eval_features[0]
            # unsqueeze(0) adds the leading batch dimension for a single example.
            source_ids = torch.tensor(features.source_ids, dtype=torch.long).unsqueeze(0).to(device)
            source_mask = torch.tensor(features.source_mask, dtype=torch.long).unsqueeze(0).to(device)

            with torch.no_grad():
                preds = model(source_ids=source_ids, source_mask=source_mask)
                for pred in preds:
                    t = list(pred[0].cpu().numpy())
                    # Everything from the first token id 0 onward is dropped
                    # (presumably padding added by the model — confirm in
                    # model.py).
                    if 0 in t:
                        t = t[:t.index(0)]
                    text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                    ps.append(text)
    finally:
        model.train(was_training)

    return ps

In [None]:
# Take the preview rows once so the predictions and the printed rows are
# guaranteed to come from the same frame.
df_preview = df_test.head(5)
preds = get_preds(df_preview)

# Pair rows and predictions by position: preds is a plain list, so indexing it
# with the DataFrame's index labels is only correct for a 0..n-1 index.
for (idx, row), pred in zip(df_preview.iterrows(), preds):
    print('CODE:\n', row.code)
    print('DOCSTRING (reference):\n', row.docstring)
    print('DOCSTRING (model):\n', pred)
    print('='*75)

  0%|          | 0/5 [00:00<?, ?it/s]

CODE:
 def sina_xml_to_url_list ( xml_data ) : rawurl = [ ] dom = parseString ( xml_data ) for node in dom . getElementsByTagName ( 'durl' ) : url = node . getElementsByTagName ( 'url' ) [ 0 ] rawurl . append ( url . childNodes [ 0 ] . data ) return rawurl
DOCSTRING (reference):
 str - > list Convert XML to URL List . From Biligrab .
DOCSTRING (model):
 Convert an SIE XML string into a list of URLs .
CODE:
 def dailymotion_download ( url , output_dir = '.' , merge = True , info_only = False , * * kwargs ) : html = get_content ( rebuilt_url ( url ) ) info = json . loads ( match1 ( html , r'qualities":({.+?}),"' ) ) title = match1 ( html , r'"video_title"\s*:\s*"([^"]+)"' ) or match1 ( html , r'"title"\s*:\s*"([^"]+)"' ) title = unicodize ( title ) for quality in [ '1080' , '720' , '480' , '380' , '240' , '144' , 'auto' ] : try : real_url = info [ quality ] [ 1 ] [ "url" ] if real_url : break except KeyError : pass mime , ext , size = url_info ( real_url ) print_info ( site_info , title 

In [None]:
import io
import tokenize

def get_code_tokens(code):
    """Split Python source text into a flat list of token strings.

    The text is run through the standard ``tokenize`` module. Layout-only
    tokens (indent, dedent, both newline kinds), comments and the end marker
    are left out; every other token contributes its exact source string.

    Args:
        code: Python source code as a single string.

    Returns:
        The list of token strings in source order.
    """
    skipped_types = {
        tokenize.INDENT,
        tokenize.DEDENT,
        tokenize.NEWLINE,
        tokenize.NL,
        tokenize.COMMENT,
        tokenize.ENDMARKER,
    }
    readline = io.StringIO(code).readline
    return [
        tok.string
        for tok in tokenize.generate_tokens(readline)
        if tok.type not in skipped_types
    ]

In [None]:
# Sample snippet to document
sample_code = """
def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
"""

# Tokenize the snippet and flatten it to one space-separated line; split()
# with no arguments already treats newlines as separators and trims the ends.
sample_tokens = get_code_tokens(sample_code)
code = ' '.join(' '.join(sample_tokens).split())
print(code)

# One-row DataFrame holding the snippet, with an empty reference docstring
sample_df = pd.DataFrame({'code': [code], 'docstring': ['']})

# Ask the model for a docstring and show it
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

def factorial ( n ) : if n == 0 : return 1 else : return n * factorial ( n - 1 )


  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 Returns the factorial of n .


In [None]:
# Sample snippet to document
sample_code = """
def prime(number):
    if number < 2:
        return False
    for i in range(2, number):
        if number % i == 0:
            return False
    return True
"""

# Tokenize the snippet and flatten it to one space-separated line; split()
# with no arguments already treats newlines as separators and trims the ends.
sample_tokens = get_code_tokens(sample_code)
code = ' '.join(' '.join(sample_tokens).split())
print(code)

# One-row DataFrame holding the snippet, with an empty reference docstring
sample_df = pd.DataFrame({'code': [code], 'docstring': ['']})

# Ask the model for a docstring and show it
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

def prime ( number ) : if number < 2 : return False for i in range ( 2 , number ) : if number % i == 0 : return False return True


  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 Check if number is prime


In [None]:
# Define a sample string as code.
# The literal is a raw string so that the \" escapes inside the snippet reach
# the tokenizer unchanged; in a normal string literal Python would turn each
# \" into a bare quote, and the tokenized snippet would no longer match the
# program as written here.
sample_code = r"""
import base64

PAYLOAD = b"cat /home/bobby/flag.txt"

encoded = base64.b64encode(PAYLOAD)
print(encoded)

command = "python3 -c '__import__(\"os\").system((__import__(\"base64\").b64decode(\"" + encoded.decode() + "\")))'"
print(command)
"""

sample_tokens = get_code_tokens(sample_code)
code=' '.join(sample_tokens).replace('\n',' ')
code=' '.join(code.strip().split())
print(code)

# Create a pandas DataFrame with the code as the 'code' column
sample_df = pd.DataFrame([{'code': code, 'docstring': ''}])

# Make predictions on the code
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

import base64 PAYLOAD = b"cat /home/bobby/flag.txt" encoded = base64 . b64encode ( PAYLOAD ) print ( encoded ) command = "python3 -c '__import__(" os ").system((__import__(" base64 ").b64decode(" " + encoded.decode() + " ")))'" print ( command )


  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 Print the contents of a file .
