In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
drive_path = "./drive/MyDrive"

In [4]:
os.chdir(os.path.join(drive_path, "GithubPythonCode/code"))

In [5]:
model_type = "roberta"
pretrained_model = "microsoft/codebert-base"

lang = "python"  # programming language
output_dir = f"model/{lang}"

data_dir = "../dataset"
train_file = f"{data_dir}/{lang}/train.jsonl"
dev_file = f"{data_dir}/{lang}/valid.jsonl"

source_length = 256
target_length = 128

lr = 5e-5
beam_size = 10
batch_size = 16  # increased batch size for better GPU utilization
decay = 0.01
warmup = 500
epochs = 50

In [6]:
!python run_earlystopping.py \
  --do_train \
  --do_eval \
  --model_type $model_type \
  --model_name_or_path $pretrained_model \
  --train_filename $train_file \
  --dev_filename $dev_file \
  --output_dir $output_dir \
  --max_source_length $source_length \
  --max_target_length $target_length \
  --beam_size $beam_size \
  --train_batch_size $batch_size \
  --eval_batch_size $batch_size \
  --learning_rate $lr \
  --weight_decay $decay \
  --warmup_steps $warmup \
  --num_train_epochs $epochs

05/10/2023 10:30:10 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='model/python', load_model_path=None, train_filename='../dataset/python/train.jsonl', dev_filename='../dataset/python/valid.jsonl', test_filename=None, config_name='', tokenizer_name='', max_source_length=256, max_target_length=128, do_train=True, do_eval=True, do_test=False, do_lower_case=False, no_cuda=False, train_batch_size=16, eval_batch_size=16, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.01, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=50, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=500, local_rank=-1, seed=42)
Downloading (…)lve/main/config.json: 100% 498/498 [00:00<00:00, 2.16MB/s]
Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 3.57MB/s]
Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 1.85MB/s]
Downloading (…)cial_tokens_map.json: 100% 150/150 [00:00<

In [7]:
test_model=f"{output_dir}/checkpoint-best-bleu/pytorch_model.bin" #checkpoint for test
test_file = f"{data_dir}/{lang}/test.jsonl"

In [8]:
!python run.py \
  --do_test \
  --model_type $model_type \
  --model_name_or_path $pretrained_model \
  --load_model_path $test_model \
  --dev_filename $dev_file \
  --test_filename $test_file \
  --output_dir $output_dir \
  --max_source_length $source_length \
  --max_target_length $target_length \
  --beam_size $beam_size \
  --eval_batch_size $batch_size

05/10/2023 13:38:08 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='model/python', load_model_path='model/python/checkpoint-best-bleu/pytorch_model.bin', train_filename=None, dev_filename='../dataset/python/valid.jsonl', test_filename='../dataset/python/test.jsonl', config_name='', tokenizer_name='', max_source_length=256, max_target_length=128, do_train=False, do_eval=False, do_test=True, do_lower_case=False, no_cuda=False, train_batch_size=8, eval_batch_size=16, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
05/10/2023 13:38:10 - INFO - __main__ -   reload model from model/python/checkpoint-best-bleu/pytorch_model.bin
05/10/2023 13:38:14 - INFO - __main__ -   Test file: ../dataset/python/valid.jsonl
100% 10/10 [04:54<00:00, 29.47s/it]
Total: 1

In [9]:
!python ../evaluator/evaluator.py model/$lang/test_1.gold < model/$lang/test_1.output

Total: 150
24.96619717236244


In [10]:
import torch

import torch.nn as nn

from transformers import AutoTokenizer
from model import Seq2Seq
from transformers import RobertaConfig, RobertaModel

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config = config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(encoder = encoder,decoder = decoder,config=config,
                beam_size=beam_size,max_length=target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
model.load_state_dict(torch.load(test_model))
model.to('cuda')

Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [11]:
import pandas as pd
import json

def jsonl_to_dataframe(jsonl_file):
    data = []

    with open(jsonl_file, 'r') as file:
        for line in file:
            json_data = json.loads(line)
            code = ' '.join(json_data['code_tokens']).replace('\n',' ')
            code = ' '.join(code.strip().split())
            docstring = ' '.join(json_data['docstring_tokens']).replace('\n','')
            docstring = ' '.join(docstring.strip().split())   
            data.append({'code': code, 'docstring': docstring})

    df = pd.DataFrame(data, columns=['code', 'docstring'])
    return df

In [12]:
df_train = jsonl_to_dataframe(train_file)
df_dev = jsonl_to_dataframe(dev_file)
df_test = jsonl_to_dataframe(test_file)

len(df_train), len(df_dev), len(df_test)

(700, 150, 150)

In [13]:
df_test.head(5)

Unnamed: 0,code,docstring
0,# ! /usr/bin/env python2 # Used to generate so...,This code snippet generates icons of various s...
1,import torch from torch.autograd import Functi...,This code snippet implements a truncated expon...
2,import argparse from ast import arg from recbo...,This code snippet runs a RecBole model on a gi...
3,"import bpy import os , sys , subprocess class ...",`` `` '' This code snippet creates an operator...
4,import re import html xmlbody = `` '' '' < ? x...,`` `` '' This code snippet generates an OPML f...


In [14]:
from run import convert_examples_to_features, Example
from tqdm.auto import tqdm

class Args:
    max_source_length = source_length
    max_target_length = target_length

args = Args()

def get_preds(df: pd.DataFrame):
    ps = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        examples = [
            Example(idx, source = row.code, target = row.docstring)
        ]
        eval_features = convert_examples_to_features(
            examples, tokenizer, args, stage='test'
        )
        source_ids = torch.tensor(eval_features[0].source_ids, dtype = torch.long).unsqueeze(0).to('cuda')
        source_mask = torch.tensor(eval_features[0].source_mask, dtype = torch.long).unsqueeze(0).to('cuda')

        with torch.no_grad():
            preds = model(source_ids = source_ids, source_mask = source_mask)  
            for pred in preds:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                ps.append(text)
    
    return ps

In [15]:
preds = get_preds(df_test.head(5))
for idx, row in df_test.head(5).iterrows():
    print('CODE:\n', row.code)
    print('DOCSTRING (reference):\n', row.docstring)
    print('DOCSTRING (model):\n', preds[idx])
    print('='*75)

  0%|          | 0/5 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1080 > 512). Running this sequence through the model will result in indexing errors


CODE:
 # ! /usr/bin/env python2 # Used to generate some icons # Requires inkscape and imagemagick pacages import os , subprocess , colorsys from xml.etree import ElementTree as ET ICODIR = `` ./images/ '' # Directory with icons CICONS = `` ./images/controller-icons/ '' # Directory controller-icons RECOLORS = { # Defines set of hue shifts for controller-icons # `` 0 '' : 0.0 , # Green - original '' 1 '' : 0.3 , # Blue '' 2 '' : 0.7 , # Red '' 3 '' : 0.9 , # Yellow '' 4 '' : 0.2 , # Cyan '' 5 '' : 0.8 , # Orange '' 6 '' : 0.5 , # Purple } # Generate svg state icons for size in ( 24 , 256 ) : for state in ( 'alive ' , 'dead ' , 'error ' , 'unknown ' ) : print `` scc-statusicon- % s.png '' % ( state , ) subprocess.call ( [ '' inkscape '' , '' % s/scc-statusicon- % s.svg '' % ( ICODIR , state ) , '' -- export-area-page '' , '' -- export-png= % s/ % sx % s/status/scc- % s.png '' % ( ICODIR , size , size , state ) , '' -- export-width= % s '' % ( size , ) , '' -- export-height= % s '' % ( siz

In [16]:
import io
import tokenize

def get_code_tokens(code):
    tokens = []
    code_stream = io.StringIO(code)
    for token in tokenize.generate_tokens(code_stream.readline):
        if token.type not in (tokenize.INDENT, tokenize.DEDENT, tokenize.NEWLINE, tokenize.NL, tokenize.COMMENT, tokenize.ENDMARKER):
            tokens.append(token.string)
    return tokens

In [17]:
# Define a sample string as code
sample_code = """
def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
"""

sample_tokens = get_code_tokens(sample_code)
code=' '.join(sample_tokens).replace('\n',' ')
code=' '.join(code.strip().split())

# Create a pandas DataFrame with the code as the 'code' column
sample_df = pd.DataFrame([{'code': code, 'docstring': ''}])

# Make predictions on the code
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 `` `` '' This code snippet defines a function for a neural network using PyTorch . It takes in a function from a list of size , and then calculates the device . Finally , it returns a rate of size of the number of a function . '' '' '' ''


In [18]:
# Define a sample string as code
sample_code = """
def prime(number):
    if number < 2:
        return False
    for i in range(2, number):
        if number % i == 0:
            return False
    return True
"""

sample_tokens = get_code_tokens(sample_code)
code=' '.join(sample_tokens).replace('\n',' ')
code=' '.join(code.strip().split())

# Create a pandas DataFrame with the code as the 'code' column
sample_df = pd.DataFrame([{'code': code, 'docstring': ''}])

# Make predictions on the code
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 `` `` '' This code snippet creates a command line interface for a given domain . It takes in an index , a list of time , and creates a list of size . '' '' ''


In [19]:
# Define a sample string as code
sample_code = """
import base64

PAYLOAD = b"cat /home/bobby/flag.txt"

encoded = base64.b64encode(PAYLOAD)
print(encoded)

command = "python3 -c '__import__(\"os\").system((__import__(\"base64\").b64decode(\"" + encoded.decode() + "\")))'"
print(command)
"""

sample_tokens = get_code_tokens(sample_code)
code=' '.join(sample_tokens).replace('\n',' ')
code=' '.join(code.strip().split())

# Create a pandas DataFrame with the code as the 'code' column
sample_df = pd.DataFrame([{'code': code, 'docstring': ''}])

# Make predictions on the code
predictions = get_preds(sample_df)
print("DOCSTRING (model):\n", predictions[0])

  0%|          | 0/1 [00:00<?, ?it/s]

DOCSTRING (model):
 `` `` '' This code snippet sets up a Python package using the setuptools library . It imports the necessary modules and writes the version of the __init__.py file , and then prints a message to the file . '' '' ''
