In [None]:
# Fetch the code-docstring-corpus repository and decompress the train split
# of the declaration+body file in place (gzip -d replaces the .gz with the
# plain file that is opened further down).
# NOTE(review): only the train split is decompressed here; the valid/test
# files are read as-is below, so presumably they ship uncompressed — confirm.
!git clone https://github.com/EdinburghNLP/code-docstring-corpus.git
!gzip -d code-docstring-corpus/parallel-corpus/data_ps.declbodies.train.gz

Cloning into 'code-docstring-corpus'...
remote: Enumerating objects: 357, done.[K
remote: Total 357 (delta 0), reused 0 (delta 0), pack-reused 357[K
Receiving objects: 100% (357/357), 858.21 MiB | 30.33 MiB/s, done.
Resolving deltas: 100% (186/186), done.
Updating files: 100% (163/163), done.


In [None]:
# Collect every line of the three declbodies splits into a single list,
# in train -> valid -> test order. Each line is one escaped code sample.
declbody_paths = (
    "code-docstring-corpus/parallel-corpus/data_ps.declbodies.train",
    "code-docstring-corpus/parallel-corpus/data_ps.declbodies.valid",
    "code-docstring-corpus/parallel-corpus/data_ps.declbodies.test",
)

orig_samples = []
for declbody_path in declbody_paths:
    with open(declbody_path) as f:
        orig_samples.extend(f)

In [None]:
import re

def prepare_declbody(declbody):
    """Turn one escaped corpus sample back into multi-line source text.

    Runs of spaces are collapsed, every ' DCNL ' becomes a newline, and the
    run of 'DCSP ' tokens at the start of a line becomes one tab per token.

    'DCSP ' tokens that are NOT at the start of a line are restored as a
    single space instead of a tab. Turning them into tabs corrupted string
    literals in the generated files, e.g. 'Added \\tuser \\t{}' instead of
    'Added user {}'.
    NOTE(review): this assumes an interior DCSP stands for one space of the
    original literal — confirm against the corpus documentation.

    Args:
        declbody: one raw line of a data_ps.declbodies.* file.

    Returns:
        The sample with newlines and tab indentation restored.
    """
    declbody = re.sub(' +', ' ', declbody)
    declbody = re.sub(' DCNL ', '\n', declbody)

    restored_lines = []
    for line in declbody.split('\n'):
        # Leading DCSP tokens are indentation levels: one tab each.
        indent = re.match(r'(?:DCSP )*', line).group(0)
        depth = len(indent) // len('DCSP ')
        # Any remaining DCSP sits inside the statement (string literals).
        rest = re.sub(r' ?DCSP ', ' ', line[len(indent):])
        restored_lines.append('\t' * depth + rest)
    return '\n'.join(restored_lines)

def prepare_tokens_for_search(tokens):
    """Build the lookup key for a tokenised snippet.

    The tokens are concatenated without a separator and only ASCII letters
    and digits are kept, so whitespace and punctuation differences do not
    affect the comparison.
    """
    joined = "".join(tokens)
    # Keeping each alphanumeric character is the same as deleting every
    # run of non-alphanumeric characters.
    return "".join(re.findall(r'[a-zA-Z0-9]', joined))

def prepare_sample_for_search(sample):
    """Build the lookup key for one escaped corpus sample.

    The escape tokens (' DCNL ', 'DCSP '), all spaces and finally every
    non-alphanumeric character are removed, in that order.
    """
    key = sample
    # Order matters: the escape tokens must go before spaces are stripped.
    for pattern in (' DCNL ', 'DCSP ', ' +', r'[^a-zA-Z0-9]+'):
        key = re.sub(pattern, '', key)
    return key

In [None]:
def is_original_of_tokens(tokens, prep_sample):
    """Tell whether an already-normalised sample matches the given tokens.

    `tokens` is normalised here with prepare_tokens_for_search;
    `prep_sample` is expected to be normalised by the caller.
    """
    return prep_sample == prepare_tokens_for_search(tokens)

def find_tokens_original(tokens):
    """Return the index of the original sample that matches `tokens`.

    Scans the module-level prep_samples list and returns the position of
    the first normalised sample equal to the normalised tokens.

    Args:
        tokens: tokenised snippet, as accepted by prepare_tokens_for_search.

    Returns:
        Index into prep_samples of the first match.

    Raises:
        Exception: if no sample matches.
    """
    # Normalise once up front instead of once per candidate sample.
    target = prepare_tokens_for_search(tokens)
    for i, sample in enumerate(prep_samples):
        if sample == target:
            return i
    raise Exception('Tokens original not found')

In [None]:
# Normalised lookup key for every corpus sample, in the same order as orig_samples.
prep_samples = list(map(prepare_sample_for_search, orig_samples))

In [None]:
# Lookup key -> index into prep_samples. When several samples share a key,
# the highest index wins because later entries overwrite earlier ones.
prep_samples_dict = dict(zip(prep_samples, range(len(prep_samples))))

In [None]:
# Unpack the QA dataset archive; per the recorded output it extracts
# data/train.py.jsonl, data/test.py.jsonl and data/dev.py.jsonl.
# NOTE(review): assumes Google Drive is already reachable under ./drive —
# no mount step is visible in this notebook, confirm before a fresh run.
!unzip drive/MyDrive/UnixCoderQAData.zip

Archive:  drive/MyDrive/UnixCoderQAData.zip
  inflating: data/train.py.jsonl     
  inflating: data/test.py.jsonl      
  inflating: data/dev.py.jsonl       


# Train data

In [None]:
import json
# Parse the train split: one JSON object per line of the file.
with open('data/train.py.jsonl') as f:
    train_data = list(map(json.loads, f))

In [None]:
# Lookup keys for the codeqa records of the train split only; records from
# other sources are skipped, so positions here are not train_data positions.
prep_tokens = [
    prepare_tokens_for_search(record['code'])
    for record in train_data
    if record['src'] == 'codeqa'
]

In [None]:
from tqdm import tqdm

# Map the position of every codeqa record in train_data to the index of its
# original sample in orig_samples.
#
# prep_tokens holds the codeqa records only, so its positions differ from
# train_data positions as soon as a non-codeqa record precedes a codeqa one.
# The cell that rewrites item['code'] looks tokens2orig up by train_data
# position, so the keys must be train_data positions.
codeqa_positions = [pos for pos, item in enumerate(train_data) if item['src'] == 'codeqa']

# prep_tokens is rebuilt for every split under the same name; fail loudly if
# it is stale instead of silently pairing records with the wrong samples.
assert len(codeqa_positions) == len(prep_tokens), \
    'prep_tokens does not belong to train_data; re-run the cell that builds it'

tokens2orig = {}

for pos, prep_token in zip(codeqa_positions, tqdm(prep_tokens)):
    # A KeyError here means the snippet has no counterpart in the corpus.
    tokens2orig[pos] = prep_samples_dict[prep_token]

100%|██████████| 56085/56085 [00:00<00:00, 185392.97it/s]


In [None]:
# Replace the code of every codeqa train record with the restored original
# sample. tokens2orig is looked up with the record's position in train_data.
for idx, record in enumerate(train_data):
    if record['src'] != 'codeqa':
        continue
    record['code'] = prepare_declbody(orig_samples[tokens2orig[idx]])

In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0


In [None]:
import jsonlines

# Dump every record of train_data to train.py.jsonl in the working directory.
train_writer = jsonlines.open('train.py.jsonl', mode='w')
with train_writer:
    train_writer.write_all(train_data)

In [None]:
# Number of records in the train split (all 'src' values, not only codeqa).
len(train_data)

71628

# Dev data

In [None]:
import json
# Parse the dev split: one JSON object per line of the file.
with open('data/dev.py.jsonl') as f:
    dev_data = list(map(json.loads, f))

In [None]:
# Lookup keys for the codeqa records of the dev split only; records from
# other sources are skipped, so positions here are not dev_data positions.
prep_tokens = [
    prepare_tokens_for_search(record['code'])
    for record in dev_data
    if record['src'] == 'codeqa'
]

In [None]:
from tqdm import tqdm

# Map the position of every codeqa record in dev_data to the index of its
# original sample in orig_samples.
#
# prep_tokens holds the codeqa records only, so its positions differ from
# dev_data positions as soon as a non-codeqa record precedes a codeqa one.
# The cell that rewrites item['code'] looks tokens2orig up by dev_data
# position, so the keys must be dev_data positions.
codeqa_positions = [pos for pos, item in enumerate(dev_data) if item['src'] == 'codeqa']

# prep_tokens is rebuilt for every split under the same name; fail loudly if
# it is stale instead of silently pairing records with the wrong samples.
assert len(codeqa_positions) == len(prep_tokens), \
    'prep_tokens does not belong to dev_data; re-run the cell that builds it'

tokens2orig = {}

for pos, prep_token in zip(codeqa_positions, tqdm(prep_tokens)):
    # A KeyError here means the snippet has no counterpart in the corpus.
    tokens2orig[pos] = prep_samples_dict[prep_token]

100%|██████████| 7000/7000 [00:00<00:00, 198064.75it/s]


In [None]:
# Replace the code of every codeqa dev record with the restored original
# sample. tokens2orig is looked up with the record's position in dev_data.
for idx, record in enumerate(dev_data):
    if record['src'] != 'codeqa':
        continue
    record['code'] = prepare_declbody(orig_samples[tokens2orig[idx]])

In [None]:
import jsonlines

# Dump every record of dev_data to dev.py.jsonl in the working directory.
dev_writer = jsonlines.open('dev.py.jsonl', mode='w')
with dev_writer:
    dev_writer.write_all(dev_data)

In [None]:
# Eyeball the first five records of the freshly written dev file.
!head -n5 dev.py.jsonl

{"code": "@pytest.mark.django_db\ndef test_vfolder_with_no_filter_rules():\n\tvfolder_item = {'name': 'whatever', 'priority': 4, 'is_public': True, 'filter_rules': ''}\n\twith pytest.raises(ValidationError) as excinfo:\n\t\tVirtualFolder.objects.create(**vfolder_item)\n\tassert (u'Some \tfiltering \trule \tmust \tbe \tspecified.' in str(excinfo.value))\n\tvfolder_item['filter_rules'] = 'FOO'\n\tvf = VirtualFolder.objects.create(**vfolder_item)\n\tvf.filter_rules = ''\n\twith pytest.raises(ValidationError) as excinfo:\n\t\tvf.save()\n\tassert (u'Some \tfiltering \trule \tmust \tbe \tspecified.' in str(excinfo.value))\n", "code_processed": "@pytest mark django dbdef test vfolder with no filter rules vfolder item {'name' 'whatever' 'priority' 4 'is public' True 'filter rules' ''}with pytest raises Validation Error as excinfo Virtual Folder objects create **vfolder item assert u' Somefilteringrulemustbespecified ' in str excinfo value vfolder item['filter rules'] 'FOO'vf Virtual Folder obj

In [None]:
# Number of records in the dev split (all 'src' values, not only codeqa).
len(dev_data)

9847

# Test data

In [None]:
import json
# Parse the test split: one JSON object per line of the file.
with open('data/test.py.jsonl') as f:
    test_data = list(map(json.loads, f))

In [None]:
# Lookup keys for the codeqa records of the test split only; records from
# other sources are skipped, so positions here are not test_data positions.
prep_tokens = [
    prepare_tokens_for_search(record['code'])
    for record in test_data
    if record['src'] == 'codeqa'
]

In [None]:
from tqdm import tqdm

# Map the position of every codeqa record in test_data to the index of its
# original sample in orig_samples.
#
# prep_tokens holds the codeqa records only, so its positions differ from
# test_data positions as soon as a non-codeqa record precedes a codeqa one.
# The cell that rewrites item['code'] looks tokens2orig up by test_data
# position, so the keys must be test_data positions.
codeqa_positions = [pos for pos, item in enumerate(test_data) if item['src'] == 'codeqa']

# prep_tokens is rebuilt for every split under the same name; fail loudly if
# it is stale instead of silently pairing records with the wrong samples.
assert len(codeqa_positions) == len(prep_tokens), \
    'prep_tokens does not belong to test_data; re-run the cell that builds it'

tokens2orig = {}

for pos, prep_token in zip(codeqa_positions, tqdm(prep_tokens)):
    # A KeyError here means the snippet has no counterpart in the corpus.
    tokens2orig[pos] = prep_samples_dict[prep_token]

100%|██████████| 7000/7000 [00:00<00:00, 121471.42it/s]


In [None]:
# Replace the code of every codeqa test record with the restored original
# sample. tokens2orig is looked up with the record's position in test_data.
for idx, record in enumerate(test_data):
    if record['src'] != 'codeqa':
        continue
    record['code'] = prepare_declbody(orig_samples[tokens2orig[idx]])

In [None]:
import jsonlines

# Dump every record of test_data to test.py.jsonl in the working directory.
test_writer = jsonlines.open('test.py.jsonl', mode='w')
with test_writer:
    test_writer.write_all(test_data)

In [None]:
# Eyeball the first five records of the freshly written test file.
!head -n5 test.py.jsonl

{"code": "def add_bucket_default_owner(bucket_name, user_email):\n\tstorage_client = storage.Client()\n\tbucket = storage_client.bucket(bucket_name)\n\tbucket.acl.reload()\n\tbucket.default_object_acl.user(user_email).grant_owner()\n\tbucket.default_object_acl.save()\n\tprint 'Added \tuser \t{} \tas \tan \towner \tin \tthe \tdefault \tacl \ton \tbucket \t{}.'.format(user_email, bucket_name)\n", "code_processed": "def add bucket default owner bucket name user email storage client storage Client bucket storage client bucket bucket name bucket acl reload bucket default object acl user user email grant owner bucket default object acl save print ' Addeduser{}asanownerinthedefaultaclonbucket{} ' format user email bucket name\n", "question": "How does the code add a user in the given buckets default object access control list ?\n", "answers": "as an owner\n", "src": "codeqa"}
{"code": "def add_bucket_default_owner(bucket_name, user_email):\n\tstorage_client = storage.Client()\n\tbucket = stor

In [None]:
# Number of records in the test split (all 'src' values, not only codeqa).
len(test_data)

9847

In [None]:
!zip -r UltimateQADataset.zip UltimateQADataset/

  adding: UltimateQADataset/ (stored 0%)
  adding: UltimateQADataset/test.py.jsonl (deflated 80%)
  adding: UltimateQADataset/dev.py.jsonl (deflated 80%)
  adding: UltimateQADataset/train.py.jsonl (deflated 81%)


In [None]:
# Copy the packaged dataset to drive/MyDrive, the folder the input archive was read from.
!cp UltimateQADataset.zip drive/MyDrive/