In [None]:
import os
import json
import random
import subprocess

def retrieve_file(repo_path: str, file_path: str, commit_sha: str):
    '''
    Func: Retrieve file of a given version
    Args:
        repo_path: str, the repository directory
        file_path: str, the relative path for the file in 
                        terms of the repository
        commit_sha: the commit version
    Return:
        list of str, the lines of the file (line endings kept, as
        produced by readlines())
    Raise:
        KeyError: if the checkout fails or the file cannot be read.
                  The underlying exception is chained as __cause__.
    Note:
        The repository is left checked out at commit_sha.
    '''
    try:
        # Check out the specific commit.
        # The command is passed as an argument list (no shell), so commit_sha
        # is never parsed by a shell; cwd= runs git inside the repository
        # without changing the working directory of this process.
        subprocess.run(['git', 'checkout', commit_sha], cwd=repo_path, check=True)

        # Read the file as it exists at that commit
        source_file = os.path.join(repo_path, os.path.normpath(file_path))
        with open(source_file, 'r', encoding='utf-8') as f:
            return f.readlines()
    except Exception as err:
        # Keep the exception type callers already see, but catch Exception
        # (not a bare except) so Ctrl-C still interrupts a long run, and
        # chain the real cause so failures stay diagnosable.
        raise KeyError('Unable to find the file.') from err

In [None]:
# 1. Load the raw dataset, stored in JSON Lines format (one JSON object per line)
dataset = []
with open('javascript_dataset.jsonl', 'r', encoding='utf-8') as file:
    # Iterate over the file object so the raw text is not held in memory twice
    for line in file:
        # A blank line (e.g. a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise
        if not line.strip():
            continue
        dataset.append(json.loads(line))
print('Dataset size:',len(dataset))
# Only show an example when there is one; dataset[0] raises on an empty file
if dataset:
    print('Dataset example:',dataset[0])

In [3]:
# 2. group the raw samples by the commit they come from
# each raw sample describes one changed file, which may contain several changed hunks
# commit_id_dict: {commit_url: [indices into `dataset` of the samples from that commit]}
commit_id_dict = {}
for idx, i in enumerate(dataset):
    # setdefault creates the list the first time a commit url is seen
    commit_id_dict.setdefault(i['html_url'], []).append(idx)

print('The number of commits:' ,len(commit_id_dict))    

19951

# General

In [5]:
# 3. convert each edit hunk into a data sample
add = []
replace = []
remove = []
# Candidate amounts of surrounding context; one is drawn at random for each
# side of every hunk.
# NOTE(review): `random` is never seeded in this notebook, so the windows
# differ between runs.
prev_line = [3,4,5]
# NOTE(review): machine-specific absolute path; point it at the local
# directory holding the cloned repositories before running.
repos_dir = '/media/chenyan/Backup Plus/CodeEdit_raw_dataset/repos'
processed_data_dir = ''


def _load_lines(sample, sha_key, cache):
    '''Return the lines of the sample's file at sample[sha_key], retrieving each version at most once.'''
    if sha_key not in cache:
        repo_path = os.path.join(repos_dir, sample['proj_name'])
        cache[sha_key] = retrieve_file(repo_path, sample['file_path'], sample[sha_key])
    return cache[sha_key]


def _clamp_window(start_line, end_line, n_lines):
    '''Clip the slice bounds [start_line, end_line) to a file of n_lines lines.'''
    # The upper bound must be n_lines: using -1 as the slice end would
    # silently drop the last line of the file from the window.
    return max(start_line, 0), min(end_line, n_lines)


def _deletion_window(lines, del_line_idx, tag):
    '''Build (code_window, label_window) around deleted lines, labelling them with tag.'''
    label = ['keep']*len(lines)
    for del_line in del_line_idx:
        # del_line_idx is used as 1-based line numbers
        label[del_line-1] = tag

    start_line = del_line_idx[0]-random.choice(prev_line)
    end_line = del_line_idx[-1]+random.choice(prev_line)
    start_line, end_line = _clamp_window(start_line, end_line, len(lines))
    return lines[start_line:end_line], label[start_line:end_line]


def _make_sample(sample, hunk, idx, code_window, label_window, hunk_type):
    '''Assemble the record stored for one hunk.'''
    return {
        'code_window': code_window,
        'label_window': label_window,
        'commit_msg': sample['commit_msg'],
        'html_url': sample['html_url'],
        'add_line': hunk['add_line'],
        'method_name': hunk['func_name'],
        'old_file_path': sample['file_path'],
        # The dataset-building cells further down read 'file_path';
        # 'old_file_path' is kept for anything that relies on the old key.
        'file_path': sample['file_path'],
        'idx': idx,
        'hunk_type': hunk_type
    }


for idx, i in enumerate(dataset):
    # Lines of this file, keyed by 'old_sha' / 'new_sha', so that a file with
    # many hunks triggers one checkout per version instead of one per hunk
    file_cache = {}
    for j in i['changes']:
        # add: nothing deleted, so the window is cut from the new version
        # of the file with the added lines themselves left out
        if j['del_line_idx'] == [] and j['add_line_idx']:
            lines = _load_lines(i, 'new_sha', file_cache)
            label = ['keep']*len(lines)

            add_line = j['add_line_idx'][0]-1   # 0-based index of the first added line
            start_line = j['add_line_idx'][0]-random.choice(prev_line)
            mid_line = j['add_line_idx'][-1]    # 0-based index just after the last added line
            end_line = mid_line+random.choice(prev_line)+1
            start_line, end_line = _clamp_window(start_line, end_line, len(lines))

            code_window = lines[start_line:add_line] + lines[mid_line:end_line]
            # The single 'add' label marks the insertion point, so the label
            # window is one element longer than the code window
            label_window = label[start_line:add_line] + ['add'] + label[mid_line:end_line]

            add.append(_make_sample(i, j, idx, code_window, label_window, 'add'))

        # replace: lines deleted and lines added; window cut from the old version
        elif j['del_line_idx'] and j['add_line_idx']:
            lines = _load_lines(i, 'old_sha', file_cache)
            code_window, label_window = _deletion_window(lines, j['del_line_idx'], 'replace')
            replace.append(_make_sample(i, j, idx, code_window, label_window, 'replace'))

        # remove: lines deleted only; window cut from the old version
        elif j['del_line_idx'] and j['add_line_idx'] == []:
            lines = _load_lines(i, 'old_sha', file_cache)
            code_window, label_window = _deletion_window(lines, j['del_line_idx'], 'remove')
            remove.append(_make_sample(i, j, idx, code_window, label_window, 'remove'))

print('The number of add type hunks:', len(add))
print('The number of replace type hunks:', len(replace))
print('The number of remove type hunks:', len(remove))

FileNotFoundError: [Errno 2] No such file or directory: '.././repos/alekseykulikov_storage_0065c63f6ffa6af38fcdcebf4f3a1af21ce7af4e/component.json'

In [120]:
# 4. group the per-hunk data samples by the commit they belong to
# result_dict: {commit url: [data samples in this commit]}
result_dict = {} 
for i in add+replace+remove:
    html_url = i['html_url']
    # setdefault starts a new list the first time a commit url appears
    result_dict.setdefault(html_url, []).append(i)

# create dataset: edit generation

In [131]:
# Generator input format: code window + label window + commit message + previous changes from the same commit (retrieved context)

In [144]:
# rank by similarity
# For every hunk, build one generator sample:
#   input  = code window </s> label window </s> commit message </s> context...
#   output = the added line(s) of the hunk
# The context consists of the edits made by the (up to) 5 other hunks of the
# same commit that BM25 ranks as most similar to this hunk's code window.
from rank_bm25 import BM25Okapi
output = []
for commit in sorted(result_dict.keys()):
    for co_change in result_dict[commit]:
        code_window = ''.join(co_change['code_window'])
        label_window = ' '.join(co_change['label_window'])
        commit_message = co_change['commit_msg']
        context = []
        if len(code_window) == 0:
            continue
        
        # BM25 search for related context among the other hunks of this commit
        prev_edit = result_dict[commit].copy()
        prev_edit.remove(co_change)
        # With no other hunk there is nothing to rank; context stays empty
        if prev_edit:
            try:
                tokenized_corpus = [''.join(i['code_window']+[i['add_line']]).split() for i in prev_edit]
                bm25 = BM25Okapi(tokenized_corpus) # build a BM25 object with other hunks
                tokenized_query = code_window.split()
                # Rank positions rather than documents: looking a returned
                # document up with list.index() maps duplicate documents to
                # the first occurrence, so the same hunk would be used twice.
                context_index = bm25.get_top_n(tokenized_query, list(range(len(prev_edit))), n=5)

                # form context, which are the deleted and added lines in the top 5 similar hunks
                for ctx_idx in context_index:
                    related = prev_edit[ctx_idx]
                    # Local names deliberately differ from the global lists
                    # `replace` / `remove`, which must not be overwritten here
                    if related['hunk_type'] == 'replace':
                        first_replaced = related['label_window'].index('replace')
                        context.append('remove '+ related['code_window'][first_replaced])
                        context.append('add '+ related['add_line'])

                    elif related['hunk_type'] == 'remove':
                        first_removed = related['label_window'].index('remove')
                        context.append('remove '+ related['code_window'][first_removed])

                    # Test hunk_type: label_window is a list and never equals 'add'
                    elif related['hunk_type'] == 'add':
                        context.append('add '+ related['add_line'])
            except Exception:
                # Context is best-effort: on failure the sample keeps whatever
                # context was collected so far. Unlike a bare except, this
                # still lets KeyboardInterrupt stop the cell.
                pass
        
        input_ = ' </s> '.join([code_window, label_window, commit_message] + context)
        output_ =   co_change['add_line']
        html_url =  co_change['html_url']
        # The hunk records store the path under 'old_file_path'
        file_name = co_change['old_file_path']
        output.append({"docstring_tokens":output_, "code_tokens":input_, "html_url":html_url, "file_name":file_name})
 

In [150]:
# Show how many samples are currently held in `output`
len(output)

209585

In [153]:
import jsonlines
# final data format: {"docstring_tokens":doc_tokens, "code_tokens":code_tokens}
# `output` is split by position: first 70% train, next 10% dev, last 20% test.

# Opening a file for writing fails if its directory does not exist yet
os.makedirs(os.path.join(processed_data_dir, 'generator'), exist_ok=True)

# Compute the split boundaries once so the three slices cannot drift apart
train_end = int(0.7*len(output))
dev_end = int(0.8*len(output))
generator_splits = (
    ('generator/train.jsonl', output[:train_end]),
    ('generator/dev.jsonl', output[train_end:dev_end]),
    ('generator/test.jsonl', output[dev_end:]),
)
for split_path, split_items in generator_splits:
    with jsonlines.open(os.path.join(processed_data_dir, split_path), 'w') as f:
        for item in split_items:
            f.write(item)

# create dataset: edit locator

In [None]:
# rank by similarity
# For every hunk, build one locator sample:
#   input  = code window </s> commit message </s> context...
#   output = the label window (one label per line of the code window)
# The context consists of the edits made by the (up to) 5 other hunks of the
# same commit that BM25 ranks as most similar to this hunk's code window.
from rank_bm25 import BM25Okapi
output = []
for commit in result_dict:
    for co_change in result_dict[commit]:
        code_window = ''.join(co_change['code_window'])
        label_window = ' '.join(co_change['label_window'])
        commit_message = co_change['commit_msg']
        context = []
        if len(code_window) == 0:
            continue
        
        # BM25 search for related context among the other hunks of this commit
        prev_edit = result_dict[commit].copy()
        prev_edit.remove(co_change)
        # With no other hunk there is nothing to rank; context stays empty
        if prev_edit:
            try:
                tokenized_corpus = [''.join(i['code_window']+[i['add_line']]).split() for i in prev_edit]
                bm25 = BM25Okapi(tokenized_corpus)
                tokenized_query = code_window.split()
                # Rank positions rather than documents: looking a returned
                # document up with list.index() maps duplicate documents to
                # the first occurrence, so the same hunk would be used twice.
                context_index = bm25.get_top_n(tokenized_query, list(range(len(prev_edit))), n=5)

                for ctx_idx in context_index:
                    related = prev_edit[ctx_idx]
                    # Local names deliberately differ from the global lists
                    # `replace` / `remove`, which must not be overwritten here
                    if related['hunk_type'] == 'replace':
                        first_replaced = related['label_window'].index('replace')
                        context.append('remove '+ related['code_window'][first_replaced])
                        context.append('add '+ related['add_line'])

                    elif related['hunk_type'] == 'remove':
                        first_removed = related['label_window'].index('remove')
                        context.append('remove '+ related['code_window'][first_removed])

                    elif related['hunk_type'] == 'add':
                        context.append('add '+ related['add_line'])
            except Exception:
                # Context is best-effort: on failure the sample keeps whatever
                # context was collected so far. Unlike a bare except, this
                # still lets KeyboardInterrupt stop the cell.
                pass
        
        input_ = ' </s> '.join([code_window, commit_message] + context)
        html_url =  co_change['html_url']
        # The hunk records store the path under 'old_file_path'
        file_name = co_change['old_file_path']
        output.append({"docstring_tokens":label_window, "code_tokens":input_, "html_url":html_url, "file_name":file_name})
 

In [None]:
import jsonlines
# write the edit locator dataset
# final data format: {"docstring_tokens":doc_tokens, "code_tokens":code_tokens}
# `output` is split by position: first 70% train, next 10% dev, last 20% test.

# Opening a file for writing fails if its directory does not exist yet
os.makedirs(os.path.join(processed_data_dir, 'locator'), exist_ok=True)

# Compute the split boundaries once so the three slices cannot drift apart
train_end = int(0.7*len(output))
dev_end = int(0.8*len(output))
locator_splits = (
    ('locator/train.jsonl', output[:train_end]),
    ('locator/dev.jsonl', output[train_end:dev_end]),
    ('locator/test.jsonl', output[dev_end:]),
)
for split_path, split_items in locator_splits:
    with jsonlines.open(os.path.join(processed_data_dir, split_path), 'w') as f:
        for item in split_items:
            f.write(item)