In [1]:
import json
from pprint import pprint
from pathlib import Path

import pandas as pd
pd.set_option('max_colwidth',300)

In [2]:
with open('data/python/final/jsonl/train/python_train_0.jsonl', 'r') as f:
    sample_file = f.readlines()
pprint(json.loads(sample_file[0]))

{'code': 'def train(train_dir, model_save_path=None, n_neighbors=None, '
         "knn_algo='ball_tree', verbose=False):\n"
         '    """\n'
         '    Trains a k-nearest neighbors classifier for face recognition.\n'
         '\n'
         '    :param train_dir: directory that contains a sub-directory for '
         'each known person, with its name.\n'
         '\n'
         '     (View in source code to see train_dir example tree structure)\n'
         '\n'
         '     Structure:\n'
         '        <train_dir>/\n'
         '        ├── <person1>/\n'
         '        │   ├── <somename1>.jpeg\n'
         '        │   ├── <somename2>.jpeg\n'
         '        │   ├── ...\n'
         '        ├── <person2>/\n'
         '        │   ├── <somename1>.jpeg\n'
         '        │   └── <somename2>.jpeg\n'
         '        └── ...\n'
         '\n'
         '    :param model_save_path: (optional) path to save model on disk\n'
         '    :param n_neighbors: (optional) number of 

In [3]:
# files = sorted(Path('data/').glob('**/*.jsonl'))
files = sorted(Path('data/python/').glob('**/*.jsonl'))

columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                #    compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

df = jsonl_list_to_dataframe(files, columns_long_list)
df.head(1)

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108,"def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n """"""\n Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in s...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...","Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in source code to see train_dir example tree structure)\n\n Structure:\n <train_dir>/\n ├── <person...","[Trains, a, k, -, nearest, neighbors, classifier, for, face, recognition, .]",python,train


In [4]:
# focus on a python file first
print(df.language.value_counts())

# What happens next?
# 1. filter out code_tokens that start with #(comments)
# 2. concatenate all the tokens into a code string
# done with further processing

columns_short_list = ['code_tokens']
code = jsonl_list_to_dataframe(files, columns_short_list)

language
python    30000
Name: count, dtype: int64


In [5]:
code['filtered_code_tokens'] = [[token for token in row if len(token) > 0 and token[0] != '#']
                                for row in code['code_tokens']]
code.head(1)

Unnamed: 0,code_tokens,filtered_code_tokens
0,"[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir, (, os, ., path, ., join, (, train_dir, ,, clas..."


In [6]:
code['code_string'] = [' '.join(row) for row in code['filtered_code_tokens']]
code.head(1)

Unnamed: 0,code_tokens,filtered_code_tokens,code_string
0,"[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir, (, os, ., path, ., join, (, train_dir, ,, clas...","def train ( train_dir , model_save_path = None , n_neighbors = None , knn_algo = 'ball_tree' , verbose = False ) : X = [ ] y = [ ] for class_dir in os . listdir ( train_dir ) : if not os . path . isdir ( os . path . join ( train_dir , class_dir ) ) : continue for img_path in image_files_in_folde..."


In [7]:
# introduce the local language model to do the "multi mask filling"

from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")

print(tokenizer.mask_token_id)
print(tokenizer.vocab_size)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


50264
50265


In [8]:
code = "if ( <mask> is not None ) <mask> ( x > 1 )"  # simulate the multi mask scenario
token_ids = tokenizer.encode(code, return_tensors='pt')
masked_position = (token_ids.squeeze() == tokenizer.mask_token_id).nonzero()
masked_pos = [mask.item() for mask in masked_position]
# masked_pos  # [3, 8]

In [9]:
import torch
import torch.nn.functional as F

with torch.no_grad():
    output = model(token_ids)
output.logits.shape  # output[0].shape is torch.Size([1, 15, 50265])

torch.Size([1, 15, 50265])

In [10]:
last_hidden_state = output[0].squeeze()

list_of_list = []  # multiple guessings for each masked token
for mask_index in masked_pos:
    mask_hidden_state = last_hidden_state[mask_index]
    top_values, top_indices = torch.topk(mask_hidden_state, k=5, dim=0)
    top_prob = F.softmax(top_values, dim=0)
    top_words = [tokenizer.decode(i.item()).strip() for i in top_indices]
    list_of_list.append((top_words, top_indices.tolist(), top_prob.tolist()))

list_of_list

[(['x', 'x', 'y', 'y', 'z'],
  [3023, 1178, 1423, 219, 992],
  [0.7283328175544739,
   0.20617249608039856,
   0.05017666518688202,
   0.010286793112754822,
   0.005031223874539137]),
 (['&', '#', '//', '*', '*'],
  [359, 849, 21277, 3226, 1009],
  [0.5606948137283325,
   0.15611280500888824,
   0.11635690182447433,
   0.11041409522294998,
   0.0564214363694191])]