### Testing CodeBERT embeddings

In [6]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
# import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
import warnings
# Silence every warning for the rest of the session: the 'ignore' action is
# installed without any category/module filter, so it applies globally.
warnings.filterwarnings('ignore')

In [3]:
# Pretrained CodeBERT checkpoint shared by the tokenizer and the encoder.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"

tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)

In [4]:
# Directory this notebook reads its input pickle from and writes the embedded chunks to.
# The trailing slash matters: file paths are built by plain string concatenation
# (DATA_PATH + '<file name>').
DATA_PATH = "../data/"

# 1. TESTS

Basic test

In [5]:
# Tokenize the natural-language description and the code snippet separately.
nl_tokens = tokenizer.tokenize("returns maximum value")
code_tokens = tokenizer.tokenize("def max(a,b): if a>b: return a else return b")

# Bimodal input: [CLS] + NL tokens + [SEP] + code tokens + [SEP]
tokens = [tokenizer.cls_token, *nl_tokens, tokenizer.sep_token, *code_tokens, tokenizer.sep_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch axis, run the model and keep the first element of its output.
nl_code_batch = torch.tensor(tokens_ids).unsqueeze(0)
context_embeddings = model(nl_code_batch)[0]
context_embeddings

tensor([[[-0.1342,  0.3599,  0.0361,  ..., -0.2329, -0.3161,  0.3294],
         [-0.7013,  0.1173,  0.0651,  ..., -0.3564, -0.2514,  0.2654],
         [-0.3371,  0.1115,  0.4299,  ..., -0.2361, -0.1156,  0.8037],
         ...,
         [-0.4057,  0.1638,  0.4813,  ..., -0.1657, -0.2869,  0.7310],
         [-0.3968,  0.4617,  0.5130,  ..., -0.3096, -0.6014,  0.4400],
         [-0.1354,  0.3618,  0.0367,  ..., -0.2342, -0.3183,  0.3317]]],
       grad_fn=<NativeLayerNormBackward0>)

Test CodeBERT on markdown (natural-language) text only

In [4]:
# Natural-language input only (no code segment).
nl_tokens = tokenizer.tokenize("returns category of a notebook")

# Single-segment input: [CLS] + NL tokens + [SEP]
tokens = [tokenizer.cls_token, *nl_tokens, tokenizer.sep_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch axis, run the model and keep the first element of its output.
md_batch = torch.tensor(tokens_ids).unsqueeze(0)
context_embeddings_md = model(md_batch)[0]
context_embeddings_md

tensor([[[-0.1342,  0.3342,  0.0396,  ..., -0.2193, -0.3251,  0.3286],
         [-0.2917,  0.4377,  0.1195,  ..., -0.3086, -0.5924,  0.1050],
         [ 0.0023,  0.2300, -0.0107,  ...,  0.0430, -0.4059,  0.0055],
         [-0.1335,  0.3338,  0.0403,  ..., -0.2191, -0.3248,  0.3279]]],
       grad_fn=<NativeLayerNormBackward0>)

Test CodeBERT on code only

In [38]:
# Python snippet embedded on its own (no natural-language segment).
code_tokens = tokenizer.tokenize(
    """for i in data.index:
    if data.loc[i]['subcategory'] == sub:
        categ = data.loc[i]['category']
    """)

# Single-segment input: [CLS] + code tokens + [SEP]
tokens = [tokenizer.cls_token, *code_tokens, tokenizer.sep_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch axis, run the model and keep the first element of its output.
code_batch = torch.tensor(tokens_ids).unsqueeze(0)
context_embeddings_code = model(code_batch)[0]
context_embeddings_code

tensor([[[-0.1264,  0.3699,  0.0476,  ..., -0.1938, -0.2802,  0.3096],
         [-0.5152,  0.4615,  0.2745,  ..., -0.4479, -0.3483,  0.1919],
         [-0.1253,  0.3687,  0.0483,  ..., -0.1938, -0.2787,  0.3072]]],
       grad_fn=<NativeLayerNormBackward0>)

In [15]:
# Dimensions of the embeddings tensor produced by the basic NL + code test.
context_embeddings.shape

torch.Size([1, 24, 768])

In [12]:
# Confirm the Python type of the object returned by the model call.
type(context_embeddings)

torch.Tensor

In [14]:
context_embeddings.shape
# Axes, in order:
# 1   = batch size (a single sequence was passed to the model)
# 23  = number of tokens in the sequence, special tokens included
# 768 = length of each token's embedding vector

torch.Size([1, 23, 768])

##### Testing embedding aggregation

In [40]:
# convert tensor into np array
tensor_np = context_embeddings_md.cpu().detach().numpy()
# average of embeddings of the tokens in the sequence
avg_md = np.mean(tensor_np[0], axis=0)
len(avg_md)

768

In [41]:
# convert tensor into np array
tensor_np = context_embeddings_code.cpu().detach().numpy()
# average of embeddings of the tokens in the sequence
avg_code = np.mean(tensor_np[0], axis=0)
len(avg_code)

768

Check whether the code and md sequences are semantically similar using cosine similarity

In [6]:
from scipy.spatial.distance import cosine

In [42]:
# scipy's cosine() returns the cosine *distance*; subtracting it from 1 gives
# the cosine similarity between the two averaged vectors.
cos = 1 - cosine(avg_md, avg_code)
cos

0.9352593421936035

##### Test CodeBERT for mask prediction

In [13]:
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline

In [14]:
# load model and tokenizer
mlm_model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base-mlm")
mlm_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

In [26]:
# Input containing a single <mask> token for the model to fill in.
CODE = "<mask> matplotlib.pyplot as plt"
# NOTE(review): mlm_model_name is not used in this cell; the pipeline below
# receives the already-loaded model and tokenizer objects directly.
mlm_model_name = "microsoft/codebert-base-mlm"
fill_mask = pipeline('fill-mask', model=mlm_model, tokenizer=mlm_tokenizer)

# The pipeline returns a list of candidate fills; each is a dict with
# 'score', 'token', 'token_str' and 'sequence' keys.
outputs = fill_mask(CODE)
print(outputs)

[{'score': 0.7230142951011658, 'token': 6595, 'token_str': ' import', 'sequence': ' importmatplotlib.pyplot as plt'}, {'score': 0.1273956149816513, 'token': 4, 'token_str': '.', 'sequence': '.matplotlib.pyplot as plt'}, {'score': 0.03073594532907009, 'token': 1215, 'token_str': '_', 'sequence': '_matplotlib.pyplot as plt'}, {'score': 0.023831013590097427, 'token': 479, 'token_str': '.', 'sequence': '.matplotlib.pyplot as plt'}, {'score': 0.010915211401879787, 'token': 18134, 'token_str': ' _', 'sequence': ' _matplotlib.pyplot as plt'}]


##### Test PLBART for code-to-text & text-to-code translation

In [2]:
from transformers import PLBartForConditionalGeneration, PLBartTokenizer

In [4]:
# Load the PLBART "uclanlp/plbart-python-en_XX" checkpoint; the tokenizer is configured
# with Python as the source language and English (en_XX) as the target language.
plbarttokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
plbart_model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")

In [8]:
# One line of Python to be summarised in English.
example_python_phrase = "plbart_model = PLBartForConditionalGeneration.from_pretrained('uclanlp/plbart-python-en_XX')"
inputs = plbarttokenizer(example_python_phrase, return_tensors="pt")

# Generation is started from the English language-code token.
english_start_id = plbarttokenizer.lang_code_to_id["en_XX"]
translated_tokens = plbart_model.generate(
    **inputs,
    decoder_start_token_id=english_start_id,
)

# Decode the generated batch and show its first (only) entry.
plbarttokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]



'Load the PLBartFornans class.'

**Note to self:** the result above is pretty good!

##### Train and test T5 for text-to-text transformation


Reference to use: the Hugging Face T5 documentation, https://huggingface.co/docs/transformers/model_doc/t5

# 2. EMBEDDING DATA

In [7]:
import pickle

Load dataset containing clean text (code and md cells for each notebook and their category)

In [8]:
# Read the pickled dataset from the data directory.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# point this at a file produced by a trusted source.
text_dataset_file = DATA_PATH + 'text_dataset.pkl'
with open(text_dataset_file, mode='rb') as f:
    text_data = pickle.load(f)

In [9]:
# Preview the first rows of the loaded dataset.
text_data.head()

Unnamed: 0,cell-type,source,title,tag
0,code,import pandas as pd import numpy as np import ...,0-9-try-better-parameters-better-score.ipynb,regression
1,markdown,try to overfit more pls upvote if you fork lik...,0-9-try-better-parameters-better-score.ipynb,regression
2,markdown,sub1 0 869,0-9-try-better-parameters-better-score.ipynb,regression
3,markdown,credit very simple code with score 0 886 by ay...,0-9-try-better-parameters-better-score.ipynb,regression
4,code,import pandas as pd import numpy as np from sk...,0-9-try-better-parameters-better-score.ipynb,regression


In [10]:
# (number of rows, number of columns) of the loaded dataset.
text_data.shape

(332354, 4)

In [14]:
# Preview the last rows of the loaded dataset.
text_data.tail()

Unnamed: 0,cell-type,source,title,tag
344966,code,"POI_data = gpd.read_file(""../input/geospatial...",your-first-map.ipynb,reinforcement learning
344967,markdown,next we create a map from all four geodatafram...,your-first-map.ipynb,reinforcement learning
344968,code,"ax = counties.plot(figsize=(10,10), color='no...",your-first-map.ipynb,reinforcement learning
344969,markdown,it looks like the northeastern part of the sta...,your-first-map.ipynb,reinforcement learning
344970,markdown,have questions or comments visit the course di...,your-first-map.ipynb,reinforcement learning


In [11]:
# Work on a copy so the loaded frame is left untouched.
# NOTE(review): `test` is not referenced by any of the cells visible below — confirm it is still needed.
test = text_data.copy()
test.shape

(332354, 4)

Embedding each cell's tokens and averaging them to get a representation for the row

In [31]:
def embed_single_row(text, row=None, max_tokens=510):
    """Return the CodeBERT token embeddings for one piece of text.

    The text is tokenized with the notebook-level ``tokenizer``, truncated to
    ``max_tokens`` tokens, wrapped in the tokenizer's CLS/SEP special tokens and
    passed through the notebook-level ``model`` as a batch of one sequence.

    Parameters
    ----------
    text : str
        Source text of a single notebook cell.
    row : optional
        Label of the row being embedded. It is not used by the computation and
        is accepted only so existing ``embed_single_row(text, row)`` calls keep working.
    max_tokens : int, default 510
        Maximum number of content tokens kept. The default leaves room for the
        two special tokens that are added around the sequence.

    Returns
    -------
    torch.Tensor
        The first element of the model output for the single-sequence batch
        (in this notebook's test cells: shape ``(1, number_of_tokens, 768)``).
        It carries no autograd history because the forward pass runs under
        ``torch.no_grad()``.
    """
    # Slicing is a no-op for sequences that are already short enough.
    bert_tokens = tokenizer.tokenize(text)[:max_tokens]
    tokens = [tokenizer.cls_token] + bert_tokens + [tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Inference only: skipping autograd bookkeeping saves memory and time on
    # every one of the hundreds of thousands of forward passes.
    with torch.no_grad():
        row_embeddings = model(torch.tensor(tokens_ids)[None, :])[0]

    return row_embeddings


In [13]:
def average_embeddings(embedding_tensor):
    """Average the token embeddings of one sequence into a single vector.

    Parameters
    ----------
    embedding_tensor : torch.Tensor or None
        Tensor whose first axis is the batch; only batch item 0 is used, and
        its rows (one per token) are averaged.

    Returns
    -------
    numpy.ndarray or None
        Mean over the token axis of batch item 0, or ``None`` when
        ``embedding_tensor`` is ``None``.
    """
    if embedding_tensor is None:
        return None

    # Convert to NumPy (moving to CPU and dropping any autograd history first).
    tensor_np = embedding_tensor.cpu().detach().numpy()
    # Mean over the token axis of the single batch item.
    return np.mean(tensor_np[0], axis=0)

We process the dataset in chunks so that each run stays within a manageable amount of time

In [None]:
# Embed the first chunk of the dataset: positions 0 .. 166176.
row_avg_embedding = []
chunk_index = text_data.index[:166177]
chunk_size = len(chunk_index)

# Inference only: disabling autograd avoids tracking gradients on every forward pass.
with torch.no_grad():
    for n_done, row in enumerate(chunk_index, start=1):
        embedding = embed_single_row(text_data.loc[row, 'source'], row)
        row_avg_embedding.append((average_embeddings(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))
        # Report progress periodically instead of printing one line per row.
        if n_done % 1000 == 0 or n_done == chunk_size:
            print(f"{n_done}/{chunk_size} rows embedded (last row label: {row})")

# build intermediary dataframe of averaged row vectors, titles and tags
interm_df01 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

In [None]:
# Embed the second chunk of the dataset: positions 166177 .. 266176.
row_avg_embedding = []
chunk_index = text_data.index[166177:(166177+100000)]
chunk_size = len(chunk_index)

# Inference only: disabling autograd avoids tracking gradients on every forward pass.
with torch.no_grad():
    for n_done, row in enumerate(chunk_index, start=1):
        embedding = embed_single_row(text_data.loc[row, 'source'], row)
        row_avg_embedding.append((average_embeddings(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))
        # Report progress periodically instead of printing one line per row.
        if n_done % 1000 == 0 or n_done == chunk_size:
            print(f"{n_done}/{chunk_size} rows embedded (last row label: {row})")

# build intermediary dataframe of averaged row vectors, titles and tags
interm_df02 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

In [None]:
# Embed the last chunk of the dataset: position 266177 to the end.
row_avg_embedding = []
chunk_index = text_data.index[266177:]
chunk_size = len(chunk_index)

# Inference only: disabling autograd avoids tracking gradients on every forward pass.
with torch.no_grad():
    for n_done, row in enumerate(chunk_index, start=1):
        embedding = embed_single_row(text_data.loc[row, 'source'], row)
        row_avg_embedding.append((average_embeddings(embedding),  text_data.loc[row, 'title'], text_data.loc[row, 'tag']))
        # Report progress periodically instead of printing one line per row.
        if n_done % 1000 == 0 or n_done == chunk_size:
            print(f"{n_done}/{chunk_size} rows embedded (last row label: {row})")

# build intermediary dataframe of averaged row vectors, titles and tags
interm_df03 = pd.DataFrame(row_avg_embedding, columns = ['row_vector', 'title', 'tag'])

Save the vectorized chunks to pkl files

In [49]:
# Persist each embedded chunk as its own pickle file under DATA_PATH.
# NOTE(review): the position ranges in the file names are not uniform. The first name
# ends at the last position of its chunk (166176), whereas the second ends at 266177,
# which is already the first position of the third chunk, and the third ends at the
# row count (332354) rather than the last position. The names are left unchanged in
# case other code reads these files.
interm_df01.to_pickle(DATA_PATH + 'interm_df_0_166176.pkl')
interm_df02.to_pickle(DATA_PATH + 'interm_df_166177_266177.pkl')
interm_df03.to_pickle(DATA_PATH + 'interm_df_266177_332354.pkl')