In [1]:
import json
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
# Show every DataFrame column when frames are rendered in this notebook.
pd.set_option('display.max_columns', None)
# Read the dataset as raw text: each element of raw_data is one unparsed line of the file.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's default locale.
with open('repobench/data/easy_fixed.jsonl', 'r', encoding='utf-8') as json_file:
    raw_data = list(json_file)
# Number of lines read.
len(raw_data)

12000

In [2]:
# Parse every line of raw_data into a Python object with json.loads.
# Whitespace-only lines (e.g. a trailing blank line) are skipped because json.loads raises on them.
data = [json.loads(row) for row in raw_data if row.strip()]

In [3]:
# List the fields present on a record, using the first row as a sample.
data[0].keys()

dict_keys(['repo_name', 'file_path', 'context', 'import_statement', 'code', 'next_line', 'gold_snippet_index', 'nl_code', 'nl_context', 'nl_next_line'])

In [4]:
# Collect the 'gold_snippet_index' field of every record, in dataset order.
gold_snippets = [row['gold_snippet_index'] for row in data]

In [5]:
# Truncate each context snippet to its first five lines and store the result
# on the same record under 'cropped_context' (the original 'context' is untouched).
for row in data:
    row['cropped_context'] = [
        '\n'.join(snippet.split('\n')[:5])
        for snippet in row['context']
    ]

## Choosing dataset row

In [6]:
# Index of the dataset row that the following cells inspect.
ind = 1234

In [7]:
# Show the 'next_line' field of the selected row.
data[ind]['next_line']

'        flat_camera=DifferentiableProjectiveCamera('

In [8]:
# Render the selected row's cropped context snippets as a one-column table,
# lifting the column-width limit for this display only so no snippet is cut off.
context_frame = pd.DataFrame(data[ind]['cropped_context'])
with pd.option_context("display.max_colwidth", None):
    display(context_frame)

Unnamed: 0,0
0,"class DifferentiableCameraBatch(ABC):\n """"""\n Annotate a differentiable camera with a multi-dimensional batch shape.\n """"""\n"
1,"class DifferentiableProjectiveCamera(DifferentiableCamera):\n """"""\n Implements a batch, differentiable, standard pinhole camera\n """"""\n"
2,"class Transmitter(nn.Module):\n def __init__(self, encoder: Encoder, renderer: Renderer):\n super().__init__()\n self.encoder = encoder\n self.renderer = renderer"
3,"class VectorDecoder(nn.Module):\n def __init__(\n self,\n *,\n device: torch.device,"
4,"class TorchMesh:\n """"""\n A 3D triangle mesh with optional data at the vertices and faces.\n """"""\n"
5,"class AttrDict(OrderedDict):\n """"""\n An attribute dictionary that automatically handles nested keys joined by ""/"".\n\n Originally copied from: https://stackoverflow.com/questions/3031219/recursively-access-dict-via-attributes-as-well-as-index-access"


In [9]:
# Show the 'gold_snippet_index' value recorded for the selected row.
gold_snippets[ind]

1

In [11]:
# Rank the selected row's cropped context snippets by cosine similarity to its
# next line, using embeddings from intfloat/e5-large.
embedder = SentenceTransformer('intfloat/e5-large')
y_pred_e5_large = []
for i in range(ind, ind+1):
    # Candidate snippets, each given the same 'query: ' prefix as the query below.
    # NOTE(review): both sides use 'query: '; if this is meant as asymmetric retrieval,
    # the snippets may need a 'passage: ' prefix instead - confirm against the E5 model card.
    prefixed_snippets = ['query: ' + snippet for snippet in data[i]["cropped_context"]]
    corpus_embeddings = embedder.encode(prefixed_snippets, convert_to_tensor=True)
    # The row's next line is the query.
    prefixed_query = 'query: ' + data[i]["next_line"]
    query_embedding = embedder.encode(prefixed_query, convert_to_tensor=True)
    # Keep the first row of the similarity result (there is a single query).
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    # Snippet indices ordered from highest to lowest score.
    ranking = torch.argsort(cos_scores, descending=True)
    y_pred_e5_large.append(ranking)
y_pred_e5_large

[tensor([1, 0, 2, 3, 4, 5], device='cuda:0')]

In [12]:
# Rank the selected row's cropped context snippets by cosine similarity to its
# next line, using embeddings from all-MiniLM-L12-v2 (no input prefixes).
# NOTE(review): the result list is named y_pred_all_mpnet although the model loaded
# here is all-MiniLM-L12-v2; the name is kept in case other cells reference it.
embedder = SentenceTransformer('all-MiniLM-L12-v2')
y_pred_all_mpnet = []
for i in range(ind, ind+1):
    # Candidate snippets.
    candidate_snippets = data[i]['cropped_context']
    corpus_embeddings = embedder.encode(candidate_snippets, convert_to_tensor=True)
    # The row's next line is the query.
    query_text = data[i]['next_line']
    query_embedding = embedder.encode(query_text, convert_to_tensor=True)
    # Keep the first row of the similarity result (there is a single query).
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    # Snippet indices ordered from highest to lowest score.
    ranking = torch.argsort(cos_scores, descending=True)
    y_pred_all_mpnet.append(ranking)
y_pred_all_mpnet

[tensor([1, 0, 3, 4, 2, 5], device='cuda:0')]

## Analyze results

In [13]:
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view, head_view

utils.logging.set_verbosity_error()  # Suppress standard warnings

# Text whose attention is visualised: the next line of the selected row.
input_text = data[ind]['next_line']

# Load the tokenizer and the model, asking the model to return its attention weights.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2', output_attentions=True)

encoded_input = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')
# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    model_output = model(**encoded_input)
# Read the attentions by field name rather than by position ([-1]): the position of the
# attentions in the output depends on which optional outputs the model returns.
attention = model_output.attentions
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])  # Convert input ids to token strings


In [14]:
head_view(model_output.attentions, tokens)  # Display head view (per-head attention between tokens)

<IPython.core.display.Javascript object>

In [15]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Text whose attention is visualised: the next line of the selected row.
# The 'query: ' prefix matches the string that was embedded with intfloat/e5-large for
# retrieval, so the attention shown corresponds to the input the model actually scored.
input_texts = 'query: ' + data[ind]['next_line']

# Load the tokenizer and the model, asking the model to return its attention weights.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large')
model = AutoModel.from_pretrained('intfloat/e5-large', output_attentions=True)

# Tokenize the input texts
encoded_input = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: disable gradient tracking so no autograd graph is kept for the outputs.
with torch.no_grad():
    outputs = model(**encoded_input)
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])  # Convert input ids to token strings

In [16]:
head_view(outputs.attentions, tokens)  # Display head view (per-head attention between tokens)

<IPython.core.display.Javascript object>