In [2]:
# import torch
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# # Load the pre-trained GPT-Neo model tokenizer (Replace with smaller or larger version)
# model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-1.3B')
# tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')

# # Load the text dataset
# with open('text.txt', 'r') as f:
#     text = f.read()

# # Tokenize the text dataset
# encoding = tokenizer.encode(text, return_tensors='pt')

# # Fine-tune the model on the text dataset
# model.train()
# optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# for i in range(100):
#     loss = model(encoding, labels=encoding)[0]
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()
#     print(f'Epoch {i+1}, Loss: {loss.item()}')

# # Save the fine-tuned model and tokenizer
# model.save_pretrained('fine-tuned-gpt-neo')
# tokenizer.save_pretrained('fine-tuned-gpt-neo')

In [1]:
import sys, os
import sys
sys.path.insert(1, '../repos')

In [2]:
from repobench.archive_data.utils import load_data
from repobench.evaluation.metrics import accuracy_at_k



In [3]:
data = load_data(split='test', task='retrieval', language='python', settings=['cross_file_first'])

Loading data: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.34s/it]


In [4]:
data = data['easy']

In [5]:
data[0]

{'repo_name': 'jtydhr88/sd-webui-txt-img-to-3d-model',
 'file_path': 'shap_e/models/nn/ops.py',
 'context': ['class AttrDict(OrderedDict[K, V], Generic[K, V]):\n    """\n    An attribute dictionary that automatically handles nested keys joined by "/".\n\n    Originally copied from: https://stackoverflow.com/questions/3031219/recursively-access-dict-via-attributes-as-well-as-index-access\n    """\n\n    MARKER = object()\n\n    # pylint: disable=super-init-not-called\n    def __init__(self, *args, **kwargs):\n        if len(args) == 0:\n            for key, value in kwargs.items():\n                self.__setitem__(key, value)\n        else:\n            assert len(args) == 1\n            assert isinstance(args[0], (dict, AttrDict))\n            for key, value in args[0].items():\n                self.__setitem__(key, value)\n\n    def __contains__(self, key):\n        if "/" in key:\n            keys = key.split("/")\n            key, next_key = keys[0], "/".join(keys[1:])\n           

In [12]:
from datasets import Dataset
ds = Dataset.from_list(data)

In [13]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
len(data)

In [21]:
import numpy as np
gold_snippets = [data[i]['gold_snippet_index'] for i in range(10000)]

## Random

In [16]:
y_pred_rand = []
for i in range(10000):
    y_pred_rand.append(np.random.permutation(range(len(data[i]['context']))))

In [18]:
for k in range(1, 6, 2):
    print(f'accuracy@{k}: {accuracy_at_k(y_pred_rand, gold_snippets, k=k)}')

accuracy@1: 0.1549
accuracy@3: 0.4698
accuracy@5: 0.7832


## Jaccard


In [22]:
def jaccard_similarity(doc1, doc2): 
    doc1 = set(doc1)
    doc2 = set(doc2)
    intersection = doc1.intersection(doc2)
    union = doc1.union(doc2)
    return float(len(intersection)) / len(union)

In [38]:
y_pred_jac = []
for i in range(10000):
    dists = []
    ids_next_line = tokenizer(data[i]['next_line']).input_ids
    for num, snippet in enumerate(data[i]['context']):
        dists.append(jaccard_similarity(tokenizer(snippet).input_ids, ids_next_line))
    y_pred_jac.append(np.argsort(dists)[::-1])

In [39]:
for k in range(1, 6, 2):
    print(f'accuracy@{k}: {accuracy_at_k(y_pred_jac, gold_snippets, k=k)}')

accuracy@1: 0.3972
accuracy@3: 0.6787
accuracy@5: 0.8817


In [40]:
# Edit similarity
y_pred_es = []
from fuzzywuzzy import fuzz
for i in range(10000):
    dists = []
    ids_next_line = tokenizer(data[i]['next_line']).input_ids
    for num, snippet in enumerate(data[i]['context']):
        dists.append(fuzz.ratio(tokenizer(snippet).input_ids, ids_next_line))
    y_pred_es.append(np.argsort(dists)[::-1])

In [44]:
for k in range(1, 6, 2):
    print(f'accuracy@{k}: {accuracy_at_k(y_pred_es, gold_snippets, k=k)}')

accuracy@1: 0.18
accuracy@3: 0.4771
accuracy@5: 0.7793


In [None]:
def tokenize_function(examples):
    return tokenizer(examples["next_line"], padding="max_length", truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)

## all-MiniLM-L6-v2

In [26]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')
y_pred_all_mini = []
for i in range(10000):
    # Corpus with example sentences
    corpus_embeddings = embedder.encode(data[i]["context"], convert_to_tensor=True)
    # Query sentences:
    query_embedding = embedder.encode(data[i]["next_line"], convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    y_pred_all_mini.append(torch.argsort(cos_scores, descending=True))


In [27]:
for k in range(1, 6, 2):
    print(f'accuracy@{k}: {accuracy_at_k(y_pred_all_mini, gold_snippets, k=k)}')

accuracy@1: 0.7491
accuracy@3: 0.9391
accuracy@5: 0.9824


## UnixCoder

In [65]:
embedder.encode(data[i]["context"], convert_to_tensor=True).shape

torch.Size([5, 384])

In [66]:
embedder.encode(data[i]["next_line"], convert_to_tensor=True).shape

torch.Size([384])

In [47]:
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base")
model.to(device)

UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [59]:
def unixcoder_encode(snippet):
    tokens_ids = model.tokenize([snippet],max_length=512,mode="<encoder-only>")
    return torch.tensor(model(torch.tensor(tokens_ids).to(device))[1])

In [76]:
y_pred_unixcoder = []
for i in range(10000):
    # Corpus with example sentences
    corpus_embeddings = torch.cat(list(map(unixcoder_encode, data[i]["context"])))
    # Query sentences:
    query_embedding = unixcoder_encode(data[i]["next_line"])
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    y_pred_unixcoder.append(torch.argsort(cos_scores, descending=True))

  return torch.tensor(model(torch.tensor(tokens_ids).to(device))[1])


In [77]:
for k in range(1, 6, 2):
    print(f'accuracy@{k}: {accuracy_at_k(y_pred_all_mini[10000:], gold_snippets, k=k)}')

accuracy@1: 0.6583
accuracy@3: 0.8884
accuracy@5: 0.9702
