In [1]:
import sys
sys.path.append('../')
from utils import load_data
from metrics import accuracy_at_k
from tokenizer import CodexTokenizer, CodeGenTokenizer



In [2]:
import pandas as pd
import numpy as np
import scipy
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [3]:
np.random.seed(42)

In [4]:
tokenizer = CodexTokenizer()

In [5]:
# settings :`cross_file_first`, `cross_file_random`, or `in_file`
settings = 'cross_file_first'
data = load_data('train', 'r', 'python', settings)

Loading data: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]


In [6]:
raw_samples = np.random.choice(data['hard'], 5_000)
# raw_samples = data['easy']


In [7]:
samples = []
for i, raw_sample in tqdm(enumerate(raw_samples), total=len(raw_samples)):
    sample = {}
    sample['code'] = tokenizer.encode(raw_sample['code'])
    sample['context'] = [tokenizer.encode(c) for c in raw_sample['context']]
    sample['target'] = raw_sample['golden_snippet_index']
    samples.append(sample)

100%|██████████| 5000/5000 [01:03<00:00, 78.93it/s] 


# Jaccard

In [8]:
class JaccardSearcher:
    def __init__(self, code: list):
        """code - list of tokens"""
        self.code = code
        self.code_set = set(code)
    
    def compute_dist(self, context):
        distances = []
        for c in context:
            c = set(c)
            iou = len(self.code_set & c) / len(self.code_set | c)
            distances.append(1 - iou) 
        return np.array(distances)

In [9]:
preds, gts = [], []

for sample in tqdm(samples):
    dists = JaccardSearcher(sample['code']).compute_dist(sample['context'])
    preds.append(np.argsort(dists))
    gts.append(sample['target'])

100%|██████████| 5000/5000 [00:03<00:00, 1568.54it/s]


In [10]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.0662
accuracy@2: 0.139
accuracy@3: 0.2012
accuracy@4: 0.276
accuracy@5: 0.3352
accuracy@6: 0.4018
accuracy@7: 0.4636
accuracy@8: 0.5278
accuracy@9: 0.5994


# Bag of words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
bow_vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=None,
    lowercase=False,
    )

In [13]:
corpus = []
for sample in tqdm(samples):
    corpus += sample['context']

100%|██████████| 5000/5000 [00:00<00:00, 175587.93it/s]


In [14]:
bow_vectorizer.fit(corpus)



In [15]:
gts, preds = [], []

for sample in tqdm(samples):
    vectorized_code = bow_vectorizer.transform([sample['code']])
    vectorized_context = bow_vectorizer.transform( sample['context'])
    dists = vectorized_context.dot(vectorized_code.T).toarray().flatten()
    preds.append(np.argsort(dists))
    gts.append(sample['target'])
    

100%|██████████| 5000/5000 [00:18<00:00, 263.69it/s]


In [16]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.0742
accuracy@2: 0.1416
accuracy@3: 0.2104
accuracy@4: 0.2766
accuracy@5: 0.3372
accuracy@6: 0.4032
accuracy@7: 0.4644
accuracy@8: 0.5308
accuracy@9: 0.5936


# TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=None,
    lowercase=False,
    )

In [18]:
tf_idf_vectorizer.fit(corpus)

In [19]:
gts, preds = [], []

for sample in tqdm(samples):
    vectorized_code = tf_idf_vectorizer.transform([sample['code']])
    vectorized_context = tf_idf_vectorizer.transform( sample['context'])
    dists = vectorized_context.dot(vectorized_code.T).toarray().flatten()
    preds.append(np.argsort(dists))
    gts.append(sample['target'])

100%|██████████| 5000/5000 [00:26<00:00, 186.94it/s]


In [22]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.0722
accuracy@2: 0.1426
accuracy@3: 0.2058
accuracy@4: 0.2784
accuracy@5: 0.3414
accuracy@6: 0.403
accuracy@7: 0.4664
accuracy@8: 0.5318
accuracy@9: 0.5974


In [23]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k([np.random.permutation(p) for p in preds], gts, k=k)}')

accuracy@1: 0.0656
accuracy@2: 0.1388
accuracy@3: 0.2024
accuracy@4: 0.2644
accuracy@5: 0.3336
accuracy@6: 0.3926
accuracy@7: 0.4814
accuracy@8: 0.5288
accuracy@9: 0.5926
