In [1]:
import sys
sys.path.append('../')
from utils import load_data
from metrics import accuracy_at_k
from tokenizer import CodexTokenizer, CodeGenTokenizer



In [2]:
import pandas as pd
import numpy as np
import scipy
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [3]:
np.random.seed(42)

In [4]:
tokenizer = CodexTokenizer()

In [5]:
# settings :`cross_file_first`, `cross_file_random`, or `in_file`
settings = 'cross_file_first'
data = load_data('train', 'r', 'python', settings)

Loading data: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


In [6]:
raw_samples = np.random.choice(data['easy'], 10_000)

In [7]:
samples = []
for i, raw_sample in enumerate(raw_samples):
    sample = {}
    sample['code'] = tokenizer.encode(raw_sample['code'])
    sample['context'] = [tokenizer.encode(c) for c in raw_sample['context']]
    sample['target'] = raw_sample['golden_snippet_index']
    samples.append(sample)

# Jaccard

In [8]:
class JaccardSearcher:
    def __init__(self, code: list):
        """code - list of tokens"""
        self.code = code
        self.code_set = set(code)
    
    def compute_dist(self, context):
        distances = []
        for c in context:
            c = set(c)
            iou = len(self.code_set & c) / len(self.code_set | c)
            distances.append(1 - iou) 
        return np.array(distances)

In [9]:
preds, gts = [], []

for sample in tqdm(samples):
    dists = JaccardSearcher(sample['code']).compute_dist(sample['context'])
    preds.append(np.argsort(dists))
    gts.append(sample['target'])

100%|██████████| 10000/10000 [00:02<00:00, 3774.83it/s]


In [10]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.1418
accuracy@2: 0.2924
accuracy@3: 0.4453
accuracy@4: 0.6083
accuracy@5: 0.786
accuracy@6: 0.889
accuracy@7: 0.9523
accuracy@8: 0.9855
accuracy@9: 0.9993


In [12]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k([p[::-1] for p in preds], gts, k=k)}')

accuracy@1: 0.1769
accuracy@2: 0.3457
accuracy@3: 0.5074
accuracy@4: 0.6624
accuracy@5: 0.8092
accuracy@6: 0.898
accuracy@7: 0.9561
accuracy@8: 0.9859
accuracy@9: 0.9993


# Bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
bow_vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=None,
    lowercase=False,
    )

In [15]:
corpus = []
for sample in tqdm(samples):
    corpus += sample['context']

100%|██████████| 10000/10000 [00:00<00:00, 196448.08it/s]


In [16]:
bow_vectorizer.fit(corpus)



In [18]:
gts, preds = [], []

for sample in tqdm(samples):
    vectorized_code = bow_vectorizer.transform([sample['code']])
    vectorized_context = bow_vectorizer.transform( sample['context'])
    dists = vectorized_context.dot(vectorized_code.T).toarray().flatten()
    preds.append(np.argsort(dists))
    gts.append(sample['target'])
    

100%|██████████| 10000/10000 [00:18<00:00, 529.86it/s]


In [25]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.1637
accuracy@2: 0.3231
accuracy@3: 0.4796
accuracy@4: 0.6368
accuracy@5: 0.7925
accuracy@6: 0.8931
accuracy@7: 0.9537
accuracy@8: 0.9853
accuracy@9: 0.9993


# TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=None,
    lowercase=False,
    )

In [27]:
tf_idf_vectorizer.fit(corpus)



In [28]:
gts, preds = [], []

for sample in tqdm(samples):
    vectorized_code = tf_idf_vectorizer.transform([sample['code']])
    vectorized_context = tf_idf_vectorizer.transform( sample['context'])
    dists = vectorized_context.dot(vectorized_code.T).toarray().flatten()
    preds.append(np.argsort(dists))
    gts.append(sample['target'])

100%|██████████| 10000/10000 [00:35<00:00, 282.67it/s]


In [29]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.1761
accuracy@2: 0.3364
accuracy@3: 0.4989
accuracy@4: 0.6565
accuracy@5: 0.8081
accuracy@6: 0.9036
accuracy@7: 0.9584
accuracy@8: 0.9864
accuracy@9: 0.9993


In [33]:
np.random.permutation([1, 2, 3])

array([1, 3, 2])

In [36]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k(preds, gts, k=k)}')

accuracy@1: 0.1761
accuracy@2: 0.3364
accuracy@3: 0.4989
accuracy@4: 0.6565
accuracy@5: 0.8081
accuracy@6: 0.9036
accuracy@7: 0.9584
accuracy@8: 0.9864
accuracy@9: 0.9993


In [37]:
for k in range(1, 10):
    print(f'accuracy@{k}: {accuracy_at_k([np.random.permutation(p) for p in preds], gts, k=k)}')

accuracy@1: 0.1563
accuracy@2: 0.3129
accuracy@3: 0.4712
accuracy@4: 0.6401
accuracy@5: 0.7939
accuracy@6: 0.8898
accuracy@7: 0.9563
accuracy@8: 0.9863
accuracy@9: 0.9993
