In [61]:
import code_patterns.factorial_patterns as fp
import code_patterns.binsearch_patterns as bp
import code_patterns.sort_patterns as sp
from code_formatter import CodeFormatter

from random import choice
from json import dump, load

In [71]:
SAMPLES_COUNT = 25
SNIPPETS_DIR = "04_code_representation/code_snippets"

cf = CodeFormatter()

factorial_snippets = dict()
binsearch_snippets = dict()
sort_snippets = dict()

for i in range(SAMPLES_COUNT):
    factorial_snippets[i] = cf.format(choice(fp.ALL))
    binsearch_snippets[i] = cf.format(choice(bp.ALL))
    sort_snippets[i] = cf.format(choice(sp.ALL))

with open(f"{SNIPPETS_DIR}/snippets.json", "w") as f:
    dump({
        "factorial_snippets": factorial_snippets, 
        "binsearch_snippets": binsearch_snippets,
        "sort_snippets": sort_snippets
        }, 
        f
    )

In [72]:
from typing import Dict, Iterable
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=498.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898822.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=150.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=498627950.0), HTML(value='')))




In [73]:
def max_context_size(tasks: Iterable[Dict[int, str]]) -> int:
    return max(
        [
            len(tokenizer.tokenize(snippet))
            for task_snippets in tasks
            for snippet in task_snippets.values()
        ]
    )

In [74]:
from ipywidgets import IntProgress
from IPython.display import display

def create_context_embeddings(snippets: Dict[int, str], max_context_size: int = 512) -> Iterable[str]:
    
    progress = IntProgress(min=0, max=len(snippets))
    display(progress)
    
    for snippet in snippets.values():
        code_tokens = tokenizer.tokenize(snippet)
        code_tokens +=  ["<pad>"] * (max_context_size - len(code_tokens))
        tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        context_embedding = model(torch.tensor(tokens_ids)[None,:])[0]

        progress.value += 1
        yield context_embedding

In [76]:
context_size = max_context_size([factorial_snippets, binsearch_snippets])

factorial_embeddings = list(create_context_embeddings(factorial_snippets, context_size))
binsearch_embeddings = list(create_context_embeddings(binsearch_snippets, context_size))
# sort_embeddings = list(create_context_embedding(sort_snippets, context_size))

IntProgress(value=0, max=25)

IntProgress(value=0, max=25)

Среднеквадратическая ошибка по матрице не подходит, потому что не отражает семантические свойства, либо полученные эмбеддинги их не содержат

In [14]:
from torchmetrics import MeanSquaredError
mse = MeanSquaredError()

In [15]:
mse(factorial_embeddings[0], factorial_embeddings[1])

tensor(0.1123, grad_fn=<DivBackward0>)

In [20]:
mse(binsearch_embeddings[0], binsearch_embeddings[4])

tensor(0.2869, grad_fn=<DivBackward0>)

In [19]:
mse(factorial_embeddings[1], binsearch_embeddings[1])

tensor(0.1193, grad_fn=<DivBackward0>)

Поэтому нужно как-нибудь понизить размерность, и превратить контекстные эмбеддинги в один вектор. Например, через среднее.

In [77]:
import numpy as np
import pandas as pd

In [78]:
factorial_vectors = [torch.mean(factorial_embeddings[i], axis = 1).flatten().tolist() for i in range(len(factorial_embeddings))]
binsearch_vectors = [torch.mean(binsearch_embeddings[i], axis = 1).flatten().tolist() for i in range(len(binsearch_embeddings))]

In [79]:
fdf = pd.DataFrame(data = factorial_vectors)
fdf["label"] = 0

bdf = pd.DataFrame(data = binsearch_vectors)
bdf["label"] = 1

df = pd.concat([fdf, bdf]).sample(frac = 1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
18,0.061167,-0.058735,0.250418,-0.121777,-1.103654,-0.245905,0.074985,-0.200544,0.207962,0.391431,...,0.078434,0.190813,0.662801,-0.082366,-0.04291,0.753072,-0.741615,-0.153185,0.258659,1
17,0.073156,-0.054352,0.259826,-0.211052,-1.213067,-0.214851,0.068947,-0.245076,0.208511,0.39274,...,0.115117,0.19736,0.595134,-0.141721,-0.027971,0.778348,-0.689331,-0.165247,0.240995,1
5,-0.296074,0.048974,0.209351,0.103714,-0.70652,-0.29945,0.003905,0.086961,0.396154,0.286388,...,-0.093706,-0.350163,0.578731,-0.240972,0.252958,0.852001,-0.68211,-0.363485,0.596861,1
13,-0.060665,-0.05475,0.232139,0.026206,-0.996675,-0.434936,-0.007312,-0.046766,0.345992,0.38788,...,-0.048406,0.05026,0.672718,-0.153239,0.029613,0.752568,-0.718666,-0.191955,0.37682,0
2,-0.091507,-0.037592,0.261951,0.009798,-1.05966,-0.343006,-0.010116,-0.037551,0.307459,0.378772,...,-0.05042,0.004826,0.642486,-0.182456,0.039956,0.749913,-0.799586,-0.190686,0.367412,0


In [80]:
df.shape

(50, 769)

In [81]:
from sklearn.neighbors import KNeighborsClassifier

In [82]:
clf = KNeighborsClassifier(
    n_neighbors = 5,
    metric = "cosine"
)

In [83]:
X = df.drop(["label"], axis = 1).values
y = df["label"].values

In [84]:
X.shape, y.shape

((50, 768), (50,))

In [85]:
from sklearn.metrics import accuracy_score, classification_report

clf = clf.fit(X, y)
y_pred = clf.predict(X)

accuracy_score(y, y_pred)

0.94