In [1]:
cd /home/mateuszg/http2vec

/home/mateuszg/http2vec


In [2]:
import os

# Expose only GPU 0 to CUDA-based libraries loaded later in this session.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Experiment configuration consumed by the cells below.
dataset = "CSIC2010"  # dataset key; also selects data/tokenizers/<dataset>
lm = "bow"            # representation key handed to the http2vec helper functions
clf_name = "rf"       # classifier key handed to get_classifier
size = 3072           # size argument of the helpers -- presumably the vector dimensionality, TODO confirm
sample_id = "112"     # id (as a string) of the query sample whose neighbours are inspected

In [4]:
import json
import pandas as pd
from transformers import RobertaTokenizer

from http2vec.evaluation import (
    get_data,
    get_classifier_fn,
    get_vectorizer,
    get_classifier,
    get_vectors
)

In [5]:
# Request texts, re-indexed by their id cast to str so rows can be looked up by string label.
data_text = get_data(dataset)
data_text.index = data_text["id"].astype(str)

# Classifier and vectorizer selected by the (lm, dataset, size[, clf_name]) configuration.
clf = get_classifier(lm, dataset, size, clf_name)
vectorizer = get_vectorizer(lm, dataset, size)

# Built from the vectorizer, the tokenizer directory and the classifier
# (presumably a text -> prediction callable -- confirm in http2vec.evaluation).
clf_fun = get_classifier_fn(vectorizer, f"data/tokenizers/{dataset}", clf)

# Tokenizer loaded from the same dataset-specific directory.
tokenizer = RobertaTokenizer.from_pretrained(f"data/tokenizers/{dataset}")

In [6]:
from sklearn.neighbors import NearestNeighbors


def get_closest(vector, context, n_neighbors=200):
    """Return the row indices of the samples in ``context`` closest to ``vector``.

    Args:
        vector: Query sample; it is reshaped to a single row with
            ``reshape(1, -1)`` before the lookup.
        context: Samples to search, in whatever form
            ``NearestNeighbors.fit`` accepts.
        n_neighbors: Maximum number of neighbours to return. The value is
            capped at the number of fitted samples, because scikit-learn
            raises ``ValueError`` when asked for more neighbours than
            samples it was fitted on.

    Returns:
        list: Indices into ``context`` for the single query row, in the
        order ``kneighbors`` returns them.
    """
    neigh = NearestNeighbors()
    neigh.fit(context)

    # Never request more neighbours than there are fitted samples.
    k = min(n_neighbors, neigh.n_samples_fit_)

    closest = neigh.kneighbors(
        vector.reshape(1, -1),
        n_neighbors=k,
        return_distance=False,
    )
    # kneighbors returns one row of indices per query row; there is only one query.
    return closest.tolist()[0]

In [7]:
# Fetch the vectors for this configuration together with their labels and ids.
# NOTE(review): `ids` is later compared with the string `sample_id` and indexed with a
# list of positions, so it is presumably a NumPy array of string ids -- confirm.
data, labels, ids = get_vectors(lm, dataset, size)

In [8]:
# Select the row(s) of `data` whose id equals `sample_id`.
vector = data[ids == sample_id]

# get_closest flattens its query with reshape(1, -1), so anything other than exactly one
# match would only surface later as an opaque shape error; fail here with a clear message.
if vector.shape[0] != 1:
    raise ValueError(
        f"expected exactly one sample with id {sample_id!r}, found {vector.shape[0]}"
    )

In [9]:
# Row indices (into `data`) of the query vector's nearest neighbours,
# using get_closest's default of n_neighbors=200.
n_closest = get_closest(vector, data)

In [10]:
# Texts of the neighbouring samples, looked up by id with one .loc call
# (row labels and column together) instead of chained indexing.
n_closest_data = data_text.loc[ids[n_closest], "text"]

# Name the export after the configured dataset rather than hard-coding "CSIC2010",
# so switching `dataset` does not overwrite or mislabel the file.
n_closest_data.to_csv(f"{dataset}-n-closest.csv")

In [11]:
# The Series is indexed by string ids, so select by position explicitly with .iloc;
# a bare integer key relies on pandas' deprecated positional fallback.
n_closest_data.iloc[100]

'GET http://localhost:8080/tienda1/publico/entrar.jsp?errorMsg=Credenciales+incorrectas%3C%21--%23exec+cmd%3D%22rm+-rf+%2F%3Bcat+%2Fetc%2Fpasswd%22+--%3E HTTP/1.1\\r\\n\nPragma: no-cache\\r\\n\nCache-control: no-cache\\r\\n\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\\r\\n\nAccept-Encoding: x-gzip, x-deflate, gzip, deflate\\r\\n\nAccept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\\r\\n\nAccept-Language: en\\r\\n\nHost: localhost:8080\\r\\n\nConnection: close\\r\\n\n\\r\\n\n\\r\\n'

In [12]:
# Positional access via .iloc: the index holds string ids, so a bare integer key
# would depend on pandas' deprecated positional fallback.
n_closest_data.iloc[0].splitlines()

['GET http://localhost:8080/tienda1/publico/entrar.jsp?errorMsg=Credenciales+incorrectas%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25 HTTP/1.1\\r\\n',
 'Pragma: no-cache\\r\\n',
 'Cache-control: no-cache\\r\\n',
 'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\\r\\n',
 'Accept-Encoding: x-gzip, x-deflate, gzip, deflate\\r\\n',
 'Accept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\\r\\n',
 'Accept-Language: en\\r\\n',
 'Host: localhost:8080\\r\\n',
 'Connection: close\\r\\n',
 '\\r\\n',
 '\\r\\n']

In [13]:
# Same tokenizer load as in the earlier setup cell, repeated here before tokenizing.
tokenizer = RobertaTokenizer.from_pretrained(f"data/tokenizers/{dataset}")

In [14]:
# Print the tokenization of the first neighbour, one output line per line of its text.
# .iloc selects by position; the Series index holds string ids, not integers.
for line in n_closest_data.iloc[0].splitlines():
    # Plain Python id lists are all that is needed, so skip the return_tensors="pt"
    # round-trip through a tensor and .tolist().
    token_ids = tokenizer(line, truncation=True, max_length=512)["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    # Remove the "<s>" marker text but keep the joining space, so the output is unchanged.
    print(" ".join(tokens).replace("<s>", ""))

 GET Ġhttp :// localhost : 8080 / tienda 1 / publico / entrar . jsp ? errorMsg = Credenciales + incorrectas % 27 % 3 B + DROP + TABLE + usuarios % 3 B + SELECT +*+ FROM + datos + WHERE + nombre + LIKE +% 27 % 25 ĠHTTP / 1 . 1 \ r \ n </s>
 Pragma : Ġno - cache \ r \ n </s>
 Cache - control : Ġno - cache \ r \ n </s>
 Accept : Ġtext / xml , application / xml , application / xhtml + xml , text / html ; q = 0 . 9 , text / plain ; q = 0 . 8 , image / png ,*/*; q = 0 . 5 \ r \ n </s>
 Accept - Encoding : Ġx - gzip , Ġx - deflate , Ġgzip , Ġdeflate \ r \ n </s>
 Accept - Charset : Ġutf - 8 , Ġutf - 8 ; q = 0 . 5 , Ġ*; q = 0 . 5 \ r \ n </s>
 Accept - Language : Ġen \ r \ n </s>
 Host : Ġlocalhost : 8080 \ r \ n </s>
 Connection : Ġclose \ r \ n </s>
 \ r \ n </s>
 \ r \ n </s>
