In [59]:
import joblib
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm
from numba import jit

In [5]:
import torch.nn.functional as F


class Probability(nn.Module):
    def __init__(self):
        super(Probability, self).__init__()
        self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=False)
        #self.fc = nn.Linear(768*2,1)

    def get_config_dict(self):
        return {'alpha': self.alpha}

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        distances = F.pairwise_distance(sentence1, sentence2, p=2)
        prob = 2/(1+torch.exp(distances*(self.alpha ** 2)))
        #out = self.fc(torch.cat((sentence1, sentence2),1))
        #prob = 1 / (torch.exp(-out)+1)
        return prob

    def device(self):
        return self.alpha.device

class ProkurorProbModel(nn.Module):
    def __init__(self, metalurg_prob_model, prokuror_bert):
        super(ProkurorProbModel, self).__init__()
        
        self.metalurg_prob_model = metalurg_prob_model
        
        self.prokuror_bert = prokuror_bert
        
        self.dense = nn.Linear(768,768)
        self.probability = Probability()

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        with torch.no_grad():
            self.metalurg_prob_model.eval()
            a = self.metalurg_prob_model.get_embedding(sentence1)
        b = self.get_embedding(sentence2)

        p = self.probability(a, b)

        return p

    def get_embedding(self, sentence):
        device = self.device()
        # sentence.to(device)

        anchor_ids = sentence["input_ids"].to(device)
        anchor_mask = sentence["attention_mask"].to(device)
        with torch.no_grad():
            a = self.prokuror_bert(anchor_ids, attention_mask=anchor_mask)[0][:, 0]
            a = self.dense(a)

        return a
    
    def to(self, device):
        self.prokuror_bert.to(device)
        self.dense.to(device)
        self.probability.to(device)
        
    def device(self):
        return self.metalurg_prob_model.device()
        
class ProbModel(nn.Module):
    def __init__(self, bert):
        super(ProbModel, self).__init__()
        self.bert = bert
        self.dense = nn.Linear(768,768)
        self.probability = Probability()
        
    def forward(self,sentence1: Tensor, sentence2: Tensor):
        '''
        device = self.probability.device()
        
        anchor_ids = sentence1["input_ids"].to(device)
        pos_ids = sentence2["input_ids"].to(device)

        anchor_mask = sentence1['attention_mask'].to(device)
        pos_mask = sentence2['attention_mask'].to(device)

        a = self.bert(anchor_ids, attention_mask=anchor_mask).pooler_output
        b = self.bert(pos_ids, attention_mask=pos_mask).pooler_output
        '''        
        
        a = self.get_embedding(sentence1)
        b = self.get_embedding(sentence2)

        p = self.probability(a, b)
        
        return p
    
    def get_embedding(self, sentence):
        device = self.device()

        anchor_ids = sentence["input_ids"].to(device)
        anchor_mask = sentence['attention_mask'].to(device)
        with torch.no_grad():
            a = self.bert(anchor_ids, attention_mask=anchor_mask)[0][:,0]
            a = self.dense(a)
        #a = self.bert(anchor_ids, attention_mask=anchor_mask).pooler_output
        
        return a
    
    def to(self, device):
        self.bert.to(device)
        self.dense.to(device)
        self.probability.to(device)

    def device(self):
        return self.bert.device
    
    
def load_model(model_name: str = "DeepPavlov/rubert-base-cased"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    orig_model = AutoModel.from_pretrained(model_name)
    pmodel = ProbModel(orig_model)
    model = ProkurorProbModel(pmodel, orig_model)
    d = torch.load("/home/rsolomatin/metalurgi/prokuror_model/learned_model/35.pth")
    model.load_state_dict(d["model"])
    return tokenizer, model

In [6]:
metalurgs = pd.read_csv("data/parsed_metalurgs_v3_only_prokuror.csv", index_col=0, dtype={"Код налогоплательщика":str})
metalurgs = metalurgs.rename(columns={"Код налогоплательщика": "INN"})
prokuror = pd.read_csv("data/prokuror_results.csv", dtype={"INN":str})
job = joblib.load("data/prokuror_test_inn.joblib")

metalurgs.shape, prokuror.shape, len(job)

((4517712, 5), (3019791, 5), 86)

In [7]:
%%time
le = preprocessing.LabelEncoder()
prokuror["cat_INN"] = le.fit_transform(prokuror["INN"])
cat_job = le.transform(job)
metalurgs["cat_INN"] = le.transform(metalurgs["INN"])

CPU times: user 3.97 s, sys: 42.8 ms, total: 4.02 s
Wall time: 4.02 s


In [5]:
test_prokuror = prokuror[prokuror["cat_INN"].isin(cat_job)]
test_metalurgs = metalurgs[metalurgs["cat_INN"].isin(cat_job)]
test_metalurgs.shape, test_prokuror.shape, len(job)

((935699, 6), (707, 6), 86)

In [6]:
test_metalurgs.head()

Unnamed: 0,path,sent_num,line,INN,p1_topic 1,cat_INN
58748,"Sber_parser/801-901/data/КРАНСТРОЙМОНТАЖ, ООО_...",0,Краны козловые мостовые и,3525299900,,218727
58749,"Sber_parser/801-901/data/КРАНСТРОЙМОНТАЖ, ООО_...",25,Взрывобезопасные ВБИ и пожаробезопасные ПБИ краны,3525299900,,218727
58750,"Sber_parser/801-901/data/КРАНСТРОЙМОНТАЖ, ООО_...",26,Выбор режима работы крана,3525299900,,218727
58751,"Sber_parser/801-901/data/КРАНСТРОЙМОНТАЖ, ООО_...",28,Замена стяжной балки козлового крана,3525299900,,218727
58752,"Sber_parser/801-901/data/КРАНСТРОЙМОНТАЖ, ООО_...",29,Козловые краны грузоподъемность до тонн,3525299900,,218727


In [9]:
# torch.save(model, "model/save_35.pth")
model = torch.load("model/save_35.pth")
model.eval();
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model.metalurg_prob_model.dense.to(device)
model.to(device)

In [16]:
arr = []
num_split = test_metalurgs.shape[0] / 32
for row in tqdm(np.split(test_metalurgs, num_split)):
    sentences = row["line"].values.tolist()
    sentense_embeddings = tokenizer(sentences, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    embeddings = model.metalurg_prob_model.get_embedding(sentense_embeddings).cpu().numpy()  # .astype(np.float16)
    for sentence_id, inn, embedding in zip(row.index, row["INN"], embeddings):
        arr.append({"sentence_id":sentence_id, "inn":inn, "arr":embedding})

100%|██████████| 29240/29240 [1:43:51<00:00,  4.69it/s]


In [17]:
arr = np.array(arr)
with open('data/metalurgs_emb_32.npy', 'wb') as f:
    np.save(f, arr)

In [20]:
arr = []
for index, row in tqdm(test_prokuror.iterrows(), total=test_prokuror.shape[0]):
    sentence = row["line"]
    sentense_embedding = tokenizer(sentence, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    embedding = model.get_embedding(sentense_embedding).cpu().numpy()  # .astype(np.float16)
    arr.append({"sentence_id":index, "inn":row["INN"], "arr":embedding})

100%|██████████| 707/707 [00:06<00:00, 103.25it/s]


In [21]:
arr = np.array(arr)
with open('data/prokuror_emb_32.npy', 'wb') as f:
    np.save(f, arr)

In [10]:
arr = []
num_split = prokuror.shape[0] / 32
for row in tqdm(np.split(prokuror, num_split)):
    sentences = row["line"].values.tolist()
    sentense_embeddings = tokenizer(sentences, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    embeddings = model.metalurg_prob_model.get_embedding(sentense_embeddings).cpu().numpy()  # .astype(np.float16)
    for sentence_id, inn, embedding in zip(row.index, row["INN"], embeddings):
        arr.append({"sentence_id":sentence_id, "inn":inn, "arr":embedding})

100%|██████████| 94368/94368 [5:38:52<00:00,  4.64it/s]  


In [11]:
arr = np.array(arr)
with open('data/allprokuror_emb_32.npy', 'wb') as f:
    np.save(f, arr)

# Compute distances

In [60]:
%%time
with open('data/metalurgs_emb_32.npy','rb') as f:
     metalurg = np.load(f, allow_pickle=True)

CPU times: user 2.76 s, sys: 1.22 s, total: 3.99 s
Wall time: 3.99 s


In [92]:
%%time
with open('data/allprokuror_emb_32.npy','rb') as f:
     prokuror = np.load(f, allow_pickle=True)

CPU times: user 21 s, sys: 9.11 s, total: 30.1 s
Wall time: 1min 45s


In [63]:
# metalurg[0], prokuror[0]

In [93]:
# from numba import cuda

In [116]:
@jit(nopython=True,nogil=True,cache=True, fastmath=True)
def lin(y1, y2):
    return cp.linalg.norm(y1-y2)

@jit(nopython=True,nogil=True,cache=True)
def find_neighbors(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = lin(metalurg_arr, arr)
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]

In [94]:
prokuror_arr = []
prokuror_id  = []
is_2d = len(prokuror[0]['arr'].shape) == 2
for prokuror_elem in tqdm(prokuror):
    if is_2d:
        prokuror_arr.append(prokuror_elem["arr"][0])
    else:
        prokuror_arr.append(prokuror_elem["arr"])
    prokuror_id.append(prokuror_elem["sentence_id"])
prokuror_arr = np.array(prokuror_arr)
prokuror_id = np.array(prokuror_id)

100%|██████████| 3019791/3019791 [00:01<00:00, 1913158.08it/s]


In [127]:
!pip install cupy-cuda11x

Defaulting to user installation because normal site-packages is not writeable


In [117]:
res = []
for metalurg_elem in tqdm(metalurg):
    metalurg_arr = np.array(metalurg_elem["arr"])
    distances, sentences_id = find_neighbors(metalurg_arr, prokuror_arr, prokuror_id)
    for dist, sent_id in zip(distances, sentences_id):
        res.append({
            "metalurg_id":metalurg_elem['sentence_id'],
            "metalurg_inn":metalurg_elem["inn"],
            "prokuror_sentence_id": sent_id,
            "prokuror_distance": dist,
        })

  0%|          | 0/935699 [00:00<?, ?it/s]


AttributeError: module 'cupy' has no attribute 'linalg'

In [56]:
df = pd.DataFrame(res)
df.head()

Unnamed: 0,metalurg_id,metalurg_inn,prokuror_sentence_id,prokuror_distance
0,58748,3525299900,225412,0.585173
1,58748,3525299900,659052,0.62499
2,58748,3525299900,871227,0.62499
3,58749,3525299900,22032,0.777344
4,58749,3525299900,715153,0.777344


In [57]:
df.to_csv("data/metalurg_dist2.csv.zip", compression="zip")

## Test euclidean distances

In [1]:
from sklearn.metrics.pairwise import euclidean_distances
from numba import jit
import math
import numpy as np

In [2]:
%%time
with open('data/metalurgs_emb_32.npy','rb') as f:
     metalurg = np.load(f, allow_pickle=True)

CPU times: user 2.6 s, sys: 1.25 s, total: 3.84 s
Wall time: 3.84 s


In [3]:
%%time
with open('data/prokuror_emb_32.npy','rb') as f:
     prokuror = np.load(f, allow_pickle=True)

CPU times: user 3.59 ms, sys: 1.48 ms, total: 5.07 ms
Wall time: 4 ms


In [4]:
metalurg.shape, prokuror.shape

((935699,), (707,))

In [5]:
metalurg_arr = metalurg[0]["arr"]
prokuror_arr = prokuror[0]["arr"][0]

In [6]:
metalurg_arr.shape, prokuror_arr.shape

((768,), (768,))

In [7]:
%timeit np.linalg.norm(metalurg_arr - prokuror_arr)

5.98 µs ± 58.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [8]:
%timeit math.sqrt(sum([(a - b)**2 for a, b in zip(metalurg_arr, prokuror_arr)]))

2.11 ms ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%timeit np.sqrt(np.sum(np.power(metalurg_arr - prokuror_arr, 2)))

18.3 µs ± 35.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [10]:
%timeit np.sqrt(np.sum((metalurg_arr - prokuror_arr)**2))

8.21 µs ± 45.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [11]:
%timeit (np.sum((metalurg_arr - prokuror_arr)**2))

6.52 µs ± 114 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [12]:
v1 = [metalurg_arr]
v2 = [prokuror_arr]

In [13]:
%timeit euclidean_distances(v1, v2)

138 µs ± 3.91 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
@jit(nopython=True)
def jit_linalg(y1, y2):
    return np.linalg.norm(y1-y2)

In [15]:
%timeit jit_linalg(metalurg_arr, prokuror_arr)

The slowest run took 38.88 times longer than the fastest. This could mean that an intermediate result is being cached.
6.49 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%timeit sum((metalurg_arr - prokuror_arr)**2)

105 µs ± 2.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [17]:
%timeit ((metalurg_arr - prokuror_arr)**2).sum()

4.63 µs ± 34.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [18]:
@jit(nopython=True)
def test_jit_sum(y1, y2):
    return ((y1 - y2)**2).sum()

In [19]:
test_jit_sum(metalurg_arr, prokuror_arr)

6.049072265625

In [20]:
%timeit test_jit_sum(metalurg_arr, prokuror_arr)

1.08 µs ± 0.301 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [21]:
@jit(nopython=True)
def test_jit_sum_square(y1, y2):
    return (np.power(y1 - y2, 2)).sum()

In [22]:
test_jit_sum_square(metalurg_arr, prokuror_arr)

6.049072265625

In [23]:
%timeit test_jit_sum_square(metalurg_arr, prokuror_arr)

1.07 µs ± 1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [24]:
prokuror_arr = []
prokuror_id  = []
for prokuror_elem in prokuror:
    prokuror_arr.append(prokuror_elem["arr"])
    prokuror_id.append(prokuror_elem["sentence_id"])

In [25]:
metalurg_arr = np.array(metalurg[0]["arr"])
prokuror_arr = np.array(prokuror_arr)
prokuror_id = np.array(prokuror_id)

In [26]:
def original(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = []
    for prokuror_elem, id in zip(prokuror_arr, prokuror_id):
        distance = np.linalg.norm(metalurg_arr - prokuror_elem[0])
        prok_dist.append({"sent":id, "dist": distance})
    sorted(prok_dist, key=lambda x: x["dist"])[:3]

In [54]:
@jit(nopython=True,nogil=True,cache=True, fastmath=True)
def linalg(y1, y2):
    return np.linalg.norm(y1-y2)

def numba_v1(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = []
    for prokuror_elem, id in zip(prokuror_arr, prokuror_id):
        distance = linalg(metalurg_arr, prokuror_elem[0])
        prok_dist.append({"sent":id, "dist": distance})
    sorted(prok_dist, key=lambda x: x["dist"])[:3]
    
linalg(np.array([1., 2., 3.]), np.array([1., 2., 3.])) # compile

0.0

In [50]:
@jit(nopython=True,nogil=True,cache=True)
def test_cycle(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = np.linalg.norm(metalurg_arr - arr[0])
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]
test_cycle(np.array([1., 2., 3.]), np.array([[1.], [2.], [3.]]), np.array([1, 2, 3, 4])) # compile

(array([1.41421356, 2.23606798, 2.23606798]), array([2, 1, 3]))

In [55]:
@jit(nopython=True,nogil=True,cache=True)
def numba_in_numba(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = linalg(metalurg_arr, arr[0])
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]
numba_in_numba(np.array([1., 2., 3.]), np.array([[1.], [2.], [3.]]), np.array([1, 2, 3, 4])) # compile

(array([1.41421356, 2.23606798, 2.23606798]), array([2, 1, 3]))

In [58]:
%timeit original(metalurg_arr, prokuror_arr, prokuror_id)
%timeit numba_v1(metalurg_arr, prokuror_arr, prokuror_id)
%timeit test_cycle(metalurg_arr, prokuror_arr, prokuror_id)
%timeit numba_in_numba(metalurg_arr, prokuror_arr, prokuror_id)

5.11 ms ± 69.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.17 ms ± 4.12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
547 µs ± 523 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
491 µs ± 120 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Other

In [2]:
df = pd.read_csv("data/metalurg_dist2.csv.zip", compression="zip", index_col=0)

In [58]:
df.head()

Unnamed: 0,metalurg_id,metalurg_inn,prokuror_sentence_id,prokuror_distance
0,58748,3525299900,225412,0.585173
1,58748,3525299900,659052,0.62499
2,58748,3525299900,871227,0.62499
3,58749,3525299900,22032,0.777344
4,58749,3525299900,715153,0.777344


In [59]:
df["prokuror_distance"].quantile([i/10 for i in range(10)])

0.0    0.419605
0.1    0.490608
0.2    0.515041
0.3    0.537816
0.4    0.561641
0.5    0.588030
0.6    0.617589
0.7    0.648411
0.8    0.702091
0.9    0.794378
Name: prokuror_distance, dtype: float64