In [1]:
%%capture
!pip install gdown
!gdown --fuzzy https://drive.google.com/file/d/1pJ2_ddbtLZeIuvvnrERumzQ6iibUagpO/view?usp=sharing
!gdown --fuzzy https://drive.google.com/file/d/1Ew5hefhUiffKG0-IhjIybTYkLLkGtNjq/view?usp=sharing

In [2]:
!nvidia-smi

Sat Oct  8 17:32:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [77]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel

import os
from humanize import naturalsize

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
tokenizer_RabotaRu = AutoTokenizer.from_pretrained("RabotaRu/HRBert-mini")
model_RabotaRu = AutoModelForMaskedLM.from_pretrained("RabotaRu/HRBert-mini").to(device)

In [10]:
class WorkDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __getitem__(self, idx):
        text = self.data[idx]
        return text

    def __len__(self):
        return len(self.data)


class collate_fn:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, batch):
        return self.tokenizer(batch, padding=True,
                              max_length=300, truncation=True,
                              return_tensors='pt')

    
def get_loader(dataset, shuffle, batch_size, tokenizer):
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=False,
        num_workers=0,
        collate_fn=collate_fn(tokenizer)
    )
    return loader

In [11]:
def get_embeddings(model, loader, device):
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(loader):
            batch = batch.to(device)
            embeddings.extend(model(**batch).last_hidden_state.mean(-2).detach().cpu().numpy())
    return embeddings

In [63]:
df = pd.read_parquet("v.pq")
interfax = pd.read_parquet("interfax.pq")

In [70]:
final_df = interfax[["link", "title"]].rename({"link": "url", "title": "text"}, axis=1)\
                .append(df[["url", "text"]]).reset_index()

In [71]:
dataset = WorkDataset(final_df["text"].tolist())
loader = get_loader(dataset, shuffle=False,
                    batch_size=64,
                    tokenizer=tokenizer_RabotaRu)

In [72]:
embeddings = get_embeddings(model_RabotaRu.roberta, loader, device)

  0%|          | 0/1164 [00:00<?, ?it/s]

In [73]:
final_df["embeddings_text"] = embeddings

In [74]:
final_df

Unnamed: 0,index,url,text,embeddings_text
0,0,https://www.interfax.ru/world/861378,Подданных Великобритании ожидают значительные ...,"[-0.24219716, 0.951448, 1.1724422, -0.10878857..."
1,1,https://www.interfax.ru/business/861377,"Биржевые цены на Аи-95 снизились на 6,13%, на ...","[-0.0568328, -0.14791739, 0.34602273, -0.16481..."
2,2,https://www.interfax.ru/russia/861385,"""Зеленый"" стандарт для многоквартирного жилья ...","[-0.8447949, 0.4040649, 0.34496567, -0.6385873..."
3,3,https://www.interfax.ru/business/861381,Tesla собралась построить завод по производств...,"[-0.75750387, -0.082782075, 0.8371053, 0.05072..."
4,4,https://www.interfax.ru/world/861384,В Пентагоне провели секретные совещания об уси...,"[-0.22298387, 0.50802124, 0.6974535, -0.108342..."
...,...,...,...,...
74440,1374,https://prosto.rabota.ru/post/neuplata-strahov...,Неуплату страховых взносов в крупном и особо ...,"[0.029552562, 1.0568345, 0.3930199, 0.14269774..."
74441,1375,https://prosto.rabota.ru/post/sberbank-i-yande...,Сейчас «Яндекс.Маркет» — это поисковик по онл...,"[-0.84356415, 1.0104287, 0.26565373, 0.2307545..."
74442,1376,https://prosto.rabota.ru/post/letniy-sostav-te...,В конце июля Минтранс России опубликовал прое...,"[0.17739405, 1.3176425, 0.38538116, -0.0289969..."
74443,1377,https://prosto.rabota.ru/post/roskomnadzor-otk...,В Роскомнадзорза первую половину 2017 года по...,"[-0.19243573, 1.0174437, 0.26168004, 0.2350805..."


In [75]:
final_df.to_feather("rabota_interfax_vac_news.feather")

In [78]:
naturalsize(os.path.getsize("rabota_interfax_vac_news.feather"))

'122.1 MB'