In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.neighbors import LocalOutlierFactor
import numpy as np

In [2]:
# Load the lemmatized bot comments used both for fine-tuning and for fitting
# the outlier detector.
data = pd.read_csv('out_w_comm_lemma.csv')
# read_csv turns empty cells into NaN (a float), which the tokenizer cannot
# encode; drop those rows and force everything else to str.
bot_comments = data['lemmatized_text'].dropna().astype(str).tolist()

In [3]:
# Tokenizer and masked-LM model are loaded from the same checkpoint id so the
# vocabulary always matches the model's embedding table.
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)



tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [4]:
class BotDataset(Dataset):
    """Map-style dataset over comments that are tokenized once, up front.

    The whole corpus is encoded in a single tokenizer call (truncated to at
    most 128 tokens, padded, returned as PyTorch tensors) and stored in
    ``self.encodings``. Indexing returns one row of every encoded field.
    """

    def __init__(self, comments):
        """Encode ``comments`` with the module-level ``tokenizer``."""
        self.encodings = tokenizer(
            comments,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt',
        )

    def __len__(self):
        """Number of encoded comments."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict mapping each encoding field to its ``idx``-th entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

In [5]:
# Tokenize the bot corpus, then serve it in shuffled mini-batches of four.
bot_dataset = BotDataset(comments=bot_comments)
bot_loader = DataLoader(
    dataset=bot_dataset,
    batch_size=4,
    shuffle=True,
)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Switch to training mode; left as the last expression so the cell still
# displays the model summary.
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [7]:
# AdamW over every model parameter with a fixed learning rate of 5e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [8]:
# Masked-language-model fine-tuning on the bot comments.
#
# The labels must differ from the inputs: if the model sees every token and is
# asked to predict those same tokens, the objective collapses to copying the
# input and nothing useful is learned. Each step therefore replaces a random
# subset of real tokens with [MASK] and scores only those positions.
# This is a simplified scheme: every selected token becomes [MASK].
MLM_PROBABILITY = 0.15  # fraction of eligible tokens masked per step
NUM_EPOCHS = 3

model.train()  # make sure dropout is on even if an earlier cell switched modes
for epoch in range(NUM_EPOCHS):
    epoch_loss, num_steps = 0.0, 0
    for batch in bot_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Special tokens ([CLS], [SEP], [PAD], ...) are never masked.
        is_special = torch.zeros_like(input_ids, dtype=torch.bool)
        for special_id in tokenizer.all_special_ids:
            is_special |= input_ids == special_id
        maskable = attention_mask.bool() & ~is_special

        # Independently select each eligible position with MLM_PROBABILITY.
        selected = maskable & (torch.rand(input_ids.shape, device=device) < MLM_PROBABILITY)
        if not selected.any():
            # With no scored position the loss would be NaN; skip the batch.
            continue

        # -100 is the index the loss ignores, so only masked positions count.
        labels = input_ids.masked_fill(~selected, -100)
        masked_input_ids = input_ids.masked_fill(selected, tokenizer.mask_token_id)

        optimizer.zero_grad()
        outputs = model(masked_input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_steps += 1

    # Report the epoch mean rather than whichever batch happened to be last.
    mean_loss = epoch_loss / max(num_steps, 1)
    print(f'Epoch {epoch+1}, Loss: {mean_loss}')

KeyboardInterrupt: 

In [None]:
import os

# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
save_directory = './finetuned_rubert'
# exist_ok=True creates the directory only when missing, without a separate
# existence check that could race with another process.
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print("Model and tokenizer saved to", save_directory)

In [ ]:
def get_embeddings(comments, batch_size=8):
    """Embed each comment as the encoder's first-token ([CLS]) hidden state.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        comments: Sequence of strings to embed.
        batch_size: Number of comments encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per comment. For empty input the
        result has shape ``(0, model.config.hidden_size)``.
    """
    comments = list(comments)
    if not comments:
        # np.vstack raises on an empty list; return a well-shaped empty array.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Dropout must be off while extracting features, otherwise the same text
    # produces different embeddings on every call. The previous mode is
    # restored afterwards so callers that are mid-training are not affected.
    was_training = model.training
    model.eval()
    embeddings = []
    try:
        with torch.no_grad():
            for i in range(0, len(comments), batch_size):
                batch = comments[i:i+batch_size]
                encodings = tokenizer(batch, truncation=True, padding=True, max_length=128, return_tensors='pt')
                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)
                # Call the bare encoder (no MLM head) and keep position 0 only.
                outputs = model.bert(input_ids, attention_mask=attention_mask)
                cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(cls_embeddings)
    finally:
        model.train(was_training)
    return np.vstack(embeddings)

In [ ]:
# Embed the bot corpus; these vectors are what the outlier detector is fit on.
bot_embeddings = get_embeddings(comments=bot_comments)

In [ ]:
# novelty=True is required so the fitted detector can later score unseen data
# through predict().
lof = LocalOutlierFactor(novelty=True, n_neighbors=20)
lof.fit(X=bot_embeddings)

In [ ]:
# Score freshly parsed comments against the detector fitted on bot comments.
new_df = pd.read_csv('../parser_comments/comments.csv')
# Empty CSV cells are read as NaN (a float), which the tokenizer rejects.
# Replace them with '' instead of dropping rows so that predictions[i] still
# corresponds to new_df.iloc[i].
# NOTE(review): the detector was fitted on 'lemmatized_text' but raw 'text' is
# scored here; confirm whether these comments should be lemmatized first.
new_comments = new_df['text'].fillna('').astype(str).tolist()
new_embeddings = get_embeddings(new_comments)
# LocalOutlierFactor.predict returns +1 for inliers (close to the fitted bot
# comments) and -1 for outliers.
predictions = lof.predict(new_embeddings)