Загружаем скачанный классификатор токсичности:

In [2]:
# Mount Google Drive (Colab only) so that the files referenced below under
# /content/drive/MyDrive/... (model, test set, embeddings) are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers torch sentencepiece gensim

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.9 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 3.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.many

In [4]:
import pandas as pd
import numpy as np

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Alternative source for the classifier (presumably a Hugging Face hub model id
# -- confirm before use):
# path_to_roberta = "unitary/multilingual-toxic-xlm-roberta"
# Local copy of the fine-tuned classifier stored on the mounted Google Drive.
path_to_roberta = "/content/drive/MyDrive/Yandex-MLCup-2021/nlp/trained_roberta"
tokenizer = AutoTokenizer.from_pretrained(path_to_roberta)

# .cuda() places the model on the GPU, so this cell requires a CUDA runtime.
model = AutoModelForSequenceClassification.from_pretrained(path_to_roberta).cuda()

# Column of the classifier output that logits_to_toxic_probas returns;
# -1 selects the last column.
TOXIC_CLASS=-1
# Selects the word-boundary rule used by is_word_start:
# 'sentencepiece' (word starts carry a leading '▁') or 'bert' (continuations carry '##').
TOKENIZATION_TYPE='sentencepiece'


Ниже функции для применения классификатора

In [10]:
from torch import softmax, sigmoid
import numpy as np


# Set of Cyrillic letters 'а'..'я' and Latin letters 'a'..'z', each in both
# lower and upper case. Note that the range 'а'..'я' does not contain 'ё'.
ALLOWED_ALPHABET = {
    variant
    for first, last in (('а', 'я'), ('a', 'z'))
    for code in range(ord(first), ord(last) + 1)
    for variant in (chr(code), chr(code).upper())
}


def logits_to_toxic_probas(logits):
    """Convert raw classifier logits into toxicity probabilities.

    When the last axis has more than one entry the logits go through a softmax
    over that axis; with a single entry they go through a sigmoid. Column
    ``TOXIC_CLASS`` of the activated tensor is then moved to the CPU, detached
    and returned as a NumPy array (one value per row of ``logits``).
    """
    is_multiclass = logits.shape[-1] > 1
    probas = softmax(logits, -1) if is_multiclass else sigmoid(logits)
    return probas[:, TOXIC_CLASS].cpu().detach().numpy()


def is_word_start(token):
    """Tell whether ``token`` opens a new word under ``TOKENIZATION_TYPE``.

    'sentencepiece': a word start carries the '▁' prefix.
    'bert': every token that does not carry the '##' prefix is a word start.

    Raises:
        ValueError: if ``TOKENIZATION_TYPE`` is neither of the two.
    """
    if TOKENIZATION_TYPE not in ('sentencepiece', 'bert'):
        raise ValueError("Unknown tokenization type")
    if TOKENIZATION_TYPE == 'sentencepiece':
        return token.startswith('▁')
    return not token.startswith('##')


def normalize(sentence, max_tokens_per_word=20):
    """Lower-case a sentence, keep only letters and cap the length of each word.

    Every non-alphabetic character is replaced by a space, the text is run
    through the module-level ``tokenizer`` and re-assembled from sub-word
    tokens. Within one word, continuation tokens are kept only while their
    count stays below ``max_tokens_per_word``; the rest of the word is dropped.

    Args:
        sentence: input text.
        max_tokens_per_word: limit applied to the continuation-token counter
            of each word.

    Returns:
        The normalised text: words separated by single spaces, no leading or
        trailing whitespace.
    """
    lowered = sentence.lower()
    cleaned = ''.join(ch if ch.isalpha() else ' ' for ch in lowered)
    token_ids = tokenizer(cleaned)['input_ids']
    # Drop the first and the last token -- presumably the special tokens the
    # tokenizer wraps around the sequence (confirm for other tokenizers).
    pieces = tokenizer.convert_ids_to_tokens(token_ids)[1:-1]

    chunks = []
    continuation_count = 0
    for piece in pieces:
        if is_word_start(piece):
            # A new word: reset the counter and separate it with a space.
            continuation_count = 0
            chunks.append(' ')
            chunks.append(piece.lstrip('▁#'))
            continue
        continuation_count += 1
        if continuation_count < max_tokens_per_word:
            chunks.append(piece.lstrip('#▁'))

    return ''.join(chunks).strip()

def iterate_batches(data, batch_size=40):
    """Yield consecutive lists of up to ``batch_size`` items taken from ``data``.

    Every yielded list is a fresh object. The last list may be shorter; nothing
    is yielded for an empty input.
    """
    pending = []
    for item in data:
        pending.append(item)
        if len(pending) < batch_size:
            continue
        yield pending
        pending = []
    if pending:
        yield pending

from tqdm.auto import tqdm
def predict_toxicity(sentences, batch_size=5, threshold=0.5, return_scores=False, verbose=True, device='cuda'):
    """Score sentences with the module-level toxicity classifier.

    Each sentence is normalised with ``normalize(..., max_tokens_per_word=5)``,
    tokenised (padded, truncated to 512 tokens) and passed through ``model``.

    Args:
        sentences: sized collection of strings (``len()`` is needed for the
            progress-bar total).
        batch_size: number of sentences per forward pass.
        threshold: cut-off applied to the probabilities when
            ``return_scores`` is false.
        return_scores: if true, return probabilities; otherwise return
            ``probability >= threshold`` flags.
        verbose: show a tqdm progress bar over the batches.
        device: torch device the model and the inputs are moved to.

    Returns:
        A list with one NumPy scalar per input sentence (float probabilities
        or booleans, depending on ``return_scores``).
    """
    results = []
    # tqdm expects an integer total; the float from np.ceil was rendered as "0/221.0".
    num_batches = int(np.ceil(len(sentences) / batch_size))
    # Moving the model is loop-invariant, so do it once rather than per batch.
    model_on_device = model.to(device)

    batches = iterate_batches(sentences, batch_size)
    if verbose:
        batches = tqdm(batches, total=num_batches)

    for batch in batches:
        normalized = [normalize(sent, max_tokens_per_word=5) for sent in batch]
        tokenized = tokenizer(normalized, return_tensors='pt', padding=True, max_length=512, truncation=True)

        inputs = {key: val.to(device) for key, val in tokenized.items()}
        logits = model_on_device(**inputs).logits
        preds = logits_to_toxic_probas(logits)
        if not return_scores:
            preds = preds >= threshold
        results.extend(preds)
    return results


Читаем тестовый набор

In [12]:
# Read the public test set (one comment per line) and normalise every line.
# The file contains Cyrillic text, so the encoding is stated explicitly instead
# of relying on the platform default.
with open("/content/drive/MyDrive/Yandex-MLCup-2021/nlp/public_testset.txt", 'rt', encoding='utf-8') as f:
    texts = [normalize(line) for line in f]

Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


Вычисляем токсичность отдельных слов

In [13]:
import torch

# Sorted vocabulary of all distinct words in the normalised test set.
words = sorted({word for text in texts for word in text.split()})

# Score every word on its own; inference_mode switches off autograd tracking.
with torch.inference_mode():
    word_toxicities = predict_toxicity(words, batch_size=100, return_scores=True)

# Lookup table: word -> toxicity probability.
toxicity = dict(zip(words, word_toxicities))


  0%|          | 0/221.0 [00:00<?, ?it/s]

Ниже смотрим на самые токсичные слова, а затем читаем эмбеддинги слов и описываем функции их обработки

In [14]:
# One row per vocabulary word together with its toxicity probability.
word_toxicity_df = pd.DataFrame({'word': words, 'toxicity': word_toxicities})

In [15]:
# Show the 20 words with the highest toxicity score.
word_toxicity_df.sort_values(by='toxicity', ascending=False).head(20)

Unnamed: 0,word,toxicity
12400,педерастов,0.990629
19785,ублюдочных,0.990629
12721,пиндосов,0.990628
19784,ублюдок,0.990627
4806,ебанутых,0.990624
20263,уродуйбезкультурье,0.990622
2923,выродки,0.990622
4428,долбоящеры,0.990617
12720,пиндосии,0.990608
12697,пидорас,0.990603


In [16]:
import gensim
from pymystem3 import Mystem

# Mystem lemmatizer; get_w2v_indicies uses it to fall back to a word's lemma
# when the word itself is missing from the embedding vocabulary.
# The cell output shows that creating it may download the mystem binary.
stemmer = Mystem()

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [17]:
# Load the pre-computed word embeddings.
# NOTE(review): allow_pickle=True makes np.load unpickle stored objects, which
# can execute arbitrary code -- only use this with a trusted file.
embs_file = np.load('/content/drive/MyDrive/Yandex-MLCup-2021/nlp/embeddings_with_lemmas.npz', allow_pickle=True)
# Embedding matrix, one row per vocabulary index.
embs_vectors = embs_file['vectors']
# Rows scaled to unit L2 norm.
# NOTE(review): an all-zero row would divide by zero here -- confirm none exist.
embs_vectors_normed = embs_vectors / np.linalg.norm(embs_vectors, axis=1, keepdims=True)
# .item() unwraps the stored object; it is used below as a mapping word -> row index.
embs_voc = embs_file['voc'].item()

# Inverse mapping: row index -> word. If several words point at the same row,
# the first one met while iterating embs_voc is kept.
embs_voc_by_id = [None for i in range(len(embs_vectors))]
for word, idx in embs_voc.items():
    if embs_voc_by_id[idx] is None:
        embs_voc_by_id[idx] = word

In [18]:
def get_w2v_indicies(a):
    """Map words to their row indices in the embedding matrix.

    Args:
        a: a whitespace-separated string or an iterable of words.

    Returns:
        A list with one entry per word: the index stored in ``embs_voc`` for
        the word itself, otherwise the index of its first Mystem lemma, or
        ``None`` when neither is in the vocabulary.
    """
    tokens = a.split() if isinstance(a, str) else a
    indices = []
    for token in tokens:
        if token not in embs_voc:
            # Unknown surface form: retry with the lemma.
            token = stemmer.lemmatize(token)[0]
        indices.append(embs_voc.get(token, None))
    return indices

def calc_embs(words):
    """Look up the (un-normalised) embedding vector for each word.

    The words are normalised with ``normalize``, joined with spaces and split
    again, so the result has one entry per word of the normalised text -- which
    may differ from ``len(words)``. Words missing from the vocabulary give
    ``None``.
    """
    normalized_text = ' '.join(normalize(word) for word in words)
    vectors = []
    for index in get_w2v_indicies(normalized_text):
        vectors.append(embs_vectors[index] if index is not None else None)
    return vectors

Сложим эмбеддинги нетоксичных слов в kd-дерево, чтобы можно было быстро искать ближайших соседей

In [19]:
# Embedding rows of vocabulary words that were scored and found non-toxic
# (score <= 0.5). Words without a score are treated as toxic and left out.
nontoxic_emb_inds = [
    emb_ind
    for vocab_word, emb_ind in embs_voc.items()
    if vocab_word in toxicity and toxicity[vocab_word] <= 0.5
]
embs_vectors_normed_nontoxic = embs_vectors_normed[nontoxic_emb_inds]

In [20]:
from sklearn.neighbors import KDTree
# KD-tree over the unit-normalised embeddings of the non-toxic words; row i of
# the tree corresponds to embedding index nontoxic_emb_inds[i].
embs_tree = KDTree(embs_vectors_normed_nontoxic, leaf_size=20)

Функция находит самое близкое нетоксичное слово по предпосчитанным эмбеддингам слов

In [21]:
from functools import lru_cache

@lru_cache()
def find_closest_nontoxic(word, threshold=0.5, allow_self=False):
    """Return a non-toxic replacement for ``word``, or ``None`` if there is none.

    Args:
        word: the word to replace.
        threshold: highest toxicity score still regarded as non-toxic.
        allow_self: whether the nearest neighbour may be the (normalised) word
            itself. A word whose own score is already at or below ``threshold``
            is returned unchanged regardless of this flag.

    Returns:
        ``word`` itself when it is non-toxic; otherwise the nearest neighbour
        in ``embs_tree`` if it passes the toxicity check; ``None`` when the
        word is unknown, has no embedding, or the neighbour is rejected.
    """
    if toxicity.get(word, 1.0) <= threshold:
        return word

    if word not in toxicity and word not in embs_voc:
        return None

    # NOTE(review): the word's score is already known to exceed `threshold`
    # here, so for ordinary float scores this min() leaves it unchanged.
    threshold = min(toxicity.get(word, threshold), threshold)
    word = normalize(word)
    word_emb = calc_embs([word])
    # calc_embs always returns a list: it is empty when normalisation leaves no
    # word at all, and holds None for words without an embedding.
    if not word_emb or word_emb[0] is None:
        return None

    # Only the first vector is used (as before); querying it alone keeps a
    # missing embedding in a later position from breaking the KD-tree query.
    # The query uses KDTree's default number of neighbours.
    for i in embs_tree.query([word_emb[0]])[1][0]:
        other_word = embs_voc_by_id[nontoxic_emb_inds[i]]
        if (other_word != word or allow_self) and toxicity.get(other_word, 1.0) <= threshold:
            return other_word
    return None

Заменяем токсичные слова на ближайшие по эмбеддингам не-токсичные

In [22]:
def detox(line):
    """Replace every toxic word of ``line`` with its nearest non-toxic neighbour.

    The line is normalised and split into words. A word with no replacement
    becomes an empty string, so the joined result may contain consecutive
    spaces at that position.
    """
    replacements = []
    for word in normalize(line).split():
        substitute = find_closest_nontoxic(word, allow_self=True)
        replacements.append(substitute if substitute else '')
    return ' '.join(replacements)

In [None]:
# Detoxify every test-set line, with a progress bar.
fixed_texts = [detox(text) for text in tqdm(texts)]

  0%|          | 0/2500 [00:00<?, ?it/s]

Запишем результат в файл

In [None]:
# Write one detoxified text per line. The texts are Cyrillic, so the encoding
# is stated explicitly instead of relying on the platform default.
with open('baseline_fixed.txt', 'wt', encoding='utf-8') as f:
    for text in fixed_texts:
        print(text, file=f)

Скор, если никак не изменять комментарии:

In [None]:
!python3.7 score.py public_testset.short.txt public_testset.short.txt  --embeddings embeddings_with_lemmas.npz --lm lm.binary --model ./trained_roberta/ --device cuda --score -

Loading tokenizer
Loading model
Loading texts
Loading LM
Loading embeddings
Scoring
 10%|████                                    | 50/500.0 [00:01<00:15, 29.21it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████| 500/500.0 [00:20<00:00, 24.28it/s]
2500it [00:26, 95.03it/s] 
average toxicity: 0.6330938
mean lmdiff: 1.0
mean distance_score: 1.0
36.69


Скор бейзлайна:

In [None]:
!python3.7 score.py public_testset.short.txt baseline_fixed.txt  --embeddings embeddings_with_lemmas.npz --lm lm.binary --model ./trained_roberta/ --device cuda --score -

Loading tokenizer
Loading model
Loading texts
Loading LM
Loading embeddings
Scoring
 20%|███████▊                               | 100/500.0 [00:03<00:14, 27.69it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████| 500/500.0 [00:19<00:00, 25.01it/s]
2500it [00:40, 62.24it/s]
average toxicity: 0.46444112
mean lmdiff: 0.9444674231112382
mean distance_score: 0.8119417961430562
42.11


Сохраним данные для бейзлайна online-задачи

In [None]:
!mkdir -p online_baseline

In [None]:
import pickle as pkl

# Two objects are pickled back-to-back into the same file, so a reader has to
# call pickle.load twice on one handle, in this order.
with open('./online_baseline/data.pkl', 'wb') as f:
    pkl.dump(toxicity, f)  # dict: word -> toxicity score
    pkl.dump(nontoxic_emb_inds, f)  # list of embedding row indices of non-toxic words