# Загрузка и разделение данных

In [1]:
import gzip

from dataclasses import dataclass
from typing import Iterator

@dataclass
class Text:
    label: str
    title: str
    text: str

def read_texts(fn: str) -> Iterator[Text]:
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            yield Text(*line.strip().split("\t"))

texts = list(read_texts("../data/news.tar.gz"))

In [3]:
from sklearn.model_selection import train_test_split


X = []
y = []

for item in texts:
    label, text = item.label, item.text
    X.append(text)
    y.append(label)

In [7]:
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

# Препроцессинг

In [16]:
from tqdm.notebook import tqdm
import string
import pymorphy2
import nltk

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian') 
stop_words.extend(['«', '»', '.)', '?.."', '..?', # добавим некоторые пунктуационные знаки из текстов
                   '--', '…', '...', '—', '–', '>.',
                   ').', '&#', '])', '».', '".',
                   '?..', '»,', '",', ',['])

word_tokenizer = nltk.WordPunctTokenizer()
morph = pymorphy2.MorphAnalyzer()

def preprocess_texts(texts):
    preprocessed_tokens = []
    for text in tqdm(texts):
        tokens = word_tokenizer.tokenize(text)
        tokens = [
            word for word in tokens if (word not in string.punctuation \
            and word not in stop_words and not word.isnumeric())
        ]
        tokens = [morph.parse(x)[0].normal_form for x in tokens]
        preprocessed_tokens.append(tokens)
    return preprocessed_tokens

[nltk_data] Downloading package stopwords to /home/demsy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
X_train_texts_preprocessed = preprocess_texts(X_train_texts)
X_test_texts_preprocessed = preprocess_texts(X_test_texts)

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

# Обучение модели векторного представления слов [fasttext]

In [57]:
import tempfile
import os
import fasttext

with tempfile.NamedTemporaryFile(suffix=str(os.getpid()), mode="w", encoding="utf-8") as tmp:
    for text_tokens in X_train_texts_preprocessed + X_test_texts_preprocessed:
        text = " ".join(text_tokens)
        tmp.write(f"{text}\n")
    tmp.seek(0)

    model = fasttext.train_unsupervised(tmp.name, model='skipgram')

Read 1M words
Number of words:  20882
Number of labels: 0
Progress: 100.0% words/sec/thread:   29698 lr:  0.000000 avg.loss:  2.084731 ETA:   0h 0m 0s


Проверка логичности представлений

In [58]:
model.get_nearest_neighbors('футбол')

[(0.8110319972038269, 'футбольный'),
 (0.7906937599182129, 'футболист'),
 (0.7497872710227966, 'футболка'),
 (0.7398141026496887, 'фифа'),
 (0.7286355495452881, 'чемпионат'),
 (0.7116661071777344, 'сборная'),
 (0.7075999975204468, 'уефа'),
 (0.690766453742981, 'чм'),
 (0.6866900324821472, 'матч'),
 (0.6861810088157654, 'ффу')]

In [55]:
model.get_dimension()

100

# Doc2Vec

In [73]:
import numpy as np

X_train_text_vectors = []
X_test_text_vectors = []

def vectorize_texts(texts_tokens):
    ret = []
    for text_tokens in tqdm(texts_tokens):
        tokens_vectors = []
        for token in text_tokens:
            token_vector = model.get_word_vector(token)
            tokens_vectors.append(token_vector)
        text_vector = np.mean(tokens_vectors, axis=0)
        ret.append(text_vector)
    return ret

X_train_text_vectors = vectorize_texts(X_train_texts_preprocessed)
X_test_text_vectors = vectorize_texts(X_test_texts_preprocessed)

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

# SVM классификатор на усредненных векторах-токенах

## Обучение

In [75]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train_text_vectors, y_train)

## Метрики

In [80]:
from sklearn.metrics import accuracy_score, f1_score

y_train_pred = svm.predict(X_train_text_vectors)
y_test_pred = svm.predict(X_test_text_vectors)

print(f"train accuracy: {accuracy_score(y_train, y_train_pred):.3f}, train macro-f1: {f1_score(y_train, y_train_pred, average='macro'):.3f}")
print(f"test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, test macro-f1: {f1_score(y_test, y_test_pred, average='macro'):.3f}")

train accuracy: 0.790, train macro-f1: 0.580
test accuracy: 0.781, test macro-f1: 0.567


# Использование sentence_transformers

In [82]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

## Векторизация сырых данных

In [92]:
def encode_sentences(sentences):
    ret = []
    for text in tqdm(sentences):
        query_embedding = model.encode(text)
        ret.append(query_embedding)
    return ret

X_train_raw_text_vectors = encode_sentences(X_train_texts)
X_test_raw_text_vectors = encode_sentences(X_test_texts)

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

### SVM для сырых данных

In [93]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train_raw_text_vectors, y_train)

In [94]:
y_train_pred = svm.predict(X_train_raw_text_vectors)
y_test_pred = svm.predict(X_test_raw_text_vectors)

print(f"train accuracy: {accuracy_score(y_train, y_train_pred):.3f}, train macro-f1: {f1_score(y_train, y_train_pred, average='macro'):.3f}")
print(f"test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, test macro-f1: {f1_score(y_test, y_test_pred, average='macro'):.3f}")

train accuracy: 0.260, train macro-f1: 0.116
test accuracy: 0.259, test macro-f1: 0.115


## Векторизация предобработанных данных

In [97]:
X_train_preprocessed_text_vectors = encode_sentences([" ".join(t) for t in X_train_texts_preprocessed])
X_test_preprocessed_text_vectors = encode_sentences([" ".join(t) for t in X_test_texts_preprocessed])

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

### SVM для предобработанных данных

In [98]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train_preprocessed_text_vectors, y_train)

In [99]:
y_train_pred = svm.predict(X_train_preprocessed_text_vectors)
y_test_pred = svm.predict(X_test_preprocessed_text_vectors)

print(f"train accuracy: {accuracy_score(y_train, y_train_pred):.3f}, train macro-f1: {f1_score(y_train, y_train_pred, average='macro'):.3f}")
print(f"test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, test macro-f1: {f1_score(y_test, y_test_pred, average='macro'):.3f}")

train accuracy: 0.236, train macro-f1: 0.104
test accuracy: 0.232, test macro-f1: 0.101


Использование готовой multilanguage модели из `sentence_transformers` без каких-либо файн-тюнов -- плохая идея для дальнейшей классификации текстов.

Векторизация сырых текстов работает лучше для `sentence_transformers`, но это в целом было ожидаемо, поскольку такие модели обучаются в основном на сырых данных, а предобработка (в частности нормализация слов) удаляет контекст и связь между словами.

# HuggingFace models

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda")

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Sentences we want sentence embeddings for
sentences = ['Привет! Как твои дела?',
             'А правда, что 42 твое любимое число?']

# Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru").to(device)

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [2]:
model.__dir__()

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_k

In [7]:
model.config.hidden_size

1024

In [122]:
sentence_embeddings[0].shape

torch.Size([1024])

## Векторизация сырых данных

In [124]:
def encode_sentences(sentences):
    ret = []
    for text in tqdm(sentences):
        encoded_input = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)

        sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])[0].cpu().numpy()
        ret.append(sentence_embedding)
    return ret

X_train_raw_text_vectors = encode_sentences(X_train_texts)
X_test_raw_text_vectors = encode_sentences(X_test_texts)

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

### SVM для сырых данных

In [126]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train_raw_text_vectors, y_train)

In [127]:
y_train_pred = svm.predict(X_train_raw_text_vectors)
y_test_pred = svm.predict(X_test_raw_text_vectors)

print(f"train accuracy: {accuracy_score(y_train, y_train_pred):.3f}, train macro-f1: {f1_score(y_train, y_train_pred, average='macro'):.3f}")
print(f"test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, test macro-f1: {f1_score(y_test, y_test_pred, average='macro'):.3f}")

train accuracy: 0.872, train macro-f1: 0.814
test accuracy: 0.849, test macro-f1: 0.779


## Векторизация предобработанных данных

In [128]:
X_train_preprocessed_text_vectors = encode_sentences([" ".join(t) for t in X_train_texts_preprocessed])
X_test_preprocessed_text_vectors = encode_sentences([" ".join(t) for t in X_test_texts_preprocessed])

  0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

### SVM для предобработанных данных

In [129]:
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train_preprocessed_text_vectors, y_train)

In [130]:
y_train_pred = svm.predict(X_train_preprocessed_text_vectors)
y_test_pred = svm.predict(X_test_preprocessed_text_vectors)

print(f"train accuracy: {accuracy_score(y_train, y_train_pred):.3f}, train macro-f1: {f1_score(y_train, y_train_pred, average='macro'):.3f}")
print(f"test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, test macro-f1: {f1_score(y_test, y_test_pred, average='macro'):.3f}")

train accuracy: 0.842, train macro-f1: 0.765
test accuracy: 0.812, test macro-f1: 0.738


В случае использования `large` модели `sbert_large_nlu_ru`, которая предназначена для русского языка, удалось хорошо повысить метрики по сравнению с векторизацией FastText: accuracy `0.781` -> `0.812`, f1 score `0.567` -> `0.738`.

И тут подтверждается факт, что лучше использовать трансформеры на сырых данных, поскольку модельный токенайзер предобучен на сырых текстах.