In [1]:
import os
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Load the data

In [2]:
# Instantiate the sentence-embedding model that later cells pass to `encode_texts`.
# NOTE(review): trust_remote_code=True presumably lets custom Python code shipped
# with the model repository run locally — confirm the repository is trusted
# (and consider pinning a revision) before re-running this cell.
model = SentenceTransformer("nvidia/llama-embed-nemotron-8b", trust_remote_code=True)

Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

In [3]:
# --- Notebook configuration -------------------------------------------------
# Input CSV files, given as paths relative to the current working directory.
TRAIN_CSV = "../data/raw.csv"
TEST_CSV = "../data/test.csv"

# Dataset column names: LABEL_COL is the column that gets label-encoded below;
# TEXT_COL is presumably the column holding the raw texts to embed.
TEXT_COL = "text"
LABEL_COL = "author"

# Epoch count; not referenced by any of the cells visible in this section.
NUM_EPOCHS = 3

In [4]:
# Read both CSV files through the same `load_dataset` call. Each file is
# registered under the single split name "data", which is then selected,
# so `ds_train` / `ds_test` are the datasets themselves rather than split dicts.
# The train file is loaded first, then the test file.
ds_train, ds_test = (
    load_dataset("csv", data_files={"data": csv_path})["data"]
    for csv_path in (TRAIN_CSV, TEST_CSV)
)

In [5]:
# Fit the label encoder on the training labels.
labels = ds_train[LABEL_COL]
le = LabelEncoder()
le.fit(labels)

# Build the class -> integer id lookup once. `le.classes_` is sorted, and the
# position of a class in it is exactly the id `le.transform` would return, so a
# plain dict lookup replaces one `le.transform([...])` call per row.
# `.tolist()` converts NumPy scalars to native Python values, which keeps the
# printed mapping free of `np.str_(...)` wrappers.
label_to_id = {cls: idx for idx, cls in enumerate(le.classes_.tolist())}

# NOTE: this overwrites the label column of `ds_train` with integer ids, so
# re-running this cell fits the encoder on ids instead of the original labels.
ds_train = ds_train.map(lambda row: {LABEL_COL: label_to_id[row[LABEL_COL]]})
print("Label mapping:", label_to_id)

Label mapping: {np.str_('EAP'): 0, np.str_('HPL'): 1, np.str_('MWS'): 2}


In [6]:
import numpy as np
from itertools import batched
from tqdm.auto import tqdm

def encode_texts(
    texts,
    model,
    batch_size=256,
    show_tqdm=True,
    desc="Encoding texts"
):
    """
    Кодирует список текстов батчами и возвращает numpy-массив эмбеддингов.

    Args:
        texts (list[str]): список текстов
        model: модель с методом encode
        batch_size (int): размер батча
        show_tqdm (bool): показывать прогресс-бар
        desc (str): описание для tqdm

    Returns:
        np.ndarray: shape (len(texts), embedding_dim)
    """
    all_embeddings = []

    num_batches = (len(texts) + batch_size - 1) // batch_size

    iterator = batched(texts, batch_size)
    if show_tqdm:
        iterator = tqdm(
            iterator,
            total=num_batches,
            desc=desc,
            unit="batch"
        )

    for batch in iterator:
        emb = model.encode(
            list(batch),
            convert_to_numpy=True,
            show_progress_bar=False
        )
        all_embeddings.append(emb)

    return np.vstack(all_embeddings)


In [7]:
# Embed the training texts. The column is selected through TEXT_COL so that the
# config cell stays the single place where the text column name is defined.
embs_train = encode_texts(ds_train[TEXT_COL], model)

Encoding texts:   0%|          | 0/77 [00:00<?, ?batch/s]

In [8]:
# Embed the test texts. The column is selected through TEXT_COL so that the
# config cell stays the single place where the text column name is defined.
embs_test = encode_texts(ds_test[TEXT_COL], model)

Encoding texts:   0%|          | 0/33 [00:00<?, ?batch/s]

In [9]:
# Write the training embeddings to a .npy file in the current working directory.
np.save(file="embs_train.npy", arr=embs_train)

In [10]:
# Write the test embeddings to a .npy file in the current working directory.
np.save(file="embs_test.npy", arr=embs_test)