In [1]:
import os
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Считывание данных

In [2]:
model = SentenceTransformer("nvidia/llama-embed-nemotron-8b", trust_remote_code=True)



Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

In [3]:
TRAIN_CSV = "../data/raw.csv"             # путь к твоему csv
TEXT_COL = "text"
LABEL_COL = "author"
NUM_EPOCHS = 3

In [4]:
ds = load_dataset("csv", data_files={"data": TRAIN_CSV})["data"]

In [5]:
labels = ds[LABEL_COL]
le = LabelEncoder()
le.fit(labels)
ds = ds.map(lambda x: {LABEL_COL: int(le.transform([x[LABEL_COL]])[0])})
print("Label mapping:", {cls: int(idx) for idx, cls in enumerate(le.classes_)})

Label mapping: {np.str_('EAP'): 0, np.str_('HPL'): 1, np.str_('MWS'): 2}


In [6]:
ds['text']

Column(['This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.', 'It never once occurred to me that the fumbling might be a mere mistake.', 'In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.', 'How lovely is spring As we looked from Windsor Terrace on the sixteen fertile counties spread beneath, speckled by happy cottages and wealthier towns, all looked as in former years, heart cheering and fair.', 'Finding nothing else, not even gold, the Superintendent abandoned his attempts; but a perplexed look occasionally steals over his countenance as he sits thinking at his desk.', ...])

In [7]:
from itertools import batched
from tqdm.auto import tqdm
BATCH_SIZE = 256
texts = ds["text"]

all_embeddings = []

num_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE

for batch in tqdm(
    batched(texts, BATCH_SIZE),
    total=num_batches,
    desc="Encoding texts",
    unit="batch"
):
    emb = model.encode(
        list(batch),
        convert_to_numpy=True,
        show_progress_bar=False  # чтобы не было двойного бара
    )
    all_embeddings.append(emb)

embeddings = np.vstack(all_embeddings)
print("Embeddings shape:", embeddings.shape)

Encoding texts:   0%|          | 0/77 [00:00<?, ?batch/s]

Embeddings shape: (19579, 4096)


In [8]:
y = np.asarray(ds["author"], dtype=int)

In [12]:
np.save("embeddings.npy", embeddings)

embeddings = np.load("embeddings.npy")
print(embeddings.shape)

(19579, 4096)


In [1]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, log_loss

cb = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    iterations=250,
    learning_rate=0.05,
    depth=16,
    l2_leaf_reg=3.0,
    random_seed=42,
    verbose=200,
    task_type="CPU",        
    devices="0"
)

cb.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True
)

NameError: name 'X_train' is not defined

In [None]:
proba = cb.predict_proba(X_val)
print("proba shape:", proba.shape)

pred = proba.argmax(axis=1)
print(classification_report(y_val, pred))
print("logloss:", log_loss(y_val, proba))

In [None]:
from sklearn.metrics import f1_score

proba = cb.predict_proba(X_val)          # (N, 3)
y_pred = proba.argmax(axis=1)

f1_macro = f1_score(y_val, y_pred, average="macro")
f1_micro = f1_score(y_val, y_pred, average="micro")
f1_weighted = f1_score(y_val, y_pred, average="weighted")

print("F1 macro   :", f1_macro)
print("F1 micro   :", f1_micro)
print("F1 weighted:", f1_weighted)
