In [1]:
from pathlib import Path

import numpy as np
import polars as pl
import torch
from transformers.pipelines import AutoTokenizer, AutoModel

In [2]:
# Report the number of visible CUDA devices, then choose a device index:
# 0 (first GPU) when CUDA is usable, -1 otherwise.
print(f"Доступно GPU: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    device = 0
else:
    device = -1
device

Доступно GPU: 1


0

In [3]:
# Load the ModernBERT encoder; device placement is delegated to device_map="auto".
# NOTE(review): no torch_dtype is passed here; FlashAttention-2 kernels normally
# expect fp16/bf16 inputs, which presumably come from the autocast context used
# at inference time — confirm.
model = AutoModel.from_pretrained(
    "answerdotai/ModernBERT-base",
    attn_implementation="flash_attention_2",  # faster attention kernels
    device_map="auto",
)
model = model.eval()  # switch to evaluation mode; the model is only used for inference

# Matching fast tokenizer from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", use_fast=True)

def batch_embed(texts, batch_size=32):
    """Embed texts with the module-level ``model``/``tokenizer`` using masked mean pooling.

    Texts are processed in consecutive chunks of ``batch_size``. Each chunk is
    tokenized (padded to its longest sequence, truncated to 8192 tokens), run
    through ``model`` under CUDA autocast, and the last hidden states are
    averaged over the non-padding tokens.

    Args:
        texts: Iterable of strings to embed. A single ``str`` is treated as
            one text rather than as a sequence of characters.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        ``numpy.ndarray`` of dtype float16 with shape ``(len(texts), hidden)``,
        rows in the same order as ``texts``. ``hidden`` is whatever width the
        model produces (the original code assumed 768).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise the input so slicing and len() work for any iterable.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # torch.cat() raises on an empty list, so return an empty matrix directly.
        return np.empty((0, model.config.hidden_size), dtype=np.float16)

    embeddings = []

    with torch.inference_mode():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding="longest",
                truncation=True,
                max_length=8192,
                return_tensors="pt"
            ).to(model.device)

            with torch.amp.autocast("cuda"):
                outputs = model(**inputs)

            # Pool in float32: summing thousands of half-precision token vectors
            # can overflow or lose precision.
            hidden = outputs.last_hidden_state.float()
            mask = inputs.attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1e-9)
            pooled = token_sum / token_count

            embeddings.append(pooled.half().cpu())  # stored as float16 to halve memory
            # Batches have varying padded lengths; release cached blocks between them.
            torch.cuda.empty_cache()

    # Concatenation already yields (len(texts), hidden); no fixed-width reshape needed.
    result = torch.cat(embeddings).numpy()
    torch.cuda.empty_cache()
    return result

In [None]:
# Draw a reproducible random sample of articles to embed.
SAMPLE_SIZE = 100_000  # target number of articles
SAMPLE_SEED = 42       # fixed seed so the saved embeddings can be regenerated

lazy_df = pl.scan_parquet("../../data/preprocessed/articles.parquet")
total = lazy_df.select(pl.len()).collect().item()

# Shuffle the row indices with a fixed seed and keep the rows whose shuffled
# index falls below SAMPLE_SIZE. This selects exactly min(SAMPLE_SIZE, total)
# rows, needs no per-row Python callback, and is well-defined for an empty file.
sampled_lazy_df = (
    lazy_df
    .filter(pl.int_range(pl.len()).shuffle(seed=SAMPLE_SEED) < SAMPLE_SIZE)
    .select(["datetime", "title", "source", "assets", "url", "text"])
)

data = sampled_lazy_df.collect().to_pandas()
assert len(data) == min(SAMPLE_SIZE, total), "unexpected sample size"

texts = data.text.to_list()

In [5]:
%%time
# Embed all sampled article texts in batches of 32; this is the expensive step
# (the recorded run below took roughly 2h37m wall time).
embeddings = batch_embed(texts, batch_size=32)

W0427 01:29:33.762000 126851 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


CPU times: user 2h 41min 47s, sys: 46.4 s, total: 2h 42min 33s
Wall time: 2h 36min 57s


In [None]:
# Persist the embeddings and their metadata side by side: row i of the .npy
# matrix corresponds to row i of the CSV.
assert len(embeddings) == len(data), "embeddings and metadata are misaligned"

output_dir = Path("../aspects/data")
# Create the target folder up front so a multi-hour run is not lost to a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)

np.save(output_dir / "embeddings_large.npy", embeddings)
data.to_csv(output_dir / "data_large.csv", index=False)

In [7]:
# Ask PyTorch to release its cached, currently unused GPU memory now that embedding is finished.
torch.cuda.empty_cache()