In [None]:
!pip install sentence_transformers torch



In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from sentence_transformers import models
from torch.utils.data import DataLoader
import pandas as pd
import random
import torch
import os

import wandb
# Initialise Weights & Biases in "disabled" mode (presumably so training does not log a run).
wandb.init(mode="disabled")

# === Parameters ===
DATA_PATH = "text_dataset_05_09_25.csv"
MODEL_NAME = "intfloat/multilingual-e5-large"
OUTPUT_DIR = "e5_large_05_09_hard_easy_2ep_neg_6_6"
BATCH_SIZE = 32
EPOCHS = 2
HARD_NEGATIVES = 6  # most similar wrong categories paired with each question
EASY_NEGATIVES = 6  # least similar wrong categories paired with each question
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42

# Seed the Python and torch RNGs up front so that shuffling and any other random
# choices are repeatable after Restart Kernel -> Run All.
random.seed(SEED)
torch.manual_seed(SEED)

# === 1. Load the data ===
# Rows without a question or a category are dropped. Both text columns are then
# cast to str and stripped once, here, so that the unique category list below
# holds exactly the same strings that later per-row lookups use (a category with
# stray whitespace would otherwise not be found in `categories`).
df = (
    pd.read_csv(DATA_PATH, sep=";")
    .dropna(subset=["Вопрос", "Категория"])
    .assign(
        **{
            "Вопрос": lambda frame: frame["Вопрос"].astype(str).str.strip(),
            "Категория": lambda frame: frame["Категория"].astype(str).str.strip(),
        }
    )
    # Cells that held only whitespace carry no usable text.
    .loc[lambda frame: (frame["Вопрос"] != "") & (frame["Категория"] != "")]
)

# Distinct category names, in order of first appearance.
categories = list(df["Категория"].unique())

# === 2. Load the model used to search for hard negatives ===
# The category names are embedded with the base checkpoint; the embeddings are
# used below to rank categories by similarity to one another.
# NOTE(review): the E5 model card asks for a "query: " / "passage: " prefix on
# every input; none is added here — confirm this is intentional.
embed_model = SentenceTransformer(MODEL_NAME, device=DEVICE)
cat_embeddings = embed_model.encode(
    categories,
    show_progress_bar=True,
    convert_to_tensor=True,
)

# === 3. Build the training pairs ===
# The negatives depend only on the target category, not on the individual
# question, so the similarity ranking is done once per category instead of once
# per data row.
cat_names = [str(cat).strip() for cat in categories]
# Pairwise cosine similarity between all category embeddings.
cat_sims = util.pytorch_cos_sim(cat_embeddings, cat_embeddings)

negatives_by_cat = {}
for cat_idx, cat_name in enumerate(cat_names):
    if cat_name in negatives_by_cat:
        # Same label seen again with different surrounding whitespace:
        # keep the ranking computed for its first occurrence.
        continue

    order = torch.argsort(cat_sims[cat_idx], descending=True).tolist()
    # Every other category, most similar first. dict.fromkeys removes repeated
    # names while keeping that order.
    ranked = list(dict.fromkeys(cat_names[j] for j in order if cat_names[j] != cat_name))

    # Hard negatives: the HARD_NEGATIVES most similar wrong categories.
    hard_negs = ranked[:HARD_NEGATIVES]
    # Easy negatives: the EASY_NEGATIVES least similar wrong categories. Anything
    # already chosen as a hard negative is skipped, so a small category set does
    # not produce the same negative pair twice.
    easy_negs = [name for name in reversed(ranked) if name not in hard_negs][:EASY_NEGATIVES]

    negatives_by_cat[cat_name] = hard_negs + easy_negs

examples = []
for raw_question, raw_cat in zip(df["Вопрос"], df["Категория"]):
    question = str(raw_question).strip()
    correct_cat = str(raw_cat).strip()

    # Positive pair: the question with its own category.
    examples.append(InputExample(texts=[question, correct_cat], label=1.0))

    # Negative pairs: the same question with each selected wrong category.
    for neg_cat in negatives_by_cat[correct_cat]:
        examples.append(InputExample(texts=[question, neg_cat], label=0.0))

# === 4. Model ===
# A fresh copy of the checkpoint is assembled from a transformer module followed
# by a pooling module (default pooling settings).
word_embedding_model = models.Transformer(MODEL_NAME)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    device=DEVICE,
)

# === 5. DataLoader ===
train_dataloader = DataLoader(examples, batch_size=BATCH_SIZE, shuffle=True)

# === 6. Loss ===
# Pairs carry label 1.0 (correct category) or 0.0 (wrong category).
train_loss = losses.CosineSimilarityLoss(model)

# === 7. Training ===
# Warm-up lasts for 10% of the number of batches in the dataloader.
warmup_steps = int(0.1 * len(train_dataloader))
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_DIR,
    use_amp=True,
)

print(f"Модель сохранена в {OUTPUT_DIR}")

  | |_| | '_ \/ _` / _` |  _/ -_)


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0868
1000,0.0273
1500,0.0235
2000,0.0203
2500,0.0183
3000,0.0179
3500,0.0148
4000,0.0102
4500,0.0104
5000,0.0095


Модель сохранена в e5_large_05_09_hard_easy_2ep_neg_5_5


In [None]:
!zip -r finetuned_model_1.zip e5_large_05_09_hard_easy_2ep_neg_5_5

  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/ (stored 0%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/tokenizer.json (deflated 76%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/model.safetensors (deflated 21%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/sentence_bert_config.json (deflated 9%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/sentencepiece.bpe.model (deflated 49%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/config.json (deflated 49%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/README.md (deflated 63%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/special_tokens_map.json (deflated 85%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/tokenizer_config.json (deflated 76%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/modules.json (deflated 53%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/1_Pooling/ (stored 0%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/1_Pooling/config.json (deflated 58%)
  adding: e5_large_05_09_hard_easy_2ep_neg_5_5/config_sentence_tr

In [None]:
# Mount Google Drive at /content/drive (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')