<a href="https://colab.research.google.com/github/f-player/RAG/blob/main/No_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install -U transformers accelerate torch datasets evaluate sacrebleu rouge-score sentence-transformers


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m447.9 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [3]:
import re, math, pandas as pd, numpy as np
from pathlib import Path
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate

# Метрики
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

# Простые метрики: нормализация текста, EM, токен-F1
def normalize_text(s: str) -> str:
    if s is None: return ""
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s\-–—.,?!:;()«»\"'`]", "", s, flags=re.UNICODE)
    return s

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_token(pred, gold):
    p_tokens = normalize_text(pred).split()
    g_tokens = normalize_text(gold).split()
    if len(p_tokens) == 0 and len(g_tokens) == 0:
        return 1.0
    if len(p_tokens) == 0 or len(g_tokens) == 0:
        return 0.0
    common = {}
    for t in g_tokens:
        common[t] = common.get(t,0)+1
    num_same = 0
    for t in p_tokens:
        if common.get(t,0) > 0:
            num_same += 1
            common[t] -= 1
    if num_same == 0:
        return 0.0
    precision = num_same / len(p_tokens)
    recall    = num_same / len(g_tokens)
    return 2 * precision * recall / (precision + recall)

# Семантическая близость через sentence-transformers (мультиязычная)
from sentence_transformers import SentenceTransformer, util
sem_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def cosine_sim(a, b):
    emb = sem_model.encode([a, b], convert_to_tensor=True, normalize_embeddings=True)
    return float(util.cos_sim(emb[0], emb[1]).item())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
from google.colab import files
uploaded = files.upload()  # выберите ваш файл .csv
csv_path = list(uploaded.keys())[0]  # берем первое загруженное имя
print("Загружен:", csv_path)


Saving qa_only.csv to qa_only (1).csv
Загружен: qa_only (1).csv


In [8]:
df = pd.read_csv(csv_path)
cols_lower = {c.lower(): c for c in df.columns}

# авто-поиск названий
cands_q = ["question","q","prompt","query","question_text","вопрос"]
cands_a = ["answer","a","completion","response","target","gold","ответ"]

QUESTION_COL = next((cols_lower[c] for c in cands_q if c in cols_lower), None)
ANSWER_COL   = next((cols_lower[c] for c in cands_a if c in cols_lower), None)

if QUESTION_COL is None or ANSWER_COL is None:
    print("Колонки обнаружены не полностью.")
    print("Имеющиеся:", list(df.columns))
    # 👇 при необходимости — выставьте вручную:
    # QUESTION_COL = "question"
    # ANSWER_COL = "answer"
    raise ValueError("Не найдено имя колонки вопроса или ответа. Переименуйте вручную в коде.")

df = df[[QUESTION_COL, ANSWER_COL]].rename(columns={QUESTION_COL:"question", ANSWER_COL:"answer"}).dropna().reset_index(drop=True)
print("Строк:", len(df))
df.head(3)


Строк: 1000


Unnamed: 0,question,answer
0,How does Freud's psychoanalytic theory explain...,Freud believed that human behaviour is driven ...
1,How does behaviourism differ from psychoanalyt...,behaviourism suggests that behaviour is learne...
2,What is the main criticism of Carl Rogers' per...,Critics argue that it places too much emphasis...


In [9]:
MODEL_NAME = "google/flan-t5-base"  # варианты: "google/mt5-base", "google/flan-t5-large" (дороже)

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model.eval()


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [10]:
def build_prompt(q: str) -> str:
    # Подходящий для FLAN-T5 формат
    return f"Answer the question concisely:\n\nQuestion: {q}\nAnswer:"

GEN_KW = dict(
    max_new_tokens=64,
    do_sample=False,        # детерминированно; можно True + temperature для разнообразия
    temperature=0.0,
    num_beams=1
)

preds = []
for q in tqdm(df["question"].tolist(), desc="Generating"):
    prompt = build_prompt(q)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, **GEN_KW)
    text = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    preds.append(text)

df["pred"] = preds
df.head(5)


Generating:   0%|          | 0/1000 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

Unnamed: 0,question,answer,pred
0,How does Freud's psychoanalytic theory explain...,Freud believed that human behaviour is driven ...,Freud's psychoanalytic theory
1,How does behaviourism differ from psychoanalyt...,behaviourism suggests that behaviour is learne...,The behaviourism theory is based on the belief...
2,What is the main criticism of Carl Rogers' per...,Critics argue that it places too much emphasis...,The treatment of people with mental illness is...
3,What is the main focus of cognitive psychology?,The main focus of cognitive psychology is on h...,cognitive psychology
4,What is the main difference between classical ...,Classical conditioning involves associating a ...,The classical conditioning is a system of cond...


In [11]:
# Построчные метрики
df["em"]   = [exact_match(p, g) for p, g in zip(df["pred"], df["answer"])]
df["f1"]   = [f1_token(p, g) for p, g in zip(df["pred"], df["answer"])]
df["cos"]  = [cosine_sim(p, g) for p, g in zip(df["pred"], df["answer"])]

# Коллективные метрики (ROUGE-L, BLEU)
rouge_res = rouge.compute(predictions=df["pred"].tolist(), references=df["answer"].tolist(), use_stemmer=True)
bleu_res  = bleu.compute(predictions=df["pred"].tolist(), references=[[a] for a in df["answer"].tolist()])

summary = {
    "EM_mean": float(np.mean(df["em"])),
    "F1_mean": float(np.mean(df["f1"])),
    "Cosine_mean": float(np.mean(df["cos"])),
    "ROUGE1": rouge_res.get("rouge1", 0.0),
    "ROUGE2": rouge_res.get("rouge2", 0.0),
    "ROUGEL": rouge_res.get("rougeL", 0.0),
    "ROUGELsum": rouge_res.get("rougeLsum", 0.0),
    "BLEU": bleu_res.get("score", 0.0)
}
summary


{'EM_mean': 0.0,
 'F1_mean': 0.16211615685223257,
 'Cosine_mean': 0.5550396973630414,
 'ROUGE1': np.float64(0.1803153326203064),
 'ROUGE2': np.float64(0.06615725768606562),
 'ROUGEL': np.float64(0.16238890841721532),
 'ROUGELsum': np.float64(0.16258226337293014),
 'BLEU': 1.128433178074348}

In [12]:
out_csv = "qa_predictions_with_metrics.csv"
df.to_csv(out_csv, index=False)
print("Сохранено:", out_csv)
df.head(10)


Сохранено: qa_predictions_with_metrics.csv


Unnamed: 0,question,answer,pred,em,f1,cos
0,How does Freud's psychoanalytic theory explain...,Freud believed that human behaviour is driven ...,Freud's psychoanalytic theory,0.0,0.0,0.608734
1,How does behaviourism differ from psychoanalyt...,behaviourism suggests that behaviour is learne...,The behaviourism theory is based on the belief...,0.0,0.285714,0.568556
2,What is the main criticism of Carl Rogers' per...,Critics argue that it places too much emphasis...,The treatment of people with mental illness is...,0.0,0.066667,0.546354
3,What is the main focus of cognitive psychology?,The main focus of cognitive psychology is on h...,cognitive psychology,0.0,0.2,0.661096
4,What is the main difference between classical ...,Classical conditioning involves associating a ...,The classical conditioning is a system of cond...,0.0,0.196078,0.659212
5,What is the primary focus of behaviourism?,The primary focus of behaviourism is on observ...,behaviour,0.0,0.0,0.470066
6,What is the primary focus of humanistic psycho...,The primary focus of humanistic psychology is ...,humanistic psychology,0.0,0.2,0.714955
7,How does Maslow's hierarchy of needs relate to...,Maslow's hierarchy of needs suggests that peop...,He argues that the motivational hierarchy of n...,0.0,0.217391,0.743517
8,What is the main criticism of the biological a...,Critics argue that it reduces complex human be...,The biological approach to psychology is a slu...,0.0,0.266667,0.490663
9,What is the main difference between nature and...,Nature refers to genetic or innate factors tha...,Nature is a living organism that is a living b...,0.0,0.125,0.57096


In [13]:
def aggregate_score(summary,
                    w_em=0.1, w_f1=0.2, w_rouge=0.2, w_bleu=0.2, w_cos=0.3):
    """
    Собирает единую метрику из summary-словаря.
    Можно менять веса w_em ... w_cos.
    """
    # Нормализуем BLEU: в sacrebleu обычно от 0 до 100, переведём к [0,1]
    bleu_norm = summary["BLEU"] / 100 if summary["BLEU"] > 1 else summary["BLEU"]

    # Берём ROUGE-L (или ROUGELsum) как главный
    rouge_val = summary.get("ROUGEL", summary.get("ROUGELsum", 0.0))

    score = (
        w_em * summary["EM_mean"] +
        w_f1 * summary["F1_mean"] +
        w_rouge * rouge_val +
        w_bleu * bleu_norm +
        w_cos * summary["Cosine_mean"]
    )

    return score

final_score = aggregate_score(summary)
print(f"Интегральная метрика качества: {final_score:.3f}")


Интегральная метрика качества: 0.234
