In [20]:
from transformers import T5Tokenizer,T5ForConditionalGeneration
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained("t5-small")
rl_model=T5ForConditionalGeneration.from_pretrained('./DI_FT_Alpaca_modifyBLEU/30e/model')
rl_model.to(device)
rl_model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [21]:
def optimize_one_text(u_model,one_text):
    inputs = tokenizer(one_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = u_model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=50,
        early_stopping=True)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

In [22]:
import json

with open('./result_claude_haiku.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

original_prompts = [item['instruction'] for item in data]

# average_len=0
# for i in range(len(original_prompts)):
#     average_len=average_len+len(original_prompts[i])
# print(average_len/len(original_prompts))

claude_prompts=[item['output'] for item in data]
print(len(original_prompts),len(claude_prompts))

9000 9000


In [23]:
from tqdm import tqdm
rl_generated_caption=[]

len_generation=len(claude_prompts)
for i in tqdm(range(len_generation)):
    rl_generated_caption.append(optimize_one_text(rl_model,claude_prompts[i]))

  0%|          | 5/9000 [00:00<17:34,  8.53it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 9000/9000 [18:34<00:00,  8.08it/s]


In [33]:
import pandas as pd
compare_promts={
    'reference_caption': original_prompts,
    'generated_caption': rl_generated_caption,
}
compare_promts = pd.DataFrame(compare_promts)
compare_promts
# compare_promts.to_parquet(f'./rl_other_generative_model_test.parquet')

Unnamed: 0,reference_caption,generated_caption
0,Give three tips for staying healthy.,enquiry: Write three tips for staying healthy.
1,Give three tips for staying healthy.,enquiry: Generate three tips for staying healthy.
2,Give three tips for staying healthy.,enquiry: Write 3 tips for staying healthy.
3,What are the three primary colors?,enquiry: List the three primary colors.
4,What are the three primary colors?,enquiry: What are the three primary colors?
...,...,...
8995,Name a job that requires a lot of physical str...,enquiry: What types of jobs typically require ...
8996,Name a job that requires a lot of physical str...,enquiry: Provide examples of jobs that require...
8997,Tell me a riddle.,enquiry: Create a riddle that follows the give...
8998,Tell me a riddle.,enquiry: Create a riddle that follows the give...


In [28]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    '''
    Input format:

    predictions = [
        "What is the capital of France?",
        "Who wrote the book?",
        "What is the largest planet?"
    ]

    references = [
        ["What is the capital city of France?"],
        ["Who is the author of the book?"],
        ["Which planet is the largest in the solar system?"]
    ]
    '''

    # # 加载 BLEU 评分器
    # bleu_metric = load_metric("bleu")

    # # 计算 BLEU 分数
    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(refs[0])] for refs in references]
    # B_S = {}
    # for n in range(1, 5):
    #     bleu_metric.add_batch(predictions=predictions_tokenized, references=references_tokenized)
    #     results = bleu_metric.compute(max_order=n)
    #     B_S[f"BLEU-{n}"] = results
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for i,n in enumerate(B_S['precisions']):
        print(f"BLEU-{i+1} score: {n:.5f}")
        


    # 加载 ROUGE 评分器
    rouge_metric = load_metric("rouge")
    '''
    ROUGE-1: 衡量生成文本和参考文本之间的 unigram 匹配。
    ROUGE-2: 衡量生成文本和参考文本之间的 bigram 匹配。
    ROUGE-L: 衡量生成文本和参考文本之间的最长公共子序列(LCS)。
    ROUGE-Lsum: 基于 LCS 的一个变体，专门用于长文本的评估。
    '''
    # 计算 ROUGE 分数
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    rouge1_mid_f1 = rouge_results['rouge1'][1][2]
    rouge2_mid_f1 = rouge_results['rouge2'][1][2]
    rougeL_mid_f1 = rouge_results['rougeL'][1][2]
    rougeLsum_mid_f1 = rouge_results['rougeLsum'][1][2]
    print(f"ROUGE-1 F1 score: {rouge1_mid_f1:.5f}")
    print(f"ROUGE-2 F1 score: {rouge2_mid_f1:.5f}")
    print(f"ROUGE-L F1 score: {rougeL_mid_f1:.5f}")
    print(f"ROUGE-Lsum F1 score: {rougeLsum_mid_f1:.5f}")

    # 计算 METEOR 分数
    meteor_scores = [meteor_score(references=refs, hypothesis=pred) for pred, refs in zip(predictions_tokenized, references_tokenized)]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # 计算 BERTScore 分数
    '''
    同样效果：
    bert_metric = load_metric("bertscore",cache_dir="/media/fenghe/New Volume/A2Q/Metric")
    bert_results = bert_metric.compute(predictions=predictions, references=references,lang="en",device=f"cuda:{torch.cuda.device_count() - 1}")

    设置 verbose=True 会使函数在计算过程中输出更多的信息，例如处理进度、当前正在处理的数据等。
    '''
    P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [31]:
res = Calmetic(references=original_prompts,predictions=rl_generated_caption)

BLEU-1 score: 0.40871
BLEU-2 score: 0.28403
BLEU-3 score: 0.22146
BLEU-4 score: 0.17463


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


ROUGE-1 F1 score: 0.56393
ROUGE-2 F1 score: 0.42253
ROUGE-L F1 score: 0.54331
ROUGE-Lsum F1 score: 0.54361
Average METEOR score: 0.00341


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.81366


In [32]:
# 两个重要数值
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])

{'bleu': 0.2588499938289279, 'precisions': [0.40871474848174555, 0.2840296150454275, 0.22145734536739584, 0.17462990247667157], 'brevity_penalty': 1.0, 'length_ratio': 1.640070978167876, 'translation_length': 155277, 'reference_length': 94677}
Score(precision=np.float64(0.49265447647976646), recall=np.float64(0.6506093448006257), fmeasure=np.float64(0.5436050149995293))


In [34]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased") 

reference_texts_ = [ i.replace('enquiry: ',"") for i in rl_generated_caption ]
embeddings1 = sentence_model.encode(original_prompts, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)   #[52002,52002]维度的矩阵，对角线上的值为对应文本的余弦相似度

# 输出余弦相似度的值
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")

Average Cosine Similarity: 0.7393010854721069
Biggest Cosine Similarity: 1.0000007152557373
Middle Cosine Similarity: 0.8318278789520264
