In [2]:
import pandas as pd
import json
import os
import argparse
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import numpy as np

def load_json_file(file_path):
    """Read a UTF-8 JSON file and return the decoded Python object.

    Any exception raised while opening or parsing the file is reported on
    stdout and ``None`` is returned instead of propagating the error.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except Exception as err:
        print(f"Error loading {file_path}: {str(err)}")
        return None
    return parsed

def extract_video_name(path):
    """Return the final path component of ``path`` with its last extension removed."""
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem

In [13]:
prediction_path = 'output/batch_video_results.json'
references_path = 'datasets/video_prompts.json'
#output_path = 'output/batch_video_results.csv'


# Load JSON files
predictions_data = load_json_file(prediction_path)
references_data = load_json_file(references_path)

# load_json_file returns None when a file cannot be read or parsed. Stop the
# cell here instead of printing "Exiting" and then failing further down with
# an unrelated TypeError while iterating over None.
if predictions_data is None or references_data is None:
    raise RuntimeError("Failed to load one or both JSON files. Exiting.")

# Create a dictionary of reference prompts keyed by video name.
# Adapt this according to your reference JSON structure: entries that are not
# dicts or lack 'video'/'conversations' are skipped, and the reference text is
# taken from the second conversation turn.
# NOTE(review): keys are item['video'] verbatim; if those are paths or carry an
# extension while pred['video_name'] does not, lookups below fall back to ''.
# extract_video_name() exists for that purpose - confirm against the data.
reference_dict = {
    item['video']: item['conversations'][1]['value']
    for item in references_data
    if isinstance(item, dict) and 'video' in item and 'conversations' in item
}

# One row per prediction; a missing reference or prediction becomes ''.
comparison_data = [
    {
        'video_name': pred['video_name'],
        'reference': reference_dict.get(pred['video_name'], ''),
        'prediction': pred.get('prediction', ''),
    }
    for pred in predictions_data
]

# Convert to DataFrame
df = pd.DataFrame(comparison_data)





In [19]:
# Save the comparison table as Parquet (not CSV), without the index.
# NOTE: the filename spelling ('evalutaion') is the one the reload cell reads back.
df.to_parquet('output/compare_evalutaion_results.parquet', index=False)


In [1]:
import pandas as pd

# Reload the comparison table from the Parquet file written by the save cell
result_df = pd.read_parquet('output/compare_evalutaion_results.parquet')

In [None]:
# Preview the first rows of the comparison table
result_df.head()

In [None]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    '''
    Compute and print BLEU, ROUGE, METEOR and BERTScore for a set of predictions.

    Input format:

    predictions = [
        "What is the capital of France?",
        "Who wrote the book?",
        "What is the largest planet?"
    ]

    references = [
        ["What is the capital city of France?"],
        ["Who is the author of the book?"],
        ["Which planet is the largest in the solar system?"]
    ]

    Args:
        references: one non-empty list of reference strings per prediction.
            BLEU receives the full lists; ROUGE, METEOR and BERTScore use
            only the first reference of each list.
        predictions: the generated strings, aligned with ``references``.

    Returns:
        dict with keys
        "BLEU"      - result dict of the ``evaluate`` BLEU metric,
        "ROUGE"     - result of the ``datasets`` ROUGE metric,
        "METEOR"    - list of per-sample METEOR scores,
        "METERO"    - same list under the original (misspelled) key, kept so
                      existing callers keep working,
        "BERTScore" - {"Precision", "Recall", "F1"} as returned by
                      ``bert_score.score``.

    Raises:
        ValueError: if the inputs are empty, differ in length, or any
            prediction has no reference.
    '''
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # METEOR average or a silent truncation by zip().
    if len(references) != len(predictions):
        raise ValueError(
            f"references and predictions must have the same length "
            f"(got {len(references)} and {len(predictions)})"
        )
    if len(predictions) == 0:
        raise ValueError("references and predictions must not be empty")
    if any(len(refs) == 0 for refs in references):
        raise ValueError("every prediction needs at least one reference")

    # ROUGE, METEOR and BERTScore below all score against the first reference.
    first_references = [refs[0] for refs in references]

    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(ref)] for ref in first_references]

    # BLEU: 'precisions' holds the modified n-gram precisions, which are not
    # BLEU-n scores, so they are labelled as precisions; the combined score is
    # printed separately.
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for i,n in enumerate(B_S['precisions']):
        print(f"BLEU {i+1}-gram precision: {n:.5f}")
    print(f"BLEU score: {B_S['bleu']:.5f}")

    # Load the ROUGE scorer.
    # ROUGE-1: unigram overlap between generated and reference text.
    # ROUGE-2: bigram overlap between generated and reference text.
    # ROUGE-L: longest common subsequence (LCS) between the two texts.
    # ROUGE-Lsum: an LCS-based variant intended for long, multi-sentence text.
    # NOTE(review): load_metric is deprecated in recent `datasets` releases.
    # Switching to evaluate.load("rouge") changes the result layout, so the
    # [1][2] indexing here and in callers would need to change with it.
    rouge_metric = load_metric("rouge")

    # Pass one reference string per prediction, the same flat form BERTScore
    # receives below, rather than the nested lists.
    rouge_results = rouge_metric.compute(predictions=predictions, references=first_references)
    # Index [1] selects the "mid" aggregate and [2] its F1 value.
    rouge_labels = (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
        ("rougeLsum", "ROUGE-Lsum"),
    )
    for rouge_key, rouge_label in rouge_labels:
        print(f"{rouge_label} F1 score: {rouge_results[rouge_key][1][2]:.5f}")

    # METEOR: scored per sample on tokenized text, then averaged.
    meteor_scores = [
        meteor_score(references=refs, hypothesis=pred)
        for pred, refs in zip(predictions_tokenized, references_tokenized)
    ]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # BERTScore
    P, R, F1 = score(predictions, first_references, lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METEOR":meteor_scores,
        "METERO":meteor_scores,  # legacy misspelled key, kept for backward compatibility
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [6]:
# Wrap every reference string in its own single-element list
refs = [ [i] for i in result_df['reference']]
# Prediction column as a pandas Series
content = result_df['prediction']

In [9]:
import re

def clean_sentence(s):
    """Normalise a prompt string and return the cleaned copy.

    Full-width commas become ASCII commas, flag-style parameters such as
    "-fps 24" are removed, and surrounding whitespace is stripped.
    """
    # Replace Chinese (full-width) commas
    normalised = s.replace("，", ",")
    # Remove parameters of the form "-fps 24"
    without_flags = re.sub(r"-[a-z]+\s*\d+", "", normalised)
    return without_flags.strip()

# Clean both text columns; each reference is wrapped in a single-element list
predictions = list(map(clean_sentence, result_df['prediction']))
references = [[cleaned_ref] for cleaned_ref in map(clean_sentence, result_df['reference'])]


In [None]:
# Run every metric in Calmetic on the current references/predictions lists
res = Calmetic(references=references,predictions=predictions)

In [None]:
# The two headline values: the BLEU result and the ROUGE-Lsum entry at index 1
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased") 

# Encode the cleaned references. `predictions` holds text cleaned with
# clean_sentence, so the references get the same normalisation; previously
# reference_texts_ was built but the raw column was encoded instead.
reference_texts_ = [ clean_sentence(i) for i in result_df['reference'] ]
embeddings1 = sentence_model.encode(predictions, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

# Full prediction-by-reference similarity matrix; the diagonal holds the
# similarity of each prediction with its own reference.
cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print summary statistics of the pairwise cosine similarities
paired_cosine_scores = cosine_scores_2.diagonal()
print(f"Average Cosine Similarity: {paired_cosine_scores.mean()}")
print(f"Biggest Cosine Similarity: {paired_cosine_scores.max()}")
print(f"Middle Cosine Similarity: {paired_cosine_scores.median()}")


In [None]:
import matplotlib.pyplot as plt

# Per-pair cosine similarities are the diagonal of the similarity matrix
cos_sim_scores = cosine_scores_2.diagonal()

# Draw the histogram
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cos_sim_scores.cpu().numpy(), bins=50, edgecolor='black')
ax.set_title('Distribution of Cosine Similarity Scores')
ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Store the per-pair cosine similarities as a new column (modifies result_df in place)
result_df['cosine_similarity'] = cos_sim_scores.cpu().numpy().tolist()
result_df.head()

In [38]:
# Persist the table including the cosine_similarity column, without the index
result_df.to_parquet('output/result_df_cosine_similarity.parquet', index=False)

In [4]:
import pandas as pd
# Reload the table with cosine similarities from disk
result_df = pd.read_parquet('output/result_df_cosine_similarity.parquet')

In [None]:
# Select the rows of result_df whose video_name appears in the test split
# (datasets/test_prompts_20percent.json) and collect them in a new DataFrame.
import json
import matplotlib.pyplot as plt

# Read test prompts json file. The encoding is pinned to UTF-8, as in
# load_json_file, so the result does not depend on the platform default.
with open('datasets/test_prompts_20percent.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Get list of video names from test set
test_video_names = [item['video'] for item in test_data]

# Filter result_df to only include videos from test set. The copy makes
# test_result_df independent of result_df, so later in-place changes to it
# do not operate on a slice.
test_result_df = result_df[result_df['video_name'].isin(test_video_names)].copy()

# Print statistics about cosine similarity scores for test set
test_cos_scores = test_result_df['cosine_similarity']
print(f"Test Set Statistics:")
print(f"Average Cosine Similarity: {test_cos_scores.mean():.4f}")
print(f"Max Cosine Similarity: {test_cos_scores.max():.4f}") 
print(f"Median Cosine Similarity: {test_cos_scores.median():.4f}")

# Plot distribution of cosine similarities for test set
plt.figure(figsize=(10,6))
plt.hist(test_cos_scores, bins=50, edgecolor='black')
plt.title('Distribution of Cosine Similarity Scores (Test Set)')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.show()

In [41]:
# Persist the test-set subset, without the index
test_result_df.to_parquet('output/test_result_df_cosine_similarity.parquet', index=False)

In [49]:
# Replace the index left over from filtering with a fresh 0..n-1 index (in place)
test_result_df.reset_index(drop=True, inplace=True)

In [7]:
# Reload the test-set subset from disk
test_result_df =pd.read_parquet('output/test_result_df_cosine_similarity.parquet')

In [None]:
# Preview the first rows of the test-set subset
test_result_df.head()

In [None]:
# Display the test-set cosine similarity Series
# NOTE(review): this name is only assigned in the test-set filter cell, so it is stale if test_result_df was reloaded since.
test_cos_scores

In [None]:
# Keep only test-set pairs whose cosine similarity exceeds the threshold
test_sim_threshold = 0.7

# Filter with a boolean mask on test_result_df's own column. The earlier
# version took positions from the separate test_cos_scores Series and used
# them as index labels, which is only correct when test_result_df has a
# fresh 0..n-1 index and test_cos_scores is still in sync with it.
high_test_mask = test_result_df['cosine_similarity'] > test_sim_threshold

# Create a new dataframe with high similarity pairs
high_test_sim_df = (
    test_result_df.loc[
        high_test_mask,
        ['video_name', 'reference', 'prediction', 'cosine_similarity'],
    ]
    .rename(columns={'cosine_similarity': 'similarity_score'})
    .reset_index(drop=True)
)

# Sort by similarity score in descending order
#high_test_sim_df = high_test_sim_df.sort_values('similarity_score', ascending=False)

print(f"Number of pairs with similarity > {test_sim_threshold}: {len(high_test_sim_df)}")
display(high_test_sim_df.head())

In [None]:
import re

def clean_sentence(s):
    """Normalise a prompt string and return the cleaned copy.

    Full-width commas become ASCII commas, flag-style parameters such as
    "-fps 24" are removed, and surrounding whitespace is stripped.
    """
    # Replace Chinese (full-width) commas
    normalised = s.replace("，", ",")
    # Remove parameters of the form "-fps 24"
    without_flags = re.sub(r"-[a-z]+\s*\d+", "", normalised)
    return without_flags.strip()

# Score the high-similarity test pairs; the text is passed through as-is
# (clean_sentence is not applied here).
predictions_test = list(high_test_sim_df['prediction'])
references_test = [[ref_text] for ref_text in high_test_sim_df['reference']]

res_test = Calmetic(predictions=predictions_test, references=references_test)
print(res_test)
print(res_test['BLEU'])
print(res_test['ROUGE']['rougeLsum'][1])


In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased") 

# Pass plain lists of strings to encode(). A pandas Series is looked up by
# index label rather than by position, so the embedding order would depend on
# the DataFrame index.
embeddings1 = sentence_model.encode(high_test_sim_df['prediction'].tolist(), convert_to_tensor=True)
embeddings2 = sentence_model.encode(high_test_sim_df['reference'].tolist(), convert_to_tensor=True)

# Full similarity matrix; the diagonal pairs each prediction with its own reference
cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print summary statistics of the pairwise cosine similarities
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")

In [59]:
# Persist the high-similarity test pairs, without the index.
# NOTE(review): the filename says "over0.5" but the cell that builds
# high_test_sim_df filters with > 0.7 - confirm which threshold is intended.
high_test_sim_df.to_parquet('output/high_test_over0.5_sim_df.parquet', index=False)

In [None]:
# Preview the first rows of the full result table
result_df.head()

In [None]:
# Keep only pairs whose cosine similarity exceeds the threshold
high_sim_threshold = 0.6

# Row positions above the threshold. flatten() always yields a 1-D tensor, so
# tolist() returns a list even when exactly one row matches; squeeze() would
# collapse that case to a bare int and break the indexing below.
high_sim_indices = (cos_sim_scores > high_sim_threshold).nonzero().flatten().tolist()

# Create a new dataframe with high similarity pairs. The indices are positions
# in cos_sim_scores, so rows are selected positionally with iloc.
high_sim_df = pd.DataFrame({
    'video_name': result_df['video_name'].iloc[high_sim_indices].tolist(),
    'reference': result_df['reference'].iloc[high_sim_indices].tolist(),
    'prediction': result_df['prediction'].iloc[high_sim_indices].tolist(),
    'similarity_score': cos_sim_scores[high_sim_indices].cpu().numpy()
})

# Sort by similarity score in descending order
high_sim_df = high_sim_df.sort_values('similarity_score', ascending=False)

# Report the threshold actually applied (the old message said 0.7)
print(f"Number of pairs with similarity > {high_sim_threshold}: {len(high_sim_df)}")
display(high_sim_df.head())

In [26]:
# Persist the high-similarity pairs, without the index
high_sim_df.to_parquet('output/high_sim_over0.6_df.parquet', index=False)

In [None]:
# Display the high-similarity pairs table
high_sim_df

In [None]:
import re

def clean_sentence(s):
    """Normalise a prompt string and return the cleaned copy.

    Full-width commas become ASCII commas, flag-style parameters such as
    "-fps 24" are removed, and surrounding whitespace is stripped.
    """
    # Replace Chinese (full-width) commas
    normalised = s.replace("，", ",")
    # Remove parameters of the form "-fps 24"
    without_flags = re.sub(r"-[a-z]+\s*\d+", "", normalised)
    return without_flags.strip()

# Score the high-similarity pairs; the text is passed through as-is
# (clean_sentence is not applied here).
predictions = list(high_sim_df['prediction'])
references = [[ref_text] for ref_text in high_sim_df['reference']]

res = Calmetic(predictions=predictions, references=references)
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])



In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased") 

# Pass plain lists of strings to encode(). high_sim_df is built with
# sort_values(), so its index labels are no longer in row order; a pandas
# Series is looked up by label, which would make the embedding order follow
# the labels instead of the rows.
embeddings1 = sentence_model.encode(high_sim_df['prediction'].tolist(), convert_to_tensor=True)
embeddings2 = sentence_model.encode(high_sim_df['reference'].tolist(), convert_to_tensor=True)

# Full similarity matrix; the diagonal pairs each prediction with its own reference
cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print summary statistics of the pairwise cosine similarities
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")