In [None]:
import pandas as pd
import json
import os
import argparse
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import numpy as np

def load_json_file(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path``.

    Returns the decoded Python object. Any exception raised while opening or
    parsing the file is reported on stdout and ``None`` is returned instead.
    """
    parsed = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller check for None.
        print(f"Error loading {file_path}: {str(err)}")
    return parsed

def extract_video_name(path):
    """Return the final path component of ``path`` with its last extension removed."""
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem

In [13]:
prediction_path = 'output/batch_video_results.json'
references_path = 'datasets/video_prompts.json'
#output_path = 'output/batch_video_results.csv'


# Load JSON files
predictions_data = load_json_file(prediction_path)
references_data = load_json_file(references_path)

# load_json_file returns None on failure. Stop here with a clear error instead
# of printing "Exiting." and then crashing while iterating over None below.
if predictions_data is None or references_data is None:
    raise RuntimeError("Failed to load one or both JSON files. Exiting.")
if not predictions_data or not references_data:
    print("Warning: one or both JSON files are empty; the comparison table will have no matched rows.")

# Create a dictionary of reference prompts keyed by video name
reference_dict = {}
for item in references_data:
    # Adapt this according to your reference JSON structure
    if not (isinstance(item, dict) and 'video' in item and 'conversations' in item):
        continue
    turns = item['conversations']
    # The reference text is read from the second conversation turn; entries
    # with fewer than two turns are skipped instead of raising IndexError.
    if isinstance(turns, list) and len(turns) > 1:
        reference_dict[item['video']] = turns[1]['value']


# Create a list to store all comparison data
comparison_data = []

# Process each prediction
for pred in predictions_data:
    video_name = pred['video_name']
    prediction = pred.get('prediction', '')

    # Find matching reference ('' when the video has no reference entry)
    reference = reference_dict.get(video_name, '')

    # Add to comparison data
    comparison_data.append({
        'video_name': video_name,
        'reference': reference,
        'prediction': prediction,
    })

# Convert to DataFrame
df = pd.DataFrame(comparison_data)





In [19]:
# Save the comparison table as a Parquet file (not CSV)
df.to_parquet('output/compare_evalutaion_results.parquet', index=False)


In [1]:
import pandas as pd

result_df = pd.read_parquet('output/compare_evalutaion_results.parquet')

In [None]:
result_df.head()

In [None]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    '''
    Score predictions against references with BLEU, ROUGE, METEOR and BERTScore.

    Prints a short summary per metric and returns the raw metric results.

    Input format:

    predictions = [
        "What is the capital of France?",
        "Who wrote the book?",
        "What is the largest planet?"
    ]

    references = [
        ["What is the capital city of France?"],
        ["Who is the author of the book?"],
        ["Which planet is the largest in the solar system?"]
    ]

    Only the first reference of each inner list is used for METEOR and
    BERTScore; BLEU and ROUGE receive the reference lists as given.

    Returns:
        dict with the keys "BLEU" (result of the BLEU metric), "ROUGE" (result
        of the ROUGE metric), "METEOR" (list of per-sample METEOR scores, also
        kept under the legacy misspelt key "METERO") and "BERTScore" (dict
        holding the Precision / Recall / F1 tensors).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    '''
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) must have the same length"
        )
    if len(predictions) == 0:
        raise ValueError("predictions and references must not be empty")

    # Tokenised copies are only needed for METEOR (first reference per sample).
    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(refs[0])] for refs in references]

    # BLEU: the metric returns one overall score plus the per-order n-gram precisions.
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    print(f"BLEU score: {B_S['bleu']:.5f}")
    for i,n in enumerate(B_S['precisions']):
        # These are modified n-gram precisions, not cumulative BLEU-n scores.
        print(f"BLEU {i+1}-gram precision: {n:.5f}")

    # ROUGE
    # ROUGE-1:    unigram overlap between generated and reference text.
    # ROUGE-2:    bigram overlap between generated and reference text.
    # ROUGE-L:    longest common subsequence (LCS) between the two texts.
    # ROUGE-Lsum: LCS variant intended for longer, multi-sentence texts.
    # NOTE(review): `datasets.load_metric` is deprecated and no longer ships with
    # recent `datasets` releases. `evaluate.load("rouge")` returns plain floats, so
    # switching would also require updating callers that index the result with [1].
    rouge_metric = load_metric("rouge")
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    for rouge_key, label in (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
        ("rougeLsum", "ROUGE-Lsum"),
    ):
        # [1] selects the "mid" aggregate, [2] its F-measure.
        mid_f1 = rouge_results[rouge_key][1][2]
        print(f"{label} F1 score: {mid_f1:.5f}")

    # METEOR (per sample, then averaged; the list is non-empty thanks to the check above)
    meteor_scores = [meteor_score(references=refs, hypothesis=pred) for pred, refs in zip(predictions_tokenized, references_tokenized)]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # BERTScore
    P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METEOR":meteor_scores,
        "METERO":meteor_scores,  # legacy misspelt key kept for backward compatibility
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [None]:
# NOTE(review): `content` is only assigned in the following cell, so this cell
# fails with NameError on a fresh kernel when the notebook is run top to bottom.
content

In [6]:
refs = [ [i] for i in result_df['reference']]
content = result_df['prediction']

In [9]:
import re

def clean_sentence(s):
    """Normalise a prompt string before scoring.

    Replaces full-width (Chinese) commas with ASCII commas, removes trailing
    generation flags such as ``-fps 24`` and strips surrounding whitespace.

    Args:
        s (str): raw prompt text.
    Returns:
        str: the cleaned text.
    """
    s = s.replace("，", ",")  # full-width comma -> ASCII comma
    # Remove flags like "-fps 24". The lookbehind requires that the hyphen does
    # not follow a word character, so hyphenated words followed by a number
    # (e.g. "sci-fi 2049") are no longer truncated.
    s = re.sub(r"(?<!\w)-[a-z]+\s*\d+", "", s)
    s = s.strip()
    return s

predictions = [clean_sentence(p) for p in result_df['prediction']]
references = [[clean_sentence(r)] for r in result_df['reference']]


In [None]:
res = Calmetic(references=references,predictions=predictions)

In [None]:
# Two key values: the BLEU result and entry [1] of the ROUGE-Lsum result
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased")

# Clean the references the same way `predictions` was cleaned so both sides of
# the comparison are normalised identically. Previously the cleaned list was
# built but the raw references were the ones encoded.
reference_texts_ = [ clean_sentence(i) for i in result_df['reference'] ]
embeddings1 = sentence_model.encode(predictions, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

# Full prediction-by-reference similarity matrix; entry (i, i) on the diagonal
# is the score of prediction i against its own reference.
cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print summary statistics of the matched-pair (diagonal) cosine similarities
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")


In [None]:
import matplotlib.pyplot as plt

# Take the matched-pair cosine similarities (diagonal of the similarity matrix)
cos_sim_scores = cosine_scores_2.diagonal()

# Draw a histogram of the scores
plt.figure(figsize=(10, 6))
plt.hist(cos_sim_scores.cpu().numpy(), bins=50, edgecolor='black')
plt.title('Distribution of Cosine Similarity Scores')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
result_df.head()

In [None]:
# Similarity threshold used to select "high similarity" pairs. The comment and
# message previously said 0.7 while the code (and the saved file name) use 0.6.
SIM_THRESHOLD = 0.6

# Positions whose cosine similarity exceeds the threshold. flatten() always
# yields a 1-D tensor, so tolist() returns a list even when exactly one row
# matches (squeeze() collapsed that case to a bare int).
high_sim_mask = cos_sim_scores > SIM_THRESHOLD
high_sim_indices = high_sim_mask.nonzero().flatten().tolist()

# Create a new dataframe with high similarity pairs (positional selection, so
# it does not depend on result_df having a default integer index)
high_sim_df = pd.DataFrame({
    'video_name': result_df['video_name'].iloc[high_sim_indices].tolist(),
    'reference': result_df['reference'].iloc[high_sim_indices].tolist(),
    'prediction': result_df['prediction'].iloc[high_sim_indices].tolist(),
    'similarity_score': cos_sim_scores[high_sim_mask].cpu().numpy()
})

# Sort by similarity score in descending order
high_sim_df = high_sim_df.sort_values('similarity_score', ascending=False)

print(f"Number of pairs with similarity > {SIM_THRESHOLD}: {len(high_sim_df)}")
display(high_sim_df.head())

In [26]:
high_sim_df.to_parquet('output/high_sim_over0.6_df.parquet', index=False)

In [None]:
high_sim_df

In [None]:
import re

def clean_sentence(s):
    """Normalise a prompt string before scoring.

    Replaces full-width (Chinese) commas with ASCII commas, removes trailing
    generation flags such as ``-fps 24`` and strips surrounding whitespace.

    Args:
        s (str): raw prompt text.
    Returns:
        str: the cleaned text.
    """
    s = s.replace("，", ",")  # full-width comma -> ASCII comma
    # Remove flags like "-fps 24". The lookbehind requires that the hyphen does
    # not follow a word character, so hyphenated words followed by a number
    # (e.g. "sci-fi 2049") are no longer truncated.
    s = re.sub(r"(?<!\w)-[a-z]+\s*\d+", "", s)
    s = s.strip()
    return s

# NOTE(review): clean_sentence is defined in this cell but not applied to these
# texts, unlike the full-set evaluation earlier in the notebook — confirm that
# scoring the raw strings is intended.
predictions = [p for p in high_sim_df['prediction']]
references = [[r] for r in high_sim_df['reference']]

# Re-run the text metrics on the high-similarity subset only
res = Calmetic(references=references,predictions=predictions)
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])



In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  #SentenceTransformer("bert-base-uncased") 

# Embed the high-similarity subset. This rebinds embeddings1, embeddings2 and
# cosine_scores_2, which earlier cells computed for the full result set.
embeddings1 = sentence_model.encode(high_sim_df['prediction'], convert_to_tensor=True)
embeddings2 = sentence_model.encode(high_sim_df['reference'], convert_to_tensor=True)

cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)  

# Print summary statistics of the matched-pair (diagonal) cosine similarities
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")

In [None]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
            Frames follow the order of `indices`; a repeated index yields a repeated
            frame, and indices beyond the end of the stream are skipped.
    Raises:
        ValueError: if `indices` is empty or none of the requested frames could be decoded.
    '''
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    needed = set(wanted)       # O(1) membership test per decoded frame
    last_index = max(wanted)   # decoding can stop once this frame was seen

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            break
        if i in needed:
            decoded[i] = frame.to_ndarray(format="rgb24")

    # One entry per requested index so the frame count matches len(indices)
    # even when an index is requested more than once.
    frames = [decoded[i] for i in wanted if i in decoded]
    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    Raises:
        ValueError: if `seg_len` is not positive (e.g. the container reports no frame count).
    '''
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # The video is not longer than the sampling window, so a random window
        # cannot be drawn. Spread the indices over the whole video instead
        # (indices repeat when the video has fewer than clip_len frames).
        return np.linspace(0, seg_len - 1, num=clip_len).astype(np.int64)

    # Pick a random window of converted_len frames and sample it evenly.
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
# Download the demo clip from the Hugging Face Hub dataset repository
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 8 frames
indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")

# Encode the three candidate captions together with the sampled frames
inputs = processor(
    text=["playing sports", "eating spaghetti", "go shopping"],
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Evaluation For Video

In [1]:
import pandas as pd

compare_promts = pd.read_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_vidsim&bert_over0.5_df.parquet')

In [None]:
compare_promts

In [3]:
from bert_score import score

def bert_sim_eval(candidate_sentence, target_sentence):
    """Compute BERTScore (bert-base-uncased) for a single candidate/reference pair.

    Args:
        candidate_sentence (str): the generated text being evaluated.
        target_sentence (str): the reference text it is compared against.
    Returns:
        dict: {"P": precision, "R": recall, "F1": f1} as Python floats.
    """
    # Imported here because the cell only imports `score` from bert_score.
    from bert_score import BERTScorer

    # Build the scorer once and keep it on the function object. Calling
    # `score(...)` per pair loads the model again on every call, which dominates
    # the runtime when this function is used inside a loop.
    scorer = getattr(bert_sim_eval, "_scorer", None)
    if scorer is None:
        scorer = BERTScorer(lang='en', model_type='bert-base-uncased')
        bert_sim_eval._scorer = scorer

    # The scorer expects lists, so wrap the two sentences. verbose is off to
    # avoid one block of log output per sentence pair.
    P, R, F1 = scorer.score([candidate_sentence], [target_sentence], verbose=False)
    return {"P":P.item(),"R":R.item(),"F1":F1.item()}

In [None]:
from tqdm import tqdm
P_ours = []
R_ours = []
F1_ours = []
for index,item in tqdm(compare_promts.iterrows(),total=len(compare_promts)):
    # bert_sim_eval(candidate, target): the prediction is the candidate and the
    # reference is the target. Passing them the other way round swaps the
    # reported precision and recall.
    res = bert_sim_eval(item['prediction'],item['reference'])
    P_ours.append(res['P'])
    R_ours.append(res['R'])
    F1_ours.append(res['F1'])

In [None]:
print(sum(P_ours)/len(P_ours))
print(sum(R_ours)/len(R_ours))
print(sum(F1_ours)/len(F1_ours))

In [1]:
# xclip

In [None]:
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video
import gc


def gen_pic(compare_df):
    """Generate one CogVideoX video per row of ``compare_df``.

    For every row the text in ``row['RL_generation']`` is used as the prompt and
    the result is written to ``output/RL_genvideos/RL_gen_<video_name>``.
    Failures for individual rows are printed and do not stop the loop.

    Args:
        compare_df (pd.DataFrame): must provide the columns 'RL_generation'
            and 'video_name'.
    """
    import os

    out_dir = "output/RL_genvideos"
    os.makedirs(out_dir, exist_ok=True)  # export fails if the folder is missing

    pipe = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b",
        torch_dtype=torch.float16
    ).to("cuda")

    # pipe.enable_model_cpu_offload()
    # pipe.enable_sequential_cpu_offload()
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()

    total = len(compare_df)
    # enumerate gives the true position; the DataFrame index label need not be 0..n-1
    for position, (_, row) in enumerate(compare_df.iterrows(), start=1):
        prompt = row['RL_generation']
        video_name = row['video_name']

        print(f"[{position}/{total}] Generating: RL_gen_{video_name}")

        # Bind the name up front so the cleanup in `finally` cannot raise when
        # the pipeline call itself failed.
        video = None
        try:
            video = pipe(
                prompt=prompt,
                num_videos_per_prompt=1,
                num_inference_steps=50,
                num_frames=49,
                guidance_scale=6,
                generator=torch.Generator(device="cuda").manual_seed(42),
            ).frames[0]

            export_to_video(video, f"{out_dir}/RL_gen_{video_name}", fps=8)

        except Exception as e:
            print(f"❌ Error generating {video_name}: {e}")

        finally:
            # Explicitly release the frames and cached GPU memory
            del video
            gc.collect()
            torch.cuda.empty_cache()

In [None]:
gen_pic(compare_promts)

In [None]:
import av
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download
from torch.nn.functional import cosine_similarity
import pandas as pd
from tqdm import tqdm
import json

def read_video_pyav(container, indices):
    """Decode the requested frames of a video with PyAV.

    Args:
        container: opened PyAV input container.
        indices: frame indices to decode.
    Returns:
        np.ndarray: stacked RGB frames, one per requested index that exists in
        the stream, in the order of ``indices`` (repeated indices yield
        repeated frames).
    Raises:
        ValueError: if ``indices`` is empty or no requested frame could be decoded.
    """
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    needed = set(wanted)       # O(1) membership test per decoded frame
    last_index = max(wanted)   # decoding can stop once this frame was seen

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            break
        if i in needed:
            decoded[i] = frame.to_ndarray(format="rgb24")

    # One entry per requested index so the frame count matches len(indices)
    # even when an index is requested more than once.
    frames = [decoded[i] for i in wanted if i in decoded]
    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Sample ``clip_len`` frame indices from a video of ``seg_len`` frames.

    Args:
        clip_len (int): number of frames to sample.
        frame_sample_rate (int): sample every n-th frame.
        seg_len (int): number of frames available in the video.
    Returns:
        np.ndarray: int64 array of ``clip_len`` frame indices.
    Raises:
        ValueError: if ``seg_len`` is not positive (e.g. the container reports
            no frame count).
    """
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # The video is not longer than the sampling window, so a random window
        # cannot be drawn. Spread the indices over the whole video instead
        # (indices repeat when the video has fewer than clip_len frames).
        return np.linspace(0, seg_len - 1, num=clip_len).astype(np.int64)

    # Pick a random window of converted_len frames and sample it evenly.
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

def _load_xclip(model_name="microsoft/xclip-base-patch32"):
    """Load the X-CLIP processor and model once and reuse them on later calls."""
    cache = _load_xclip.__dict__.setdefault("_cache", {})
    if model_name not in cache:
        cache[model_name] = (
            AutoProcessor.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return cache[model_name]


def get_video_features(video_path, clip_len=8):
    """Extract features from a video file using XCLIP model

    Args:
        video_path (str): path of the video file to open with PyAV.
        clip_len (int): number of frames sampled from the video.
    Returns:
        The tensor returned by the model's ``get_video_features`` for the
        sampled frames (computed without gradient tracking).
    """
    # Close the container as soon as the frames have been decoded.
    with av.open(video_path) as container:
        indices = sample_frame_indices(clip_len=clip_len, frame_sample_rate=1,
                                       seg_len=container.streams.video[0].frames)
        video = read_video_pyav(container, indices)

    # Reuse the cached processor/model instead of reloading them for every video.
    processor, model = _load_xclip()

    inputs = processor(videos=list(video), return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        video_features = model.get_video_features(**inputs)
    return video_features

def compute_video_similarity(video1_path, video2_path):
    """Return the cosine similarity of the feature vectors of two video files.

    Features are obtained with ``get_video_features`` for each path (first
    video, then second) and the similarity is returned as a Python float.
    """
    first_embedding, second_embedding = (
        get_video_features(path) for path in (video1_path, video2_path)
    )
    return cosine_similarity(first_embedding, second_embedding).item()

def compare_videos(compare_df):
    """Compare original videos with generated videos

    For every row, the original video ``vidprom/cog_videos_example/<video_name>``
    is compared with ``output/RL_genvideos/RL_gen_<video_name>``. After each
    successful comparison the accumulated results are written to
    ``output/RL_gen_video_comparison_results.json`` so partial progress is kept.
    Rows that raise are reported and skipped.

    Args:
        compare_df (pd.DataFrame): must provide the columns 'video_name',
            'reference', 'Origin_generation', 'DI_generation' and 'similarity_score'.
    Returns:
        tuple: (results_df, similarity_scores) where results_df holds one row per
        successfully compared video, sorted by 'video_similarity_score'
        (descending), and similarity_scores lists the scores in processing order.
    """
    results = []
    similarity_scores = []
    total = len(compare_df)

    # enumerate gives the true position; the DataFrame index label need not be 0..n-1
    for position, (_, row) in enumerate(tqdm(compare_df.iterrows(), total=total), start=1):
        original_video = row['video_name']
        generated_video = f"RL_gen_{original_video}"

        try:
            similarity = compute_video_similarity(
                f"vidprom/cog_videos_example/{original_video}",
                f"output/RL_genvideos/{generated_video}"
            )

            result = {
                'video_name': original_video,
                'generated_video': generated_video,
                'reference': row['reference'],
                'origin': row['Origin_generation'],
                'DI_prediction': row['DI_generation'],
                'video_similarity_score': similarity,
                'promt_similarity_score': row['similarity_score']
            }

            # Append to results list
            results.append(result)
            similarity_scores.append(similarity)

            # Checkpoint: rewrite the JSON file with everything computed so far
            with open('output/RL_gen_video_comparison_results.json', 'w') as f:
                json.dump(results, f, indent=4)

            print(f"Processed {position}/{total} videos. Current similarity: {similarity:.4f}")

        except Exception as e:
            print(f"Error comparing videos {original_video}: {e}")
            continue

    # Create DataFrame with results
    results_df = pd.DataFrame(results)
    # The result rows carry 'video_similarity_score'. Sorting by the
    # non-existent 'similarity_score' raised KeyError after all the work was done.
    if not results_df.empty:
        results_df = results_df.sort_values('video_similarity_score', ascending=False)

    return results_df,similarity_scores

# Example usage:
# results = compare_videos(compare_df)
# print(results.head())



In [None]:
results,video_cos_sim_scores = compare_videos(compare_promts)
print(results.head())

In [None]:
import pandas as pd
vid_sim_res = pd.read_json('output/RL_gen_video_comparison_results.json')
print(vid_sim_res.shape)
vid_sim_res.head()

In [None]:
(compare_promts['video_name'] == vid_sim_res['video_name']).sum()

In [13]:
# Add RL_video_similarity_score column to compare_prompts using video_similarity_score from vid_sim_res.
# Rows are matched on video_name rather than on the positional index: videos that
# failed during comparison are absent from vid_sim_res, which would shift every
# following row under index-based assignment. Unmatched videos get NaN.
_vid_sim_by_name = vid_sim_res.drop_duplicates('video_name').set_index('video_name')
compare_promts['RL_video_similarity_score'] = compare_promts['video_name'].map(_vid_sim_by_name['video_similarity_score'])
# NOTE(review): this replaces the 'RL_generation' column (read as the text prompt
# by gen_pic) with the generated video file name — confirm that overwriting the
# prompt text is intended before saving the frame.
compare_promts['RL_generation'] = compare_promts['video_name'].map(_vid_sim_by_name['generated_video'])

In [None]:
compare_promts.head()

In [16]:
compare_promts.to_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_vidsim&bert_over0.5_df.parquet',index=False)

In [None]:
# Convert vid_sim_res into a Hugging Face `datasets` Dataset and save it to disk
from datasets import Dataset
vid_sim_res_ds = Dataset.from_pandas(vid_sim_res)
vid_sim_res_ds.save_to_disk("output/prompt_vid_cos_res/video_comparison_results")


In [6]:
import datasets
vid_sim_res_ds = datasets.load_from_disk("output/prompt_vid_cos_res/video_comparison_results")

In [None]:
vid_sim_res_ds

In [None]:

similarity = compute_video_similarity("examples/video3.mp4", "examples/video3.mp4")
print(f"Video similarity score: {similarity}")

In [None]:

similarity = compute_video_similarity("examples/video3.mp4", "examples/video3.mp4")
print(f"Video similarity score: {similarity}")

In [1]:
# Combine the DI and DI+FT datasets to evaluate video similarity and text-only metrics

import pandas as pd


In [2]:
DI_df = pd.read_parquet('output/prompt_vid_cos_res/video_comparison_results.parquet')
DI_FT_df = pd.read_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_vidsim&bert_over0.5_df.parquet')

In [None]:
DI_FT_df.head()

In [None]:
DI_df.head()

In [None]:
# For every video_name in DI_FT_df, look up its video_similarity_score in DI_df
# and add it as a new column. A single keyed lookup replaces the per-row scan of
# DI_df; the first match wins as before, and names missing from DI_df yield NaN
# instead of raising IndexError.
_di_score_by_name = DI_df.drop_duplicates('video_name').set_index('video_name')['video_similarity_score']
DI_FT_df['DI_video_similarity_score'] = DI_FT_df['video_name'].map(_di_score_by_name)
DI_FT_df.head()
#DI_FT_df.to_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_sim_over0.5_df.parquet',index=False)


In [17]:
from bert_score import score

def bert_sim_eval(candidate_sentence, target_sentence):
    """Compute BERTScore (default English model) for a single candidate/reference pair.

    Args:
        candidate_sentence (str): the generated text being evaluated.
        target_sentence (str): the reference text it is compared against.
    Returns:
        dict: {"P": precision, "R": recall, "F1": f1} as Python floats.
    """
    # Imported here because the cell only imports `score` from bert_score.
    from bert_score import BERTScorer

    # Build the scorer once and keep it on the function object. Calling
    # `score(...)` per pair loads the model again on every call, which dominates
    # the runtime when this function is used inside a loop.
    scorer = getattr(bert_sim_eval, "_scorer", None)
    if scorer is None:
        scorer = BERTScorer(lang='en')
        bert_sim_eval._scorer = scorer

    # The scorer expects lists, so wrap the two sentences. verbose is off to
    # avoid one block of log output per sentence pair.
    P, R, F1 = scorer.score([candidate_sentence], [target_sentence], verbose=False)
    return {"P":P.item(),"R":R.item(),"F1":F1.item()}

In [None]:
from tqdm import tqdm
P_ours = []
R_ours = []
F1_ours = []
for index,item in tqdm(DI_FT_df.iterrows(),total=len(DI_FT_df)):    
    res = bert_sim_eval(item['DI_generation'],item['reference'])
    P_ours.append(res['P'])
    R_ours.append(res['R'])
    F1_ours.append(res['F1'])

In [None]:
print("P:",sum(P_ours)/len(P_ours))
print("R:",sum(R_ours)/len(R_ours))
print("F1:",sum(F1_ours)/len(F1_ours))

In [21]:
# Store P_ours, R_ours and F1_ours in DI_FT_df as one list-valued column
# ([P, R, F1] per row) so it is read back as a list-like value.
if not (len(P_ours) == len(R_ours) == len(F1_ours) == len(DI_FT_df)):
    raise ValueError("P_ours, R_ours and F1_ours must each hold one score per row of DI_FT_df")
# Pair scores with rows by position. Using the row's index label as a list
# position is only correct for a default 0..n-1 index.
DI_FT_df['DI_bert_sim_score'] = pd.Series(
    [list(prf) for prf in zip(P_ours, R_ours, F1_ours)],
    index=DI_FT_df.index,
)
DI_FT_df.to_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_vidsim&bert_over0.5_df.parquet',index=False)


# # Read DI_FT_df back
# DI_FT_df = pd.read_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_vidsim&bert_over0.5_df.parquet')
# DI_FT_df.head()
# # Convert bert_sim_score in DI_FT_df into a list
# DI_FT_df['RL_bert_sim_score'] = DI_FT_df['RL_bert_sim_score'].apply(lambda x: x[0])
# DI_FT_df.head()


In [None]:
print(P_ours)
print(R_ours)
print(F1_ours)


In [None]:
DI_FT_df

In [None]:
# Read the parquet file
filtered_df = pd.read_parquet('output/prompt_vid_cos_res/DI+FT_Video2Text_RL_generation_sim_over0.6_df.parquet')

# Get the video names from filtered_df
filtered_video_names = filtered_df['video_name'].tolist()

# Filter DI_FT_df to only include rows where video_name is in filtered_video_names
filtered_DI_FT_df = DI_FT_df[DI_FT_df['video_name'].isin(filtered_video_names)]

# Print number of rows in filtered dataframe
print(f"Number of rows in filtered dataframe: {len(filtered_DI_FT_df)}")


In [None]:
import numpy as np

# Assumes each row of RL_bert_sim_score holds [P, R, F1] — TODO confirm; the
# column is not created in this notebook.
scores = np.array(filtered_DI_FT_df['RL_bert_sim_score'].tolist())  # 2-D array whose columns are P, R and F1

P_avg = scores[:, 0].mean()
R_avg = scores[:, 1].mean()
F1_avg = scores[:, 2].mean()

print("P:", P_avg)
print("R:", R_avg)
print("F1:", F1_avg)

In [None]:
vid_scores  = np.array(filtered_DI_FT_df['RL_video_similarity_score'].tolist())
print(vid_scores.mean())
