In [1]:
import json
import pandas as pd
from rouge import Rouge

# Load JSON data
input_file = 'results_from_my_lora.json'  # Adjust the path to your JSON file
with open(input_file, 'r') as f:
    data = json.load(f)

# Remove newline characters from the predicted_response fields
for item in data:
    item["predicted_response"] = item["predicted_response"].replace("\n", "")

# Function to calculate ROUGE scores
def calculate_rouge(predicted_text, ground_truth_text):
    rouge = Rouge()
    scores = rouge.get_scores(predicted_text, ground_truth_text)
    rouge_1_score = scores[0]["rouge-1"]["f"]
    rouge_2_score = scores[0]["rouge-2"]["f"]
    rouge_l_score = scores[0]["rouge-l"]["f"]
    return rouge_1_score, rouge_2_score, rouge_l_score

# Initialize list to store ROUGE scores
rouge_scores = []

# Iterate over the data and calculate ROUGE scores
for entry in data:
    question = entry['question']
    original_response = entry['original_response']
    predicted_response = entry['predicted_response']
    
    rouge_1, rouge_2, rouge_l = calculate_rouge(predicted_response, original_response)
    rouge_scores.append({
        'question': question,
        'original_response': original_response,
        'predicted_response': predicted_response,
        'ROUGE-1': rouge_1,
        'ROUGE-2': rouge_2,
        'ROUGE-L': rouge_l
    })

# Convert the list to a DataFrame
rouge_df = pd.DataFrame(rouge_scores)

# Calculate average ROUGE scores
average_rouge_scores = rouge_df[['ROUGE-1', 'ROUGE-2', 'ROUGE-L']].mean()

# Print average ROUGE scores
print("Average ROUGE-1 score:", average_rouge_scores['ROUGE-1'])
print("Average ROUGE-2 score:", average_rouge_scores['ROUGE-2'])
print("Average ROUGE-L score:", average_rouge_scores['ROUGE-L'])

# Optionally, save the DataFrame with ROUGE scores to a new JSON file
# output_file = '/path/to/results_with_rouge_scores.json'  # Adjust the path to your output JSON file
# rouge_df.to_json(output_file, orient='records', indent=4)
# print(f"Results with ROUGE scores saved to {output_file}")


Average ROUGE-1 score: 0.10131848890407362
Average ROUGE-2 score: 0.0
Average ROUGE-L score: 0.10062237361585308


In [14]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sacrebleu
from pycocoevalcap.cider.cider import Cider
from rouge_score import rouge_scorer
import evaluate

# Download the necessary NLTK data files
nltk.download('punkt')

# Load the data from the JSON file
with open('Q_A_with_predictions_and_rouge.json', 'r') as file:
    data = json.load(file)

# Extract original and predicted answers
original_answers = [item['Original_Answer'] for item in data]
predicted_answers = [item['Predicted_Answer'] for item in data]

# Cosine Similarity
vectorizer = TfidfVectorizer().fit_transform(original_answers + predicted_answers)
vectors = vectorizer.toarray()
cosine_scores = [cosine_similarity([vectors[i]], [vectors[i + len(original_answers)]])[0][0] for i in range(len(original_answers))]

# BLEU Score
bleu_scores = [sacrebleu.sentence_bleu(pred, [orig]).score for orig, pred in zip(original_answers, predicted_answers)]

# CIDEr Score
cider = Cider()
cider_scores, _ = cider.compute_score({i: [pred] for i, pred in enumerate(predicted_answers)}, 
                                       {i: [orig] for i, orig in enumerate(original_answers)})

# ROUGE Scores
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for orig, pred in zip(original_answers, predicted_answers):
    scores = rouge.score(orig, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# METEOR Score using Hugging Face's evaluate library
meteor = evaluate.load("meteor")
meteor_scores = meteor.compute(predictions=predicted_answers, references=original_answers)

# Calculate overall scores
overall_cosine_similarity = sum(cosine_scores) / len(cosine_scores)
overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
overall_cider_score = cider_scores  # Directly use the CIDEr score
overall_rouge1_score = sum(rouge1_scores) / len(rouge1_scores)
overall_rouge2_score = sum(rouge2_scores) / len(rouge2_scores)
overall_rougeL_score = sum(rougeL_scores) / len(rougeL_scores)
overall_meteor_score = meteor_scores['meteor']  # Use the METEOR score

# Print overall scores
print("Overall Scores:")
print(f"Average Cosine Similarity: {overall_cosine_similarity:.4f}")
print(f"Average BLEU Score: {overall_bleu_score:.4f}")
print(f"Average CIDEr Score: {overall_cider_score:.4f}")
print(f"Average ROUGE-1 Score: {overall_rouge1_score:.4f}")
print(f"Average ROUGE-2 Score: {overall_rouge2_score:.4f}")
print(f"Average ROUGE-L Score: {overall_rougeL_score:.4f}")
print(f"Average METEOR Score: {overall_meteor_score:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...


Overall Scores:
Average Cosine Similarity: 0.3780
Average BLEU Score: 13.2176
Average CIDEr Score: 0.5246
Average ROUGE-1 Score: 0.4652
Average ROUGE-2 Score: 0.2206
Average ROUGE-L Score: 0.3527
Average METEOR Score: 0.3457


In [4]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sacrebleu
from pycocoevalcap.cider.cider import Cider
from rouge_score import rouge_scorer
import evaluate

# Download the necessary NLTK data files
nltk.download('punkt')

# Load the data from the JSON file
with open('predictions.json', 'r') as file:
    data = json.load(file)

# Extract original and predicted answers
original_answers = [item['Ground Truth'] for item in data]
predicted_answers = [item['Predicted'] for item in data]

# Cosine Similarity
vectorizer = TfidfVectorizer().fit_transform(original_answers + predicted_answers)
vectors = vectorizer.toarray()
cosine_scores = [cosine_similarity([vectors[i]], [vectors[i + len(original_answers)]])[0][0] for i in range(len(original_answers))]

# BLEU Score
bleu_scores = [sacrebleu.sentence_bleu(pred, [orig]).score for orig, pred in zip(original_answers, predicted_answers)]

# CIDEr Score
cider = Cider()
cider_scores, _ = cider.compute_score({i: [pred] for i, pred in enumerate(predicted_answers)}, 
                                       {i: [orig] for i, orig in enumerate(original_answers)})

# ROUGE Scores
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for orig, pred in zip(original_answers, predicted_answers):
    scores = rouge.score(orig, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# METEOR Score using Hugging Face's evaluate library
meteor = evaluate.load("meteor")
meteor_scores = meteor.compute(predictions=predicted_answers, references=original_answers)

# Calculate overall scores
overall_cosine_similarity = sum(cosine_scores) / len(cosine_scores)
overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
overall_cider_score = cider_scores  # Directly use the CIDEr score
overall_rouge1_score = sum(rouge1_scores) / len(rouge1_scores)
overall_rouge2_score = sum(rouge2_scores) / len(rouge2_scores)
overall_rougeL_score = sum(rougeL_scores) / len(rougeL_scores)
overall_meteor_score = meteor_scores['meteor']  # Use the METEOR score

# Print overall scores
print("Overall Scores:")
print(f"Average Cosine Similarity: {overall_cosine_similarity:.4f}")
print(f"Average BLEU Score: {overall_bleu_score:.4f}")
print(f"Average CIDEr Score: {overall_cider_score:.4f}")
print(f"Average ROUGE-1 Score: {overall_rouge1_score:.4f}")
print(f"Average ROUGE-2 Score: {overall_rouge2_score:.4f}")
print(f"Average ROUGE-L Score: {overall_rougeL_score:.4f}")
print(f"Average METEOR Score: {overall_meteor_score:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\faara\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Overall Scores:
Average Cosine Similarity: 0.1279
Average BLEU Score: 0.2882
Average CIDEr Score: 0.0071
Average ROUGE-1 Score: 0.0813
Average ROUGE-2 Score: 0.0314
Average ROUGE-L Score: 0.0733
Average METEOR Score: 0.0348
