In [None]:
from evaluation import Evaluation
import os
import pandas as pd

### Evaluation

- To evaluate all metrics, set `generation_path` to the folder containing the text generated by the model and `metrics_path` to the folder where the metrics should be stored.
- Important: The folders under ./evaluation_files already contain the computed metrics as .csv files. To run a new evaluation, delete the existing metric files first.

In [None]:
generation_path = ""
# for example: "./evaluation_files/variation3_llama2_chat/prompt_size8/generation/"
metrics_path = ""
# for example: "./evaluation_files/variation3_llama2_chat/prompt_size8/metrics/"

# Run every metric on each generated file found in generation_path.
# The listing is sorted so the files are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(generation_path)):
    evaluation_file = os.path.join(generation_path, filename)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints created
    # by Jupyter); those are not generation files, so skip them.
    if not os.path.isfile(evaluation_file):
        continue
    evaluate = Evaluation(evaluation_file=evaluation_file, metrics_folder=metrics_path)
    evaluate.evaluate_all_metrics()
    

In [None]:
## to convert the results to a latex table (optional, but saves a lot of time)
def convert_to_latex(data_array, caption="Your Table Caption Here", label="your-label"):
    """Print a LaTeX table of mean scores per claim label and hop count.

    Args:
        data_array: Iterable of dicts with the keys ``'filename'`` and
            ``'mean'``. Each filename must consist of exactly three
            underscore-separated parts; the first (lower-cased) must be
            ``supports`` or ``refutes`` and the second is used as the hop
            key (``hops2``, ``hops3`` and ``hops4`` are the ones rendered).
        caption: Text placed inside ``\\caption{...}``.
        label: Text placed inside ``\\label{...}``.

    Returns:
        None. The table is written to standard output.

    Raises:
        ValueError: If a filename does not split into exactly three parts.
        KeyError: If a filename starts with an unknown claim label, or if
            one of the six (label, hops) combinations is missing.
    """
    table_data = {'supports': {}, 'refutes': {}}
    for item in data_array:
        # Use dedicated local names here: reusing ``label`` would overwrite
        # the ``label`` argument and corrupt the emitted \label{...}.
        claim_label, hops, _ = item['filename'].split('_')
        table_data[claim_label.lower()][hops] = item['mean']

    rows = []
    for row_name in ('supports', 'refutes'):
        cells = "".join(
            f"& {table_data[row_name][hops]:.2f} "
            for hops in ('hops2', 'hops3', 'hops4')
        )
        rows.append(f"{row_name.capitalize()} {cells}\\\\\n")

    latex_table = (
        "\\begin{table}[h]\n\\centering\n"
        "\\begin{tabular}{lccc}\n"
        " & Hops 2 & Hops 3 & Hops 4 \\\\\\hline\n"
        + "".join(rows)
        + "\\end{tabular}\n"
        + f"\\caption{{{caption}}}\n"
        + f"\\label{{{label}}}\n"
        + "\\end{table}"
    )

    # Printing the LaTeX table
    print(latex_table)

### Results

Llama2 Base Model 

In [None]:
metrics_path = "./evaluation_files/basic_llama2/metrics/"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

### Llama2-chat Model

In [None]:
metrics_path = "./evaluation_files/basic_llama2_chat/metrics/"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))


### Variant1

In [None]:
metrics_path = "./evaluation_files/variation1_llama2_chat/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

In [None]:
metrics_path = "./evaluation_files/variation2_llama2_chat/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

In [None]:
metrics_path = "./evaluation_files/variation3_llama2_chat/prompt_size2/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

In [None]:
metrics_path = "./evaluation_files/variation3_llama2_chat/prompt_size4/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

In [None]:
metrics_path = "./evaluation_files/variation3_llama2_chat/prompt_size6/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))

In [None]:
metrics_path = "./evaluation_files/variation3_llama2_chat/prompt_size8/metrics"
bart_mean_scores = []
bleu_mean_scores = []
bert_mean_scores = []
rouge_mean_scores = []

# Filename suffix -> (list collecting the per-file means, CSV column with the score).
metric_targets = {
    "BART.csv": (bart_mean_scores, 'bartscore'),
    "BLEU.csv": (bleu_mean_scores, 'bleu_score'),
    "BERT.csv": (bert_mean_scores, 'F1 Score'),
    "ROUGE.csv": (rouge_mean_scores, 'ROUGE-1 F'),
}

# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(metrics_path)):
    for suffix, (scores, column) in metric_targets.items():
        if filename.endswith(suffix):
            # Only recognised metric files are parsed, so stray entries
            # (e.g. a .ipynb_checkpoints directory) cannot crash pd.read_csv.
            df = pd.read_csv(os.path.join(metrics_path, filename))
            scores.append({"filename": filename, "mean": df[column].mean()})
            break

# ROUGE means are collected in rouge_mean_scores but are not part of the report.
reported_metrics = (
    ("Bart: ", bart_mean_scores),
    ("Bert: ", bert_mean_scores),
    ("Bleu: ", bleu_mean_scores),
)

# Per-file means.
for title, scores in reported_metrics:
    print(title)
    for obj in scores:
        print(obj)

# Mean over the per-file means of each metric.
print("Mean Scores:")
for title, scores in reported_metrics:
    print(title)
    # An empty list would otherwise raise ZeroDivisionError; report NaN instead.
    print(sum(obj['mean'] for obj in scores) / len(scores) if scores else float("nan"))