# Evaluation of Generated Texts

In this notebook, we evaluate the generated plain language summaries using BERTScore and other metrics. We compare the generated summaries against ground truth summaries to measure the quality and accuracy of the generated texts.

In [None]:
import requests
import os
import pandas as pd
import numpy as npx
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import gc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from evaluate import load
from scipy.stats import ttest_rel

## Loading BERTScore and Setting Up Directories

We load the BERTScore metric and set up the paths to the directories containing the ground truth summaries and the generated summaries from GPT-3.5 and GPT-4.

In [None]:
# Directories with the reference summaries and each model's generated summaries
# (relative paths, resolved against the notebook's working directory).
folder_path_ground_truth, folder_path_gpt_35, folder_path_gpt_4 = (
    "ground_truth",
    "gpt_35",
    "gpt_4",
)

# BERTScore metric object obtained through the `evaluate` loader.
bertscore = load("bertscore")

## Reading Texts from Folders

We define a function to read all text files from a specified folder. We then use this function to read the texts from the ground truth, GPT-3.5, and GPT-4 folders.

In [None]:
def read_texts_from_folder(folder_path):
    """
    Reads all text files from the specified folder, handling encoding issues.

    Entries are visited in sorted filename order so the returned list is
    deterministic, and lists read from parallel folders that share filenames
    line up index-by-index. Sub-directories and other non-file entries are
    ignored.

    Args:
        folder_path (str): Path to the folder containing text files.

    Returns:
        list: A list of strings, each containing the content of a text file,
        ordered by filename. A file that can be decoded neither as UTF-8 nor
        as CP1252 is reported with a printed message and left out of the list.
    """
    texts = []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)

        # Skip directories (e.g. .ipynb_checkpoints); open() would raise on them
        if not os.path.isfile(file_path):
            continue

        try:
            # Try reading the file with UTF-8 encoding
            with open(file_path, 'r', encoding="utf-8") as f:
                texts.append(f.read())
        except UnicodeDecodeError:
            try:
                # Try reading the file with CP1252 encoding if UTF-8 fails
                with open(file_path, 'r', encoding="cp1252") as f:
                    texts.append(f.read())
            except UnicodeDecodeError as e:
                print(f"Error decoding {file}: {e}")

    return texts

# Use the function to read the texts from the specified folders
texts_ground_truth = read_texts_from_folder(folder_path_ground_truth)
texts_gpt_35 = read_texts_from_folder(folder_path_gpt_35)
texts_gpt_4 = read_texts_from_folder(folder_path_gpt_4)

# The scoring cells pair these lists index-by-index, so they must have equal length
assert len(texts_ground_truth) == len(texts_gpt_35) == len(texts_gpt_4), (
    f"Corpus size mismatch: ground truth={len(texts_ground_truth)}, "
    f"GPT-3.5={len(texts_gpt_35)}, GPT-4={len(texts_gpt_4)}"
)

Before performing the evaluation, we clear the GPU cache and run Python's garbage collector to free up memory. This helps ensure that we have sufficient resources for the upcoming computations.

In [None]:
# Run garbage collection first so that unreachable Python objects (and any GPU
# tensors they still hold) are released before the cache is emptied
gc.collect()

# Clear the GPU cache: hand PyTorch's unused cached GPU memory back to the device
torch.cuda.empty_cache()

## Computing BERTScore

We compute the BERTScore for the generated plain language summaries using the Longformer model fine-tuned on TriviaQA. This allows us to evaluate the similarity between the generated summaries and the ground truth summaries.

In [None]:
# Container for the BERTScore output of each system
results = {}

# Keys under which each system's scores are stored in `results`
# (the scoring model name plus a suffix naming the system that wrote the summaries)
r_gpt4 = "allenai/longformer-large-4096-finetuned-triviaqa_gpt4"
r_gpt35 = "allenai/longformer-large-4096-finetuned-triviaqa_gpt35"

# Score GPT-3.5 first, then GPT-4, each against the same ground-truth summaries
for result_key, generated_texts in ((r_gpt35, texts_gpt_35), (r_gpt4, texts_gpt_4)):
    results[result_key] = bertscore.compute(
        predictions=generated_texts,
        references=texts_ground_truth,
        model_type="allenai/longformer-large-4096-finetuned-triviaqa",
    )

In [None]:
# Per-system accumulators: one list of scores per BERTScore metric
results_gpt_4 = {'precision': [], 'recall': [], 'f1': []}
results_gpt_35 = {'precision': [], 'recall': [], 'f1': []}

# Route every entry of `results` to the accumulator named in its key
for key, value in results.items():
    print(key)
    if 'gpt4' in key:
        target = results_gpt_4
    elif 'gpt35' in key:
        target = results_gpt_35
    else:
        # Keys naming neither system are ignored
        continue
    for metric in ('precision', 'recall', 'f1'):
        target[metric] = target[metric] + value[metric]

results_gpt_4, results_gpt_35

facebook/bart-large-mnli_gpt4_1
facebook/bart-large-mnli_gpt35_1
facebook/bart-large-mnli_gpt4_2
facebook/bart-large-mnli_gpt35_2
facebook/bart-large-mnli_gpt4_3
facebook/bart-large-mnli_gpt35_3
facebook/bart-large-mnli_gpt4_4
facebook/bart-large-mnli_gpt35_4
facebook/bart-large-mnli_gpt4_5
facebook/bart-large-mnli_gpt35_5
facebook/bart-large-mnli_gpt4_6
facebook/bart-large-mnli_gpt35_6
facebook/bart-large-mnli_gpt4_7
facebook/bart-large-mnli_gpt35_7
facebook/bart-large-mnli_gpt4_8
facebook/bart-large-mnli_gpt35_8
facebook/bart-large-mnli_gpt4_9
facebook/bart-large-mnli_gpt35_9
facebook/bart-large-mnli_gpt4_10
facebook/bart-large-mnli_gpt35_10
facebook/bart-large-mnli_gpt4_11
facebook/bart-large-mnli_gpt35_11
facebook/bart-large-mnli_gpt4_12
facebook/bart-large-mnli_gpt35_12
facebook/bart-large-mnli_gpt4_13
facebook/bart-large-mnli_gpt35_13
facebook/bart-large-mnli_gpt4_14
facebook/bart-large-mnli_gpt35_14
facebook/bart-large-mnli_gpt4_15
facebook/bart-large-mnli_gpt35_15
facebook/bart

({'precision': [0.5544986724853516,
   0.5842946171760559,
   0.5771188735961914,
   0.5473431348800659,
   0.5778084993362427,
   0.5618857145309448,
   0.5628688335418701,
   0.612517237663269,
   0.58132004737854,
   0.5586987733840942,
   0.5668591856956482,
   0.5536668300628662,
   0.5981842279434204,
   0.553693413734436,
   0.5846501588821411,
   0.5171374082565308,
   0.587201714515686,
   0.5435212850570679,
   0.5487349033355713,
   0.5565096139907837,
   0.5550116300582886,
   0.5743672847747803,
   0.5507493615150452,
   0.5440987944602966,
   0.5296550989151001,
   0.5198606848716736,
   0.5263918042182922,
   0.5323164463043213,
   0.521886944770813,
   0.5438578128814697,
   0.5663763880729675,
   0.5534008741378784,
   0.5860253572463989,
   0.5006436109542847,
   0.5889208316802979,
   0.5628736019134521,
   0.6678363084793091,
   0.5193211436271667,
   0.5895353555679321,
   0.5704889297485352,
   0.5751364231109619,
   0.5962153673171997,
   0.5640169382095337,
   0

In [None]:
# Load the BLEU metric; `bleu` was never defined in the preceding cells,
# so this cell raised NameError on a fresh kernel
bleu = load("bleu")

# NOTE: r_gpt4 / r_gpt35 are rebound here to short keys; after this cell runs
# they no longer match the keys of the BERTScore `results` dictionary
r_gpt4 = "gpt4"
r_gpt35 = "gpt35"
results_bleu = {r_gpt4: [], r_gpt35: []}

# Index into the metric's "precisions" list; 0 selects its first entry
# (presumably the 1-gram precision - confirm against the BLEU metric docs)
n = 0

# One BLEU computation per document, pairing each generated summary with the
# ground-truth summary at the same index
for i, reference in enumerate(texts_ground_truth):
    results_bleu[r_gpt35].append(
        bleu.compute(predictions=[texts_gpt_35[i]], references=[reference])["precisions"][n]
    )
    results_bleu[r_gpt4].append(
        bleu.compute(predictions=[texts_gpt_4[i]], references=[reference])["precisions"][n]
    )

In [None]:
# Perform a paired t-test on the per-summary BERTScore 'precision' of GPT-4 and GPT-35.
# The separated score dictionaries are used instead of results[r_gpt4] / results[r_gpt35]
# because the BLEU cell rebinds r_gpt4 / r_gpt35 to keys that do not exist in `results`.
t_statistic, p_value = ttest_rel(results_gpt_4['precision'], results_gpt_35['precision'])
t_statistic, p_value

(0.9167783778237993, 0.38606155269211473)

In [None]:
import json

# Write each system's separated BERTScore results to its own JSON file
for output_path, scores in (
    ('results_fb_gpt_4.json', results_gpt_4),
    ('results_fb_gpt_35.json', results_gpt_35),
):
    with open(output_path, 'w') as f:
        json.dump(scores, f)

In [None]:
# Display the keys of the score sets currently stored in `results`
results.keys()

dict_keys(['results_longformer_cochrane_gpt4_1', 'results_longformer_cochrane_gpt35_1', 'results_longformer_cochrane_gpt4_2', 'results_longformer_cochrane_gpt35_2', 'results_longformer_cochrane_gpt4_3', 'results_longformer_cochrane_gpt35_3', 'results_longformer_cochrane_gpt4_4', 'results_longformer_cochrane_gpt35_4', 'results_longformer_cochrane_gpt4_5', 'results_longformer_cochrane_gpt35_5', 'results_longformer_cochrane_gpt4_6', 'results_longformer_cochrane_gpt35_6', 'results_longformer_cochrane_gpt4_7', 'results_longformer_cochrane_gpt35_7', 'results_longformer_cochrane_gpt4_8', 'results_longformer_cochrane_gpt35_8', 'results_longformer_cochrane_gpt4_9', 'results_longformer_cochrane_gpt35_9', 'results_longformer_cochrane_gpt4_10', 'results_longformer_cochrane_gpt35_10', 'results_longformer_cochrane_gpt4_11', 'results_longformer_cochrane_gpt35_11'])

In [None]:
# Batch 1: the first 20 documents. The GPT-4 call previously sliced [220:20],
# which is always empty; it now uses [:20] like the GPT-3.5 call, in line with
# the 20-document batches that follow.
results_longformer_cochrane_gpt4_1 = bertscore.compute(
    predictions=texts_gpt_4[:20],
    references=texts_ground_truth[:20],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_1 = bertscore.compute(
    predictions=texts_gpt_35[:20],
    references=texts_ground_truth[:20],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

# Batch 2: documents 20-39
results_longformer_cochrane_gpt4_2 = bertscore.compute(
    predictions=texts_gpt_4[20:40],
    references=texts_ground_truth[20:40],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_2 = bertscore.compute(
    predictions=texts_gpt_35[20:40],
    references=texts_ground_truth[20:40],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [None]:
# Batch 2: documents 20-39, scored for both systems against the same references
batch = slice(20, 40)
results_longformer_cochrane_gpt4_2 = bertscore.compute(
    predictions=texts_gpt_4[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_2 = bertscore.compute(
    predictions=texts_gpt_35[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

In [None]:
# Batch 3: documents 40-59, scored for both systems against the same references
batch = slice(40, 60)
results_longformer_cochrane_gpt4_3 = bertscore.compute(
    predictions=texts_gpt_4[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_3 = bertscore.compute(
    predictions=texts_gpt_35[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

In [None]:
# Batch 4: documents 60-79, scored for both systems against the same references
batch = slice(60, 80)
results_longformer_cochrane_gpt4_4 = bertscore.compute(
    predictions=texts_gpt_4[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_4 = bertscore.compute(
    predictions=texts_gpt_35[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

In [None]:
# Batch 5: documents 80-99, scored for both systems against the same references
batch = slice(80, 100)
results_longformer_cochrane_gpt4_5 = bertscore.compute(
    predictions=texts_gpt_4[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_5 = bertscore.compute(
    predictions=texts_gpt_35[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

In [None]:
# Batch 6: every remaining document from index 100 onwards
batch = slice(100, None)
results_longformer_cochrane_gpt4_6 = bertscore.compute(
    predictions=texts_gpt_4[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)
results_longformer_cochrane_gpt35_6 = bertscore.compute(
    predictions=texts_gpt_35[batch],
    references=texts_ground_truth[batch],
    model_type="allenai/longformer-large-4096-finetuned-triviaqa",
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 1 result
precision_1, recall_1, f1_1 = (
    results_longformer_cochrane_gpt4_1[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 2 result
precision_2, recall_2, f1_2 = (
    results_longformer_cochrane_gpt4_2[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 3 result
precision_3, recall_3, f1_3 = (
    results_longformer_cochrane_gpt4_3[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 4 result
precision_4, recall_4, f1_4 = (
    results_longformer_cochrane_gpt4_4[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 5 result
precision_5, recall_5, f1_5 = (
    results_longformer_cochrane_gpt4_5[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Pull the three score lists out of the GPT-4 batch 6 result
precision_6, recall_6, f1_6 = (
    results_longformer_cochrane_gpt4_6[metric] for metric in ('precision', 'recall', 'f1')
)

In [None]:
# Concatenate the six per-batch lists into one list per metric, in batch order
precision_gpt4 = (
    precision_1 + precision_2 + precision_3 + precision_4 + precision_5 + precision_6
)
recall_gpt4 = recall_1 + recall_2 + recall_3 + recall_4 + recall_5 + recall_6
f1_gpt4 = f1_1 + f1_2 + f1_3 + f1_4 + f1_5 + f1_6

# Bundle the concatenated GPT-4 scores in the same layout as a single BERTScore result
results_longformer_cochrane_gpt4 = {
    'precision': precision_gpt4,
    'recall': recall_gpt4,
    'f1': f1_gpt4,
}
results_longformer_cochrane_gpt4

{'precision': [0.7839252948760986,
  0.8025096654891968,
  0.7971634864807129,
  0.7842993140220642,
  0.7842890024185181,
  0.7882592082023621,
  0.7913055419921875,
  0.8127666711807251,
  0.7967017292976379,
  0.7917708158493042,
  0.7917456030845642,
  0.7774495482444763,
  0.8004282116889954,
  0.7732133865356445,
  0.7909913063049316,
  0.7758715152740479,
  0.8014787435531616,
  0.7864641547203064,
  0.7871936559677124,
  0.781390368938446,
  0.7931512594223022,
  0.7778012156486511,
  0.7839068174362183,
  0.783258855342865,
  0.7753442525863647,
  0.7608494758605957,
  0.7785906791687012,
  0.7775368690490723,
  0.779800295829773,
  0.7875965237617493,
  0.7878661155700684,
  0.7928590774536133,
  0.8036999702453613,
  0.7659061551094055,
  0.7978131175041199,
  0.7858160734176636,
  0.8523211479187012,
  0.7826504707336426,
  0.8016983270645142,
  0.7851050496101379,
  0.790803074836731,
  0.8057169318199158,
  0.7876313924789429,
  0.784937858581543,
  0.7816547155380249,
  

In [None]:
# The precision_N / recall_N / f1_N names used for GPT-4 above are rebound
# here to the GPT-3.5 batch results.
metric_names = ('precision', 'recall', 'f1')

# For batch 1
precision_1, recall_1, f1_1 = (results_longformer_cochrane_gpt35_1[m] for m in metric_names)

# For batch 2
precision_2, recall_2, f1_2 = (results_longformer_cochrane_gpt35_2[m] for m in metric_names)

# For batch 3
precision_3, recall_3, f1_3 = (results_longformer_cochrane_gpt35_3[m] for m in metric_names)

# For batch 4
precision_4, recall_4, f1_4 = (results_longformer_cochrane_gpt35_4[m] for m in metric_names)

# For batch 5
precision_5, recall_5, f1_5 = (results_longformer_cochrane_gpt35_5[m] for m in metric_names)

# For batch 6
precision_6, recall_6, f1_6 = (results_longformer_cochrane_gpt35_6[m] for m in metric_names)

# Concatenate the six per-batch lists into one list per metric, in batch order
precision_gpt35 = (
    precision_1 + precision_2 + precision_3 + precision_4 + precision_5 + precision_6
)
recall_gpt35 = recall_1 + recall_2 + recall_3 + recall_4 + recall_5 + recall_6
f1_gpt35 = f1_1 + f1_2 + f1_3 + f1_4 + f1_5 + f1_6

# Bundle the concatenated GPT-3.5 scores in the same layout as a single BERTScore result
results_longformer_cochrane_gpt35 = {
    'precision': precision_gpt35,
    'recall': recall_gpt35,
    'f1': f1_gpt35,
}
results_longformer_cochrane_gpt35

{'precision': [0.7850084900856018,
  0.7923835515975952,
  0.7787514328956604,
  0.7761006355285645,
  0.7823857069015503,
  0.7913665771484375,
  0.7908008694648743,
  0.8075686693191528,
  0.7955304384231567,
  0.7882530689239502,
  0.7954895496368408,
  0.7760149836540222,
  0.8101980686187744,
  0.7823495268821716,
  0.7991343140602112,
  0.7665894627571106,
  0.8091757893562317,
  0.7930967807769775,
  0.7949966192245483,
  0.780876874923706,
  0.7916431427001953,
  0.7881486415863037,
  0.7782250642776489,
  0.7892969846725464,
  0.7883710861206055,
  0.7637688517570496,
  0.7818271517753601,
  0.7898349761962891,
  0.7814518809318542,
  0.7781438231468201,
  0.7903143167495728,
  0.8109456300735474,
  0.7955396771430969,
  0.7908040881156921,
  0.7944298386573792,
  0.7790399789810181,
  0.8547947406768799,
  0.770750880241394,
  0.8115102052688599,
  0.7847480773925781,
  0.7986798286437988,
  0.7950671315193176,
  0.786137044429779,
  0.7766814231872559,
  0.7856732606887817,


In [None]:
import json

# Persist the concatenated GPT-3.5 BERTScore results as JSON
with open('results_longformer_cochrane_gpt35.json', 'w') as f:
    f.write(json.dumps(results_longformer_cochrane_gpt35))

In [None]:
# Persist the concatenated GPT-4 BERTScore results as JSON
with open('results_longformer_cochrane_gpt4.json', 'w') as f:
    f.write(json.dumps(results_longformer_cochrane_gpt4))