In [1]:
import pandas as pd
import os
import nltk
from nltk.translate.bleu_score import SmoothingFunction
nltk.download('wordnet')
from rouge_score import rouge_scorer
from copy import deepcopy
from nltk.stem import WordNetLemmatizer
from bert_score import score
import warnings
#warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tomcio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
path_to_questions = '../data_processing_scripts/q_a/questions'
path_to_answers = '../data_processing_scripts/q_a/answers_auto'
path_to_model_answers = '../data_processing_scripts/q_a/model_answers'

In [3]:
file_names = ['Balladyna']

In [14]:
file_names = ['Balladyna', 'Dziady_(Mickiewicz)', 'Konrad_Wallenrod',
               'Kordian', 'Lalka_(Prus)', 'Ogniem_i_mieczem', 'Pan_Tadeusz_(wyd._1834)',
                 'Pan_Wołodyjowski', 'Potop_(Sienkiewicz)', 'Quo_vadis', 'Sonety_Adama_Mickiewicza']


In [11]:
def calculate_bert_score_results(df_questions, df_answers, df_model_answers):
    BERT_scores = []
    
    for i in range(len(df_questions)):
        hypothesis = df_answers.iloc[:,0][i]
        reference = df_model_answers.iloc[:,0][i]


        P, R, F1 = score([hypothesis], [reference], lang="pl", verbose=False)

        BERT_scores.append(float(F1))

    return BERT_scores

In [6]:
def calculate_BLEU_results(df_questions, df_answers, df_model_answers):
    BLEU_scores = []
    
    for i in range(len(df_questions)):
        hypothesis = df_answers.iloc[:,0][i].split(" ")
        reference = df_model_answers.iloc[:,0][i]
        reference = [i.split(" ") for i in reference]

        #print(hypothesis)
        #print(reference)
        #print('--------------')


        BLEUscore = nltk.translate.bleu_score.sentence_bleu(reference, hypothesis, weights=([1]))
        BLEU_scores.append(BLEUscore)

    return BLEU_scores

In [7]:
def calculate_METEOR_results(df_questions, df_answers, df_model_answers):
    METEOR_scores = []
    
    for i in range(len(df_questions)):
        hypothesis = df_answers.iloc[:,0][i].split(" ")
        reference = df_model_answers.iloc[:,0][i]

        METEORscore = nltk.translate.meteor_score.meteor_score([reference], hypothesis)
        METEOR_scores.append(METEORscore)

    return METEOR_scores

In [8]:
def calculate_ROUGE_results(df_questions, df_answers, df_model_answers):
    ROGUE_scores = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for i in range(len(df_questions)):
        hypothesis = df_answers.iloc[:,0][i]
        reference = df_model_answers.iloc[:,0][i]

        cur_rouge_score = []

        for ref in reference:
            ROGUEscore = scorer.score(ref, hypothesis)
            cur_rouge_score.append(ROGUEscore)

        # Initialize variables to store highest scores
        max_rouge1 = {'score': 0, 'rouge': None}
        max_rouge2 = {'score': 0, 'rouge': None} 
        max_rougeL = {'score': 0, 'rouge': None}

        # Iterate through scores
        for i, score in enumerate(cur_rouge_score):
            rouge1_fmeasure = score['rouge1'].fmeasure
            rouge2_fmeasure = score['rouge2'].fmeasure
            rougeL_fmeasure = score['rougeL'].fmeasure
            
            if rouge1_fmeasure >= max_rouge1['score']:
                max_rouge1['score'] = rouge1_fmeasure
                max_rouge1['rouge'] = score['rouge1']
                
            if rouge2_fmeasure >= max_rouge2['score']:
                max_rouge2['score'] = rouge2_fmeasure
                max_rouge2['rouge'] = score['rouge2']
                
            if rougeL_fmeasure >= max_rougeL['score']:
                max_rougeL['score'] = rougeL_fmeasure
                max_rougeL['rouge'] = score['rougeL']
    

        ROGUE_scores.append({'rouge1':max_rouge1['rouge'], 'rouge2':max_rouge2['rouge'], 'rougeL':max_rougeL['rouge']})

    return ROGUE_scores

In [19]:
def calculate_metrics(model=None):
    if model is None:
        return 'pass the model name variable, named like the result files'
    
    BLEU = {}
    METEOR = {}
    ROUGE = {}
    BERT = {}

    for file in file_names:
        print(file)
        df_questions = pd.read_table(path_to_questions + '/' + file + '_questions.txt', sep='\t', header=None)
        with open(path_to_answers + '/' + file + '/anserws_' + model + '.txt', 'r', encoding='utf-8') as f:
            content = f.read().split('===')
        df_answers = pd.DataFrame(content, columns=['Answer'])
        df_answers['Answer'] = df_answers['Answer'].str.strip("\n")
        #df_answers = pd.read_table(path_to_answers + '/' + file + '/answers_' + model + '.txt', lineterminator='===', header=None)
        df_model_answers = pd.read_table(path_to_model_answers + '/' + file + '_paraphrased.txt', sep="\t", header=None)
        df_model_answers.iloc[:,0] = df_model_answers.iloc[:,0].str.split('\n')

        BLEU_result = calculate_BLEU_results(df_questions, df_answers, df_model_answers)
        #print(BLEU_result)

        METEOR_result = calculate_METEOR_results(df_questions, df_answers, df_model_answers)
        #print(METEOR_result)

        # TO ZWRACA DICTA XDDDDD
        ROUGE_result = calculate_ROUGE_results(df_questions, df_answers, df_model_answers)
        #print(ROUGE_result)

        BERT_result = calculate_bert_score_results(df_questions, df_answers, df_model_answers)

        BLEU[file] = BLEU_result
        METEOR[file] = METEOR_result
        ROUGE[file] = ROUGE_result
        BERT[file] = BERT_result

    ROUGE_1 = deepcopy(ROUGE)
    ROUGE_2 = deepcopy(ROUGE)
    ROUGE_L = deepcopy(ROUGE)

    for key, val in ROUGE.items():
        ROUGE_1[key] = [val[i]['rouge1'].fmeasure for i in range(len(val))]
        ROUGE_2[key] = [val[i]['rouge2'].fmeasure for i in range(len(val))]
        ROUGE_L[key] = [val[i]['rougeL'].fmeasure for i in range(len(val))]

    for file in file_names:
        print(f"{file} & {sum(BLEU[file]) / 100:.3f} & {sum(METEOR[file]) / 100:.3f} & {sum(ROUGE_1[file]) / 100:.3f} & {sum(BERT[file]) / 100:.3f} \\\\")
        print("\hline")

In [15]:
calculate_metrics(model='qwen')

Balladyna




Dziady_(Mickiewicz)




Konrad_Wallenrod
Kordian
Lalka_(Prus)




Ogniem_i_mieczem




Pan_Tadeusz_(wyd._1834)
Pan_Wołodyjowski
Potop_(Sienkiewicz)
Quo_vadis
Sonety_Adama_Mickiewicza
Balladyna & 0.252 & 0.000 & 0.310 & 0.751 \\
\hline
Dziady_(Mickiewicz) & 0.155 & 0.000 & 0.221 & 0.705 \\
\hline
Konrad_Wallenrod & 0.158 & 0.000 & 0.234 & 0.724 \\
\hline
Kordian & 0.180 & 0.000 & 0.251 & 0.732 \\
\hline
Lalka_(Prus) & 0.305 & 0.000 & 0.368 & 0.772 \\
\hline
Ogniem_i_mieczem & 0.038 & 0.000 & 0.097 & 0.660 \\
\hline
Pan_Tadeusz_(wyd._1834) & 0.093 & 0.000 & 0.138 & 0.676 \\
\hline
Pan_Wołodyjowski & 0.149 & 0.000 & 0.235 & 0.726 \\
\hline
Potop_(Sienkiewicz) & 0.124 & 0.000 & 0.191 & 0.699 \\
\hline
Quo_vadis & 0.135 & 0.000 & 0.215 & 0.721 \\
\hline
Sonety_Adama_Mickiewicza & 0.085 & 0.000 & 0.135 & 0.674 \\
\hline


In [16]:
calculate_metrics(model='llama')

Balladyna




Dziady_(Mickiewicz)




Konrad_Wallenrod
Kordian
Lalka_(Prus)




Ogniem_i_mieczem




Pan_Tadeusz_(wyd._1834)
Pan_Wołodyjowski




Potop_(Sienkiewicz)
Quo_vadis
Sonety_Adama_Mickiewicza




Balladyna & 0.068 & 0.000 & 0.111 & 0.613 \\
\hline
Dziady_(Mickiewicz) & 0.074 & 0.000 & 0.109 & 0.625 \\
\hline
Konrad_Wallenrod & 0.067 & 0.000 & 0.107 & 0.620 \\
\hline
Kordian & 0.065 & 0.000 & 0.105 & 0.625 \\
\hline
Lalka_(Prus) & 0.071 & 0.000 & 0.113 & 0.611 \\
\hline
Ogniem_i_mieczem & 0.015 & 0.000 & 0.035 & 0.567 \\
\hline
Pan_Tadeusz_(wyd._1834) & 0.049 & 0.000 & 0.073 & 0.598 \\
\hline
Pan_Wołodyjowski & 0.042 & 0.000 & 0.080 & 0.598 \\
\hline
Potop_(Sienkiewicz) & 0.044 & 0.000 & 0.078 & 0.596 \\
\hline
Quo_vadis & 0.044 & 0.000 & 0.080 & 0.599 \\
\hline
Sonety_Adama_Mickiewicza & 0.068 & 0.000 & 0.093 & 0.610 \\
\hline


In [17]:
calculate_metrics(model='bielik')

Balladyna




Dziady_(Mickiewicz)




Konrad_Wallenrod
Kordian
Lalka_(Prus)




Ogniem_i_mieczem




Pan_Tadeusz_(wyd._1834)
Pan_Wołodyjowski
Potop_(Sienkiewicz)
Quo_vadis
Sonety_Adama_Mickiewicza
Balladyna & 0.107 & 0.000 & 0.155 & 0.678 \\
\hline
Dziady_(Mickiewicz) & 0.085 & 0.000 & 0.134 & 0.667 \\
\hline
Konrad_Wallenrod & 0.084 & 0.000 & 0.130 & 0.668 \\
\hline
Kordian & 0.082 & 0.000 & 0.124 & 0.666 \\
\hline
Lalka_(Prus) & 0.111 & 0.000 & 0.145 & 0.676 \\
\hline
Ogniem_i_mieczem & 0.033 & 0.000 & 0.064 & 0.636 \\
\hline
Pan_Tadeusz_(wyd._1834) & 0.059 & 0.000 & 0.086 & 0.644 \\
\hline
Pan_Wołodyjowski & 0.071 & 0.000 & 0.112 & 0.672 \\
\hline
Potop_(Sienkiewicz) & 0.075 & 0.000 & 0.122 & 0.666 \\
\hline
Quo_vadis & 0.110 & 0.000 & 0.170 & 0.693 \\
\hline
Sonety_Adama_Mickiewicza & 0.073 & 0.000 & 0.120 & 0.662 \\
\hline


In [20]:
calculate_metrics(model='mistral')

Balladyna




Dziady_(Mickiewicz)




Konrad_Wallenrod
Kordian
Lalka_(Prus)




Ogniem_i_mieczem




Pan_Tadeusz_(wyd._1834)
Pan_Wołodyjowski
Potop_(Sienkiewicz)
Quo_vadis
Sonety_Adama_Mickiewicza
Balladyna & 0.246 & 0.000 & 0.310 & 0.732 \\
\hline
Dziady_(Mickiewicz) & 0.142 & 0.000 & 0.196 & 0.703 \\
\hline
Konrad_Wallenrod & 0.145 & 0.000 & 0.218 & 0.710 \\
\hline
Kordian & 0.136 & 0.000 & 0.207 & 0.706 \\
\hline
Lalka_(Prus) & 0.248 & 0.000 & 0.315 & 0.739 \\
\hline
Ogniem_i_mieczem & 0.026 & 0.000 & 0.070 & 0.641 \\
\hline
Pan_Tadeusz_(wyd._1834) & 0.097 & 0.000 & 0.155 & 0.679 \\
\hline
Pan_Wołodyjowski & 0.139 & 0.000 & 0.220 & 0.711 \\
\hline
Potop_(Sienkiewicz) & 0.117 & 0.000 & 0.183 & 0.689 \\
\hline
Quo_vadis & 0.125 & 0.000 & 0.200 & 0.704 \\
\hline
Sonety_Adama_Mickiewicza & 0.110 & 0.000 & 0.188 & 0.698 \\
\hline
