<a href="https://colab.research.google.com/github/dml2611/HindiMRC/blob/main/automatic_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
## **Rouge**

In [None]:
# Install rouge
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
# Import the libraries
import pandas as pd
from rouge import Rouge
from csv import DictWriter

In [None]:
# Mount the google drive
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Initialise the data input and output paths
data_path = '/content/gdrive/MyDrive/Colab Notebooks/Lancaster University/GPT3.5 - Hindi MRC /'

In [None]:
gpt3_all = pd.read_excel(data_path + "gpt_output.xlsx", sheet_name='H4 GPT')
gpt4_all = pd.read_excel(data_path + "gpt_output.xlsx", sheet_name='H4 GPT4')
llama_all = pd.read_excel(data_path + "gpt_output.xlsx", sheet_name='H4 Llama')
hindi_all = pd.read_excel(data_path + "gpt_output.xlsx", sheet_name='H4 Hindi GPT')

reference = pd.read_excel(data_path + "data.xlsx" , sheet_name='Sheet2')

In [None]:
gpt3 = gpt3_all['Answer 2']
gpt4 = gpt4_all['Answer 4']
llama = llama_all['Answer 1']
hindi = hindi_all['Answer 3']

ref = reference['Answer']

In [None]:
len(ref)

40

In [None]:
# Define the function for calculating the Rouge Scores
def calc_rouge(answers, name):
  """Print and return the averaged ROUGE scores of `answers` against `ref`.

  Relies on two notebook-level globals: `rouge` (a `Rouge` instance) and
  `ref` (the reference answers).

  Args:
    answers: Model answers, aligned one-to-one with `ref`.
    name: Label printed above the scores.

  Returns:
    The dict returned by `rouge.get_scores(..., avg=True)`, keyed by
    'rouge-1', 'rouge-2' and 'rouge-l'.
  """
  # Rouge.get_scores takes (hypotheses, references) in that order. Passing the
  # references first swaps precision and recall in the returned dict.
  scores = rouge.get_scores(answers, ref, avg=True)
  print(name)
  rouge1_f1 = scores['rouge-1']['f']
  rouge2_f1 = scores['rouge-2']['f']
  rougeL_f1 = scores['rouge-l']['f']
  print(f'Rouge1-F1: {rouge1_f1}')
  print(f'Rouge2-F1: {rouge2_f1}')
  print(f'RougeL-F1:  {rougeL_f1}')
  print("\n")
  return scores

In [None]:
# Calculate Rouge
rouge = Rouge()

rouge_gpt3 = calc_rouge(gpt3, "GPT3.5")
rouge_gpt4 = calc_rouge(gpt4, "GPT4")
rouge_hindi = calc_rouge(hindi, "Hindi GPT")
rouge_llama  = calc_rouge(llama, "Llama3")

GPT3.5
Rouge1-F1: 0.5397915163553433
Rouge2-F1: 0.4045104501297396
RougeL-F1:  0.5101213236943313


GPT4
Rouge1-F1: 0.5115401653495586
Rouge2-F1: 0.4011000523385893
RougeL-F1:  0.4939012584615249


Hindi GPT
Rouge1-F1: 0.5404272927190086
Rouge2-F1: 0.40420088456582226
RougeL-F1:  0.5154521855716325


Llama3
Rouge1-F1: 0.5325626655749031
Rouge2-F1: 0.4326076750201646
RougeL-F1:  0.5164595383336685




---
## **Bleu**

In [None]:
#Install nltk and bleu
!pip install nltk
!pip install bleu

Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting efficiency (from bleu)
  Downloading efficiency-2.0-py3-none-any.whl.metadata (2.5 kB)
Downloading efficiency-2.0-py3-none-any.whl (32 kB)
Building wheels for collected packages: bleu
  Building wheel for bleu (setup.py) ... [?25l[?25hdone
  Created wheel for bleu: filename=bleu-0.3-py3-none-any.whl size=5781 sha256=6cc83d3f770a3b3490c2cef9668592638c828edc05617bf0842baabb917a7d51
  Stored in directory: /root/.cache/pip/wheels/c6/d8/d1/009abe01b8b2c6a14c62d197b510b3cc1076014c22d712c5ce
Successfully built bleu
Installing collected packages: efficiency, bleu
Successfully installed bleu-0.3 efficiency-2.0


In [None]:
# Import the libraries
import numpy as np
import nltk
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from bleu import multi_file_bleu, multi_list_bleu

In [None]:
# Define the function for calculating the bleu scores
def calc_bleu(model, name):
  """Print and return the mean sentence-level unigram BLEU of `model` vs `ref`.

  Relies on the notebook-level global `ref` (the reference answers).

  Args:
    model: Model answers, indexable by the same integer labels as `ref`.
    name: Label printed next to the score.

  Returns:
    The arithmetic mean of the per-answer BLEU-1 scores.
  """
  scores = []
  for i in range(len(ref)):
    # sentence_bleu expects a *list of tokenised references* and a tokenised
    # hypothesis. Passing raw strings makes it iterate over characters, so
    # both sides are split into whitespace-delimited words first.
    reference_tokens = ref[i].split()
    candidate_tokens = model[i].split()
    scores.append(sentence_bleu([reference_tokens], candidate_tokens, weights = (1,0,0,0)))
  score = sum(scores) / len(scores)
  print(name, "Bleu Score = ", score)
  return score

In [None]:
# Calculate Bleu
bleu_gpt3 = calc_bleu(gpt3, "GPT3.5")
bleu_gpt4 = calc_bleu(gpt4, "GPT4")
bleu_hindi = calc_bleu(hindi, "Hindi GPT")
bleu_llama  = calc_bleu(llama, "Llama3")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


GPT3.5 Bleu Score =  0.34779141198077984
GPT4 Bleu Score =  0.3171056535698914
Hindi GPT Bleu Score =  0.3578056465881177
Llama3 Bleu Score =  0.3725709557162986


---
## **Bleurt**

In [None]:
# Install BLEURT
!pip install --upgrade pip  # ensures that pip is current
!git clone https://github.com/google-research/bleurt.git

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1
Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 18.81 MiB/s, done.
Resolving deltas: 100% (49/49), done.


In [None]:
cd bleurt

/content/bleurt


In [None]:
pip install .

Processing /content/bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456764 sha256=eb4426ce95d95c5390e886670a4c7f4f6e62034cdb99bbc0ffbc3301ff7f5972
  Stored in directory: /tmp/pip-ephem-wheel-cache-xqkmalwt/wheels/92/4f/fb/afa555fa27aa9e2c7958df797a62cc4e74f0f459cec9c4fa7c
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


In [None]:
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .

--2024-11-07 15:45:21--  https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.153.207, 142.250.145.207, 74.125.128.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.153.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2140294207 (2.0G) [application/octet-stream]
Saving to: ‘BLEURT-20.zip’


2024-11-07 15:47:16 (17.9 MB/s) - ‘BLEURT-20.zip’ saved [2140294207/2140294207]

--2024-11-07 15:47:16--  http://./
Resolving . (.)... failed: No address associated with hostname.
wget: unable to resolve host address ‘.’
FINISHED --2024-11-07 15:47:16--
Total wall clock time: 1m 55s
Downloaded: 1 files, 2.0G in 1m 54s (17.9 MB/s)


In [None]:
!unzip BLEURT-20.zip

Archive:  BLEURT-20.zip
   creating: BLEURT-20/
  inflating: BLEURT-20/bert_config.json  
  inflating: BLEURT-20/saved_model.pb  
   creating: BLEURT-20/variables/
  inflating: BLEURT-20/variables/variables.index  
  inflating: BLEURT-20/variables/variables.data-00000-of-00001  
  inflating: BLEURT-20/sent_piece.vocab  
  inflating: BLEURT-20/bleurt_config.json  
  inflating: BLEURT-20/sent_piece.model  


In [None]:
!python -m bleurt.score_files \
  -candidate_file=bleurt/test_data/candidates \
  -reference_file=bleurt/test_data/references \
  -bleurt_checkpoint=BLEURT-20

2024-11-07 15:48:02.316500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 15:48:02.338847: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 15:48:02.345419: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 15:48:02.363019: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:tensorflow:Running BLEURT scoring.
I1107 15:48:0

In [None]:
# Import libraries
import pandas as pd
from bleurt import score

In [None]:
def calc_bleurt(answers, name):
  """Print and return the mean BLEURT score of `answers` against `ref`.

  Relies on two notebook-level globals: `scorer` (the BLEURT scorer object)
  and `ref` (the reference answers).

  Args:
    answers: Candidate answers passed to the scorer as `candidates`.
    name: Label printed next to the score.

  Returns:
    The arithmetic mean of the per-pair scores returned by `scorer.score`.
  """
  per_pair_scores = scorer.score(references=ref, candidates=answers)
  mean_score = sum(per_pair_scores) / len(per_pair_scores)
  print(name, "Bleurt Score = ", mean_score)
  return mean_score

In [None]:
# Calculating Bleurt Scores
# Score with the BLEURT-20 checkpoint that was downloaded and unzipped above,
# not the small `bleurt/test_checkpoint` bundled with the repository.
# The path is relative to the current working directory (the cloned `bleurt`
# folder, where BLEURT-20.zip was extracted).
checkpoint = "BLEURT-20"
scorer = score.BleurtScorer(checkpoint)

bleurt_gpt3 = calc_bleurt(gpt3, "GPT3.5")
bleurt_gpt4 = calc_bleurt(gpt4, "GPT4")
bleurt_hindi = calc_bleurt(hindi, "Hindi GPT")
bleurt_llama  = calc_bleurt(llama, "Llama3")

GPT3.5 Bleurt Score =  0.5296638231724501
GPT4 Bleurt Score =  0.43066286779940127
Hindi GPT Bleurt Score =  0.4971300791949034
Llama3 Bleurt Score =  0.4578865200281143


---
## **Meteor**

In [None]:
# Install nltk
!pip install nltk



In [None]:
# Import libraries
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize

import pandas as pd
from csv import DictWriter
from nltk.translate.meteor_score import single_meteor_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Define the function for calculating the meteor scores
def calc_meteor(answers, name):
  """Print and return the mean METEOR score of `answers` against `ref`.

  Relies on the notebook-level global `ref` (the reference answers).

  Args:
    answers: Model answers, indexable by the same integer labels as `ref`.
    name: Label printed next to the score.

  Returns:
    The arithmetic mean of the per-answer METEOR scores, each rounded to four
    decimal places before averaging.
  """
  per_answer = [
      round(
          single_meteor_score(word_tokenize(ref[idx]), word_tokenize(answers[idx])),
          4,
      )
      for idx in range(len(ref))
  ]
  mean_score = sum(per_answer) / len(per_answer)
  print(name, 'Meteor Scores = ', mean_score)
  return mean_score

In [None]:
# Calculating Meteor Scores
meteor_gpt3 = calc_meteor(gpt3, "GPT3.5")
meteor_gpt4 = calc_meteor(gpt4, "GPT4")
meteor_hindi = calc_meteor(hindi, "Hindi GPT")
meteor_llama  = calc_meteor(llama, "Llama3")

GPT3.5 Meteor Scores =  0.515045
GPT4 Meteor Scores =  0.5161150000000001
Hindi GPT Meteor Scores =  0.5074575
Llama3 Meteor Scores =  0.5079175


---
## **Cosine Similarity using FastText**

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296188 sha256=956463ede744d8075060935cfcae98eb38364e46889333fb22e008cc244d91f9
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6


In [None]:
import math

In [None]:
vector_path = "/content/gdrive/MyDrive/Colab Notebooks/Hindi Comprehension/Answer Extraction/"

In [None]:
import fasttext
import fasttext.util
ft = fasttext.load_model(vector_path + 'wiki.hi.bin')

In [None]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        vector1: First sequence of numbers.
        vector2: Second sequence of numbers.

    Returns:
        The dot product divided by the product of the Euclidean norms, or 0
        when that product is zero (e.g. either vector is all zeros or empty).
    """
    numerator = sum(a*b for a, b in zip(vector1, vector2))
    norm1 = math.sqrt(sum(a**2 for a in vector1))
    norm2 = math.sqrt(sum(b**2 for b in vector2))
    denominator = norm1 * norm2
    # Guard against division by zero for zero-length or all-zero vectors.
    if not denominator:
        return 0
    return numerator/denominator

In [None]:
# Define the function for calculating the FastText cosine similarity scores
def calc_cos(answers, name):
  """Print and return the mean FastText cosine similarity of `answers` vs `ref`.

  Relies on notebook-level globals: `ft` (the loaded fastText model), `ref`
  (the reference answers) and `cosine_similarity`.

  Args:
    answers: Model answers, indexable by the same integer labels as `ref`.
    name: Label printed next to the score.

  Returns:
    The arithmetic mean of the per-answer cosine similarities between the
    sentence vectors of each reference and its answer.
  """
  similarities = []
  for idx in range(len(ref)):
    # Newlines are replaced by spaces before embedding each text.
    reference_vector = ft.get_sentence_vector(ref[idx].replace('\n', ' '))
    answer_vector = ft.get_sentence_vector(answers[idx].replace('\n', ' '))
    similarities.append(cosine_similarity(reference_vector, answer_vector))
  mean_similarity = sum(similarities) / len(similarities)
  print(name, 'Cosine Similarity = ', mean_similarity)
  return mean_similarity

In [None]:
# Calculating Cosine Similarity Scores
cos_gpt3 = calc_cos(gpt3, "GPT3.5")
cos_gpt4 = calc_cos(gpt4, "GPT4")
cos_hindi = calc_cos(hindi, "Hindi GPT")
cos_llama  = calc_cos(llama, "Llama3")

GPT3.5 Cosine Similarity =  0.9223463013718695
GPT4 Cosine Similarity =  0.9233784419644875
Hindi GPT Cosine Similarity =  0.9225622604946482
Llama3 Cosine Similarity =  0.9139167658985897


### **End**
---