Nome: Fabio Grassiotto  
RA: 890441

# Aula 8_9 - Reproduzindo o Visconde

In [1]:
%%capture
%pip install -q torch
%pip install groq
%pip install -U sentence-transformers
%pip install faiss-cpu
%pip install spacy
%pip install pandas
%python -m spacy download en_core_web_sm

## Setup Environment

### Imports

In [2]:
import os
import sys
import torch
import faiss
import json
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import groq
from groq import Groq
from bs4 import BeautifulSoup
import warnings
warnings.simplefilter('ignore')
from collections import Counter
import string
import re
import spacy
from tqdm import tqdm

### Global Variables

In [3]:
NUM_QUESTIONS = 150
model_name = "sentence-transformers/msmarco-distilbert-dot-v5"

### Collab Env Setup and GPU Device

In [3]:
# Colab environment
IN_COLAB = 'google.colab' in sys.modules

if (IN_COLAB):
    # Google Drive
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    project_folder="/content/drive/MyDrive/Classes/IA024/Aula_8_9"
    os.chdir(project_folder)
    !ls -la

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


### Setup Groq Library

In [4]:
def load_groq_key():
    try:
        # Open and read the entire content of the file
        with open("groq-key.txt", 'r') as file:
            contents = file.read()
        
        return contents
    
    except FileNotFoundError:
        print(f"The file does not exist.")
        return None
    except Exception as e:
        # Handle other potential exceptions (e.g., permission errors)
        print(f"An error occurred while reading the file: {str(e)}")
        return None
    
groq_key = load_groq_key()
os.environ["GROQ_API_KEY"] = groq_key

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

def groq_chat(content):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": content,
                }
            ],
            model="llama3-70b-8192",
        )

    except groq.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)  # an underlying Exception, likely raised within httpx.
    except groq.RateLimitError as e:
        print("A 429 status code was received; we should back off a bit.")
    except groq.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.response)
    
    return chat_completion.choices[0].message.content

## IIRC Dataset

### Download

In [5]:
import os

if not os.path.exists("dataset/iirc_test.json"):
    !wget http://jamesf-incomplete-qa.s3.amazonaws.com/iirc.tar.gz
    !tar -xzf iirc.tar.gz --directory dataset
    !del iirc.tar.gz
    !cd iirc

    !wget http://jamesf-incomplete-qa.s3.amazonaws.com/context_articles.tar.gz
    !tar -xzf context_articles.tar.gz --directory dataset
    !del context_articles.tar.gz

    !wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json
    ! mv iirc_test.json dataset

### Load JSON files

In [6]:
dev_dataset = json.load(open('dataset/iirc/dev.json','r'))
test_dataset  = json.load(open('dataset/iirc_test.json', 'r'))
articles = json.load(open("dataset/context_articles.json",'r'))

In [7]:
print(f'Dataset types: dev_dataset: {type(dev_dataset)}')
print(f'test_dataset: {type(test_dataset)}')
print(f'articles: {type(articles)}')

Dataset types: dev_dataset: <class 'list'>
test_dataset: <class 'list'>
articles: <class 'dict'>


### Select first 150 questions and related documents

In [8]:
# Code adapted from Visconde implementation.
# Added BeautifulSoup to remove html tags.

def grab_documents(test_dataset, articles):
    documents = []
    all_titles = []

    for item in test_dataset[:NUM_QUESTIONS]:

        if item['title'].lower() not in all_titles:
            # clean up html
            soup = BeautifulSoup(item["text"], 'html.parser')
            clean_text = soup.get_text()

            documents.append({
                    "title": item['title'],
                    "content": clean_text
                }
            )
            all_titles.append(item['title'].lower())

        for link in item["links"]:
            if link['target'].lower() in articles and link['target'].lower() not in all_titles:
                # clean up html
                soup = BeautifulSoup(articles[link['target'].lower()], 'html.parser')
                clean_text = soup.get_text()

                documents.append({
                    "title": link['target'],
                    "content": clean_text
                })
                all_titles.append(link['target'].lower())
            #else:
            #    print(link['target'].lower())
        
        return documents, all_titles

In [9]:
texts, titles = grab_documents(test_dataset, articles)

In [10]:
contents_list = []
nlp = spacy.load('en_core_web_sm')

for t in texts:
    paragraph = t.get('content')

    sentences = list(sent.text for sent in nlp(paragraph).sents)
    for s in sentences:
        contents_list.append(s)

In [11]:
contents_list[:5]

['The Palici (Παλικοί in Greek), or Palaci, were a pair of indigenous Sicilian chthonic deities in Roman mythology, and to a lesser extent in Greek mythology.',
 "They are mentioned in Ovid's Metamorphoses V, 406, and in Virgil's Aeneid IX, 585.",
 'Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld.',
 'There was also a shrine to the Palaci in Palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant that an oath could be trusted.',
 'The mythological lineage of the Palici is uncertain; one legend made the Palici the sons of Zeus, or possibly Hephaestus, by Aetna or Thalia, but another claimed that the Palici were the sons of the Sicilian deity Adranus.\n']

### Format questions and answers

In [12]:
# Código adaptado do Leandro Carísio, obrigado!

questions_to_ask = []

for i in range(len(test_dataset)):
  pr = test_dataset[i]['questions'][0]
  question = pr['question']
  answer = pr['answer']
  answer_type = answer['type']

  if answer_type == 'binary' or answer_type == 'value':
    final_answer = answer['answer_value']
  elif answer_type == 'span':
    final_answer = answer['answer_spans'][0]['text']
  elif answer_type == 'none':
    final_answer = 'none'
  else:
    final_answer = 'An error perhaps, bad type'
    print(answer_type)

  questions_to_ask.append({"Question": question, "Answer": final_answer})


In [13]:
questions_to_ask = questions_to_ask[:150]

In [14]:
len(questions_to_ask)

150

## Indexing the dataset

### Creating Embeddings with Sentence Transformer

In [16]:
model = SentenceTransformer(model_name)
model.to(device)

embeddings = model.encode(contents_list, show_progress_bar=True)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Batches:   0%|          | 0/75 [00:00<?, ?it/s]

### Indexing with FAISS

In [17]:
# Create a FAISS index from the embeddings
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

### Basic Test 

In [18]:
# Small test
input_sequence = questions_to_ask[40].get('Question')

def SentenceTransformer_getContext(question, base, k):
    xq = model.encode([question])
    _, I = index.search(xq, k)  # search
    str = ""
    for i in range(0, k):
        str = str + base[I[0][i]] + "\n"
    return str
    

print(input_sequence)
print()
print(SentenceTransformer_getContext(input_sequence, contents_list, 5))
print()

How many months was the Northwest Indian War?

The island remained under Roman rule until 469.
Trojan War and aftermath.
It began erupting in 1900 and erupted periodically for four years until a landslide changed the local water table.
31–78.
- Glennon, J.A. (2007).
The resulting mythological "history of the world" may be divided into three or four broader periods:
1.




## Evaluation

### Evaluation Functions

All code in this section comes from the Visconde implementation.  
https://github.com/neuralmind-ai/visconde/blob/main/qasper_evaluator.py

In [19]:
def normalize_answer(s):
  """
  Taken from the official evaluation script for v1.1 of the SQuAD dataset.
  Lower text and remove punctuation, articles and extra whitespace.
  """

  def remove_articles(text):
      return re.sub(r"\b(a|an|the)\b", " ", text)

  def white_space_fix(text):
      return " ".join(text.split())

  def remove_punc(text):
      exclude = set(string.punctuation)
      return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
      return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def token_f1_score(prediction, ground_truth):
  """
  Taken from the official evaluation script for v1.1 of the SQuAD dataset.
  """
  prediction_tokens = normalize_answer(prediction).split()
  ground_truth_tokens = normalize_answer(ground_truth).split()
  common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
  num_same = sum(common.values())
  if num_same == 0:
      return 0
  precision = 1.0 * num_same / len(prediction_tokens)
  recall = 1.0 * num_same / len(ground_truth_tokens)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

### Get responses from LLama3 and compute scores

In [20]:
def create_llm_prompt(question):
    template = """Your task is read the sentences I will write below and reply to the question with a single sentence of no more than 10 words.
If you cannot answer, please reply 'none'.
These are the sentences:
{0}
This is my question: {1}
Answer:"""
    sentences = SentenceTransformer_getContext(question, contents_list, 5)
    return template.format(sentences, question)

### Testing prompt

In [21]:
question = questions_to_ask[0].get('Question')
print(create_llm_prompt(question))

Your task is read the sentences I will write below and reply to the question with a single sentence of no more than 10 words.
                  If you cannot answer, please reply 'none'.
These are the sentences:
Zeus in the Bible.
Zeus and the sun.
Zeus in philosophy.
Zeus is the Greek continuation of *, the name of the Proto-Indo-European god of the daytime sky, also called *' ("Sky Father").
Zeus and Hera.

This is my question: What is Zeus know for in Greek mythology?
Answer:


In [26]:
# Create dataframe with question, answer, Llm answer and scores
import pandas as pd
df_results = pd.DataFrame(columns=['question', 'answer', 'LLM answer', 'F1', 'Exact Match'])

for item in tqdm(questions_to_ask):
    question = item.get('Question')
    answer = normalize_answer(item.get('Answer'))

    # RAG context using search
    llm_answer = normalize_answer(groq_chat(create_llm_prompt(question)))
    f1_score = token_f1_score(llm_answer, answer)
    e_match_score = 1 if llm_answer == answer else 0
    
    row = pd.Series([question, answer, llm_answer, f1_score, e_match_score], index=df_results.columns)
    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)


100%|██████████| 150/150 [06:01<00:00,  2.41s/it]


### Results

In [27]:
df_results

Unnamed: 0,question,answer,LLM answer,F1,Exact Match
0,What is Zeus know for in Greek mythology?,sky and thunder god,zeus is known for being king of gods,0,0
1,How long had the First World War been over whe...,5,none,0,0
2,How long had Angela Scoular been acting profes...,2,none,0,0
3,What is the capacity of the stadium where Brun...,26688,none,0,0
4,In which country was Wilhelm Müller born?,germany,none,0,0
...,...,...,...,...,...
145,Which of the destinations had the largest popu...,none,none,1.0,1
146,Was Rao alive when Manmohan Singh was prime mi...,yes,none,0,0
147,Who was in charge of the London County Council...,municipal reformers,none,0,0
148,How many years passed between the sack of Cons...,4,none,0,0


In [28]:
df_results.to_csv("results/rag_results.csv")

In [29]:
print("Metrics:")
print(f"F1 score: Avg: {df_results['F1'].mean():.2f}. Std: {df_results['F1'].std():.2f}")
print(f"Exact Match score: Avg: {df_results['Exact Match'].mean():.2f}. Std: {df_results['Exact Match'].std():.2f}")

Metrics:
F1 score: Avg: 0.29. Std: 0.45
Exact Match score: Avg: 0.28. Std: 0.45
