Nome: Fabio Grassiotto  
RA: 890441

# Aula 8_9 - Reproduzindo o Visconde

In [1]:
%%capture
%pip install -q torch
%pip install groq
%pip install -U sentence-transformers
%pip install faiss-cpu
%pip install spacy
%pip install pandas
%python -m spacy download en_core_web_sm

In [2]:
import os
import torch
import faiss
import json
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import groq
from groq import Groq
from bs4 import BeautifulSoup
import warnings
warnings.simplefilter('ignore')
from collections import Counter
import string
import re
import spacy

NUM_QUESTIONS = 150

## Setup Groq Library

In [3]:
def load_groq_key():
    try:
        # Open and read the entire content of the file
        with open("groq-key.txt", 'r') as file:
            contents = file.read()
        
        return contents
    
    except FileNotFoundError:
        print(f"The file does not exist.")
        return None
    except Exception as e:
        # Handle other potential exceptions (e.g., permission errors)
        print(f"An error occurred while reading the file: {str(e)}")
        return None
    
groq_key = load_groq_key()
os.environ["GROQ_API_KEY"] = groq_key

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

def groq_chat(content):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {   "role": "system",
                    "content": "You are a helpful reader who reads a list of documents and uses them as evidence to answer a question given by the user. For each example, you will write an explanation for the answer based on the documents and the question. Then, after explaining your reasoning, you will give your final answer, which should be short (less than 10 words)."},
                {
                    "role": "user",
                    "content": content,
                }
            ],
            model="llama3-70b-8192",
        )

    except groq.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)  # an underlying Exception, likely raised within httpx.
    except groq.RateLimitError as e:
        print("A 429 status code was received; we should back off a bit.")
    except groq.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.response)
    
    return chat_completion.choices[0].message.content

## IIRC Dataset

### Download

In [4]:
import os

if not os.path.exists("dataset/iirc_test.json"):
    !wget http://jamesf-incomplete-qa.s3.amazonaws.com/iirc.tar.gz
    !tar -xzf iirc.tar.gz --directory dataset
    !del iirc.tar.gz
    !cd iirc

    !wget http://jamesf-incomplete-qa.s3.amazonaws.com/context_articles.tar.gz
    !tar -xzf context_articles.tar.gz --directory dataset
    !del context_articles.tar.gz

    !wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json
    ! mv iirc_test.json dataset

### Load JSON files

In [5]:
dev_dataset = json.load(open('dataset/iirc/dev.json','r'))
test_dataset  = json.load(open('dataset/iirc_test.json', 'r'))
articles = json.load(open("dataset/context_articles.json",'r'))

In [6]:
print(f'Dataset types: dev_dataset: {type(dev_dataset)}')
print(f'test_dataset: {type(test_dataset)}')
print(f'articles: {type(articles)}')

Dataset types: dev_dataset: <class 'list'>
test_dataset: <class 'list'>
articles: <class 'dict'>


### Select first 150 questions and related documents

In [7]:
# Code adapted from Visconde implementation.
# Added BeautifulSoup to remove html tags.

def grab_documents(test_dataset, articles):
    documents = []
    all_titles = []

    for item in test_dataset[:NUM_QUESTIONS]:

        if item['title'].lower() not in all_titles:
            # clean up html
            soup = BeautifulSoup(item["text"], 'html.parser')
            clean_text = soup.get_text()

            documents.append({
                    "title": item['title'],
                    "content": clean_text
                }
            )
            all_titles.append(item['title'].lower())

        for link in item["links"]:
            if link['target'].lower() in articles and link['target'].lower() not in all_titles:
                # clean up html
                soup = BeautifulSoup(articles[link['target'].lower()], 'html.parser')
                clean_text = soup.get_text()

                documents.append({
                    "title": link['target'],
                    "content": clean_text
                })
                all_titles.append(link['target'].lower())
            #else:
            #    print(link['target'].lower())
        
        return documents, all_titles

In [8]:
texts, titles = grab_documents(test_dataset, articles)

In [9]:
contents_list = []
nlp = spacy.load('en_core_web_sm')

for t in texts:
    paragraph = t.get('content')

    sentences = list(sent.text for sent in nlp(paragraph).sents)
    for s in sentences:
        contents_list.append(s)

In [10]:
contents_list[:5]

['The Palici (Παλικοί in Greek), or Palaci, were a pair of indigenous Sicilian chthonic deities in Roman mythology, and to a lesser extent in Greek mythology.',
 "They are mentioned in Ovid's Metamorphoses V, 406, and in Virgil's Aeneid IX, 585.",
 'Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld.',
 'There was also a shrine to the Palaci in Palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant that an oath could be trusted.',
 'The mythological lineage of the Palici is uncertain; one legend made the Palici the sons of Zeus, or possibly Hephaestus, by Aetna or Thalia, but another claimed that the Palici were the sons of the Sicilian deity Adranus.\n']

### Format questions and answers

In [11]:
# Código adaptado do Leandro Carísio, obrigado!

questions_to_ask = []

for i in range(len(test_dataset)):
  pr = test_dataset[i]['questions'][0]
  question = pr['question']
  answer = pr['answer']
  answer_type = answer['type']

  if answer_type == 'binary' or answer_type == 'value':
    final_answer = answer['answer_value']
  elif answer_type == 'span':
    final_answer = answer['answer_spans'][0]['text']
  elif answer_type == 'none':
    final_answer = 'none'
  else:
    final_answer = 'An error perhaps, bad type'
    print(answer_type)

  questions_to_ask.append({"Question": question, "Answer": final_answer})

In [12]:
questions_to_ask[:3]

[{'Question': 'What is Zeus know for in Greek mythology?',
  'Answer': 'sky and thunder god'},
 {'Question': 'How long had the First World War been over when Messe was named aide-de-camp?',
  'Answer': '5'},
 {'Question': 'How long had Angela Scoular been acting professionally when she appeared in the movie "On Her Majesty\'s Secret Service"?',
  'Answer': '2'}]

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


## Indexing the dataset

### Creating Embeddings with Sentence Transformer

In [14]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.to(device)

embeddings = model.encode(contents_list, show_progress_bar=True)

Batches:   0%|          | 0/75 [00:00<?, ?it/s]

### Indexing with FAISS

In [15]:
# Create a FAISS index from the embeddings
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

### Basic Test 

In [16]:
# Small test
input_sequence = questions_to_ask[40].get('Question')

def SentenceTransformer_getContext(question, base, k):
    xq = model.encode([question])
    _, I = index.search(xq, k)  # search
    str = ""
    for i in range(0, k):
        str = str + base[I[0][i]] + "\n"
    return str
    

print(input_sequence)
print()
print(SentenceTransformer_getContext(input_sequence, contents_list, 5))
print()

How many months was the Northwest Indian War?

The re-conquests marked an end to over 150 years of accommodationist policies with tribal invaders.
Ancient tribes.
In the southeast lie the lower Hyblaean Mountains, 1000 m. The mines of the Enna and Caltanissetta districts were part of a leading sulphur-producing area throughout the 19th century, but have declined since the 1950s.


No evidence survives of any warring between the tribes, but the Sicanians moved eastwards when the Elymians settled in the northwest corner of the island.
They also introduced their own culture, customs, and politics in the region.




## Evaluation comparing with LLama3-70b

### Evaluation Functions - From Visconde implementation

In [17]:
def normalize_answer(s):
  """
  Taken from the official evaluation script for v1.1 of the SQuAD dataset.
  Lower text and remove punctuation, articles and extra whitespace.
  """

  def remove_articles(text):
      return re.sub(r"\b(a|an|the)\b", " ", text)

  def white_space_fix(text):
      return " ".join(text.split())

  def remove_punc(text):
      exclude = set(string.punctuation)
      return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
      return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def token_f1_score(prediction, ground_truth):
  """
  Taken from the official evaluation script for v1.1 of the SQuAD dataset.
  """
  prediction_tokens = normalize_answer(prediction).split()
  ground_truth_tokens = normalize_answer(ground_truth).split()
  common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
  num_same = sum(common.values())
  if num_same == 0:
      return 0
  precision = 1.0 * num_same / len(prediction_tokens)
  recall = 1.0 * num_same / len(ground_truth_tokens)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

### Get responses from LLama3 and compute F1-score

In [None]:
def create_llm_prompt(question):
    prompt = SentenceTransformer_getContext(question, contents_list, 5) + question
    prompt = prompt + "\n" + "Answer:"
    return prompt

In [18]:
# Create dataframe with question, answer, Llama answer and scores
import pandas as pd
df_results = pd.DataFrame(columns=['question', 'answer', 'Llama3 answer', 'F1 score'])

for item in questions_to_ask:
    question = item.get('Question')
    answer = normalize_answer(item.get('Answer'))

    # RAG context using search

    llama3_answer = normalize_answer(create_llm_prompt(prompt))

    f1_score = token_f1_score(llama3_answer, answer)
    
    row = pd.Series([question, answer, llama3_answer, f1_score], index=df_results.columns)
    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)


In [19]:
df_results

Unnamed: 0,question,answer,Llama3 answer,F1 score
0,What is Zeus know for in Greek mythology?,sky and thunder god,zeus in bible zeus in philosophy oracles of ze...,0.071429
1,How long had the First World War been over whe...,5,there was allied invasion of sicily during wor...,0.000000
2,How long had Angela Scoular been acting profes...,2,she was already pregnant with athena however a...,0.000000
3,What is the capacity of the stadium where Brun...,26688,31–78 glennon ja 2007 leipzig main remaining f...,0.000000
4,In which country was Wilhelm Müller born?,germany,max müller attempted to understand indoeuropea...,0.000000
...,...,...,...,...
509,Where is the paper that felt that Witness was ...,london,fifth book focuses on song of muses which desc...,0.000000
510,Which of the three commands that No. 59 Squadr...,none,there was allied invasion of sicily during wor...,0.000000
511,How long was the rule of the Dai Viet emperor ...,33 years,he had been appointed viceroy of emperor to ru...,0.000000
512,How old was the person that defeated Stuart in...,36,charles martindale however in 1849 bourbons re...,0.000000


In [21]:
df_results.sort_values(by=['F1 score'], ascending=False)

Unnamed: 0,question,answer,Llama3 answer,F1 score
164,What's the difference between territories and ...,unlike provinces territories of canada have no...,administratively sicily is divided into nine p...,0.142857
134,Why did President Trump fire FBI director Jame...,dismissing comey relieved unnecessary pressure...,when he did so he was hoisted only halfway up ...,0.100000
506,Which of the two universities in the United St...,university of california,oxford and new york oxford oxford univ oxford ...,0.095238
323,What does the acronym NICE stand for?,national institute for health and care excellence,etymology regional symbols epithets spelling t...,0.086957
204,"Which battle lasted longer, the Battle of Gutt...",battle of leipzig,leipzig münchen leipzig saur 2006 bibliotheca ...,0.083333
...,...,...,...,...
172,How many top 10 finishes did Hut Stricklin hav...,none,anderson ws and ln quartarone kelly wd wood cl...,0.000000
170,What were the names of the three British Musli...,ruhal ahmed,muslim army was then sent to island consisting...,0.000000
169,Who was the author of document that Bordiga co...,karl marx,wauconda bolchazycarducci 2000 montaigne for e...,0.000000
168,How long did the Civil War sparked by the atte...,9,parts of island were reoccupied before revolts...,0.000000
