# A Straits Times Fact Uncapper

In [None]:
%%bash
pip install -q --upgrade pip
pip install -q accelerate bitsandbytes transformers SentencePiece evaluate bert_score
pip install -q farm-haystack[colab,preprocessing,elasticsearch,inference]

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 520.4/520.4 kB 5.8 MB/s eta 0:00:00




In [None]:

import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import LlamaTokenizer

import pandas as pd
import re

In [None]:
RUN_RAW_PREDICT =  False # Rerun raw LLM predictions
RUN_RAW_PREDICT_2 = False # Structured CoT

# Part 0: Load LLM and Prepare Documents

## Loading the Model
For this project, we use Vicuna-7B, loaded in 8-bit precision using `bitsandbytes`.

In [None]:
# Hugging Face checkpoint used for every generation in this notebook.
model_name = "lmsys/vicuna-7b-v1.5"
# model_name = "mistralai/Mistral-7B-v0.1"

# Load the causal LM with 8-bit weights (see the `bitsandbytes` install above).
# NOTE(review): the recorded output of this cell is a ValueError whose message
# is not visible here -- confirm this argument combination still loads with the
# installed transformers/bitsandbytes versions.
model = AutoModelForCausalLM.from_pretrained(
          model_name,
          torch_dtype=torch.float16,
          load_in_8bit=True,
          device_map="auto"
        )
# Tokenizer is taken from the same checkpoint as the model.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

ValueError: ignored

In [None]:
def response_gen(input_ids, *, tokens=100, temperature=0.7):
    """Sample a continuation for an already-tokenised prompt.

    Args:
        input_ids: token ids of the prompt; only the first sequence
            (``input_ids[0]``) is decoded.
        tokens: maximum number of new tokens to generate.
        temperature: sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    sampling_args = dict(
        input_ids=input_ids,
        temperature=temperature,
        top_p=1.0,
        do_sample=True,
        return_dict_in_generate=True,
        max_new_tokens=tokens,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model.generate(**sampling_args)

    # The returned sequence starts with the prompt; slice it off.
    prompt_length = len(input_ids[0])
    new_token_ids = result.sequences[0][prompt_length:]
    # NOTE: special tokens are not stripped, so the text can end in "</s>"
    # (visible in the recorded outputs of this notebook).
    return tokenizer.decode(new_token_ids)

def get_prompt(query):
  """Wrap `query` in the system-instruction / USER / ASSISTANT chat template."""
  system = "The assistant provides useful, accurate, concise answers."
  return f"{system} USER: {query} ASSISTANT:"

def run(prompt, *, tokens=100, temperature=0.7):
  """Tokenise `prompt`, move it to the GPU and return the sampled completion.

  `tokens` and `temperature` are forwarded unchanged to `response_gen`.
  """
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
  input_ids = input_ids.to("cuda")
  return response_gen(input_ids, tokens=tokens, temperature=temperature)

def run_many(get_prompt, queries, **kwargs):
  """Generate one completion per query.

  `get_prompt` is a callable that turns a query into a prompt string; any
  extra keyword arguments are passed straight through to `run`.
  Returns the completions as a list, in the order of `queries`.
  """
  outputs = []
  for query in queries:
    outputs.append(run(get_prompt(query), **kwargs))
  return outputs

## Download Dataset and Define Utils

In [None]:
from haystack import Pipeline
from haystack.nodes import AnswerParser, PromptNode, PromptTemplate

In [None]:
%%bash
wget https://raw.githubusercontent.com/shaunnope/paperview/master/pred.csv -q
wget -O data.csv https://raw.githubusercontent.com/shaunnope/paperview/master/straitstimes_20220104_20231107.csv -q
wget https://raw.githubusercontent.com/shaunnope/paperview/master/relevance.csv -q
wget https://raw.githubusercontent.com/shaunnope/paperview/master/statements.csv -q
wget https://raw.githubusercontent.com/shaunnope/paperview/master/relevance_binary.csv -q


In [None]:
query_df = pd.read_csv("pred.csv")
statements = pd.read_csv("statements.csv")

display(query_df.head())
display(statements.head())

In [None]:
# Pull the columns used throughout the notebook out as plain Python lists.
questions = query_df['question'].tolist()
ground_truth = query_df['answer'].tolist()
predictions = query_df['prediction'].tolist()

verified = statements['verified'].tolist()
# Converted to a list like `verified`, so positional indexing (refuted[N])
# does not depend on the DataFrame's index labels.
refuted = statements['refuted'].tolist()

# Part 1: Raw LLM Prompting

In [None]:
if RUN_RAW_PREDICT:
  # Regenerate the raw (no-retrieval) LLM answer for every question and
  # overwrite the cached predictions.
  predictions = []
  for i, question in enumerate(questions):
    print("Q:", question)
    # `run` tokenises, moves to the GPU and samples with the same defaults
    # (100 new tokens, temperature 0.7) as the inlined code it replaces.
    prediction = run(get_prompt(question))
    print(f"P{i}:", prediction)
    predictions.append(prediction)

  query_df['prediction'] = predictions
  # index=False: otherwise every save/reload cycle adds another
  # "Unnamed: 0" column to pred.csv.
  query_df.to_csv('pred.csv', index=False)

In [None]:
# NOTE(review): this re-assignment overrides the RUN_RAW_PREDICT_2 flag set in
# the configuration cell near the top of the notebook.
RUN_RAW_PREDICT_2 = False
if RUN_RAW_PREDICT_2:
  # Probe a single example: the verified statement, the refuted statement and
  # the question at position N, all with the plain chat prompt.
  N = 4
  for tag, statement in (("v", verified[N]), ("r", refuted[N])):
    print(f"Q{tag}:", statement)
    # Statements get a larger budget (200 new tokens) than questions.
    print(f"P{tag}:", run(get_prompt(statement), tokens=200))

  question = questions[N]
  print("Q:", question)
  print(f"P{N}:", run(get_prompt(question)))


In [None]:
# NOTE(review): this re-assignment overrides the RUN_RAW_PREDICT_2 flag set in
# the configuration cell near the top of the notebook.
RUN_RAW_PREDICT_2 = False
if RUN_RAW_PREDICT_2:
  def get_prompt_2(query):
    """Prompt that answers questions directly and reasons step by step about statements."""
    return f"If the input is a question, directly provide a concise and informative answer.\n\nOtherwise, explain whether the statement is true or false, reasoning step-by-step. INPUT:{query}\nANSWER: "

  # For every question index, query the model with the verified statement, the
  # refuted statement and the question itself.
  for i, question in enumerate(questions):
    for tag, statement in (("v", verified[i]), ("r", refuted[i])):
      print(f"Q{tag}:", statement)
      # Statements get a larger budget (200 new tokens) than questions.
      print(f"P{tag}:", run(get_prompt_2(statement), tokens=200))

    print("Q:", question)
    print(f"P{i}:", run(get_prompt_2(question)))


## Raw LLM output evaluation

In [None]:
import evaluate
import numpy as np
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=ground_truth)
print(f"BLEU Score: {bleu_score}")

In [None]:
bertscore = evaluate.load("bertscore")
bert_score = bertscore.compute(predictions=predictions, references=ground_truth, lang="en")
print(f"BERTScore: {np.mean(bert_score['precision'])}")

In [None]:
meteor = evaluate.load('meteor')
meteor_score = meteor.compute(predictions=predictions, references=ground_truth)
print(f"METEORScore: {meteor_score}")

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


METEORScore: {'meteor': 0.31779667015963875}


# Part 2: Traditional IR System
We set up an IR system to retrieve documents that are used to evaluate and improve upon the LLM responses.

## Setting Up a Document Index

For this project, we will use the ElasticsearchDocumentStore provided by Haystack to interact with the Elasticsearch server and create a document index. We adapt the initialization code from the Haystack tutorial [Build a Scalable Question Answering System](https://haystack.deepset.ai/tutorials/03_scalable_qa_system).


In [None]:
%%bash
# download and extract es installation image
# (the %%bash cell magic has to be the first line of the cell, so this
# comment lives below it)
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg
# start the server
# (the %%bash cell magic has to be the first line of the cell, so this
# comment lives below it)
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# wait for server to start up
import time

time.sleep(30)

## Load Data, Initialize Pipeline

In [None]:
from haystack import Document

df = pd.read_csv("data.csv")
print(df.shape)

docs = [
    Document(
        content=row['article'],
        meta=dict(
            id=idx,
            url=row['url'],
            datetime=row['datetime'],
            headline=row['headline']
        )
    )
    for idx, row in df.iterrows()
]


In [None]:
# Initialize ElasticsearchDocumentStore
from haystack.pipelines import DocumentSearchPipeline, Pipeline
from haystack.nodes import TextConverter, BM25Retriever
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

text_converter = TextConverter()
document_store = ElasticsearchDocumentStore(host=host)

# add own docs to DocumentStore
document_store.write_documents(docs)

index_pipeline = Pipeline()
index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
index_pipeline.add_node(document_store, name="DocumentStore", inputs=["TextConverter"])

## Initialize the Retriever, Reader, and Query pipeline

*   Retriever: BM25 over the Elasticsearch document index
*   Reader: FARMReader with `roberta-base-squad2`



We use a BM25 retriever and Haystack's FARMReader (with `roberta-base-squad2`) to retrieve documents and extract answers.

`roberta-base-squad2` is a RoBERTa model developed by deepset and fine-tuned on the SQuAD2.0 dataset.

In [None]:
from haystack.nodes import BM25Retriever, FARMReader, TransformersSummarizer, DocumentMerger, EmbeddingRetriever, TfidfRetriever


retriever = BM25Retriever(document_store=document_store)
# retriever = EmbeddingRetriever(
#     document_store=document_store,
#    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
#    model_format="sentence_transformers"
# )
# document_store.update_embeddings(retriever)
# retriever = TfidfRetriever(document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)


In [None]:
from haystack import Pipeline

query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
query_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

## Querying the Pipeline

In [None]:
retrieved = query_pipeline.run_batch(
      queries=query_df['question'].tolist(), params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )

In [None]:
retrieved['answers'][0]

## Evaluation

### Evaluating the Retriever

In [None]:
rel_df = pd.read_csv("relevance_binary.csv")
print(rel_df['article_id'].nunique())

multi_idx_rel_df = rel_df.set_index(['article_id', 'question_id'])
rel_dict = multi_idx_rel_df.to_dict(orient='index')

401


In [None]:
# Article ids of the retrieved documents, one list per query.
ret_ids = []
for docs_for_query in retrieved['documents']:
  ret_ids.append([doc.meta['id'] for doc in docs_for_query])

# Relevance of each retrieved article for its query; (article, question) pairs
# that are absent from the relevance table count as 0.
rel_rows = []
for q_idx, doc_ids in enumerate(ret_ids):
  rel_rows.append([
      rel_dict[(doc_id, q_idx)]['relevance'] if (doc_id, q_idx) in rel_dict else 0
      for doc_id in doc_ids
  ])
ret_rel = torch.Tensor(rel_rows)

In [None]:
def mrr(relevances:torch.Tensor):
  """Mean reciprocal rank over a batch of ranked relevance lists.

  Args:
    relevances: 2-D tensor, one row per query, columns in rank order;
      any non-zero entry marks a relevant document.

  Returns:
    0-d tensor: the mean over rows of 1 / (rank of the first relevant
    document), where a row without any relevant document contributes 0.
  """
  reciprocal_ranks = torch.zeros(relevances.shape[0])
  for idx, row in enumerate(relevances):
    # Positions of the relevant documents, in rank order. Using the first
    # one (instead of argmax) keeps the metric correct for graded relevance,
    # where the highest grade is not necessarily the first hit.
    hits = torch.nonzero(row)
    if hits.numel() > 0:
      reciprocal_ranks[idx] = 1.0 / (hits[0].item() + 1)
  return torch.mean(reciprocal_ranks)

In [None]:
def map(relevances: torch.Tensor):
  """Mean average precision over a batch of ranked relevance lists.

  Args:
    relevances: 2-D tensor, one row per query, columns in rank order;
      any non-zero entry marks a relevant document.

  Returns:
    0-d tensor: the mean over rows of the average precision, where the
    average is taken over the relevant documents present in the row.
    A row without relevant documents contributes 0.

  NOTE: the name shadows the builtin `map`; it is kept because callers in
  this notebook use it.
  """
  avg_precisions = torch.zeros(relevances.shape[0])
  for idx, row in enumerate(relevances):
    hits = 0
    precision_sum = 0.0
    for rank, val in enumerate(row, start=1):
      if val != 0:
        hits += 1
        precision_sum += hits / rank  # precision at this cut-off
    if hits:
      # Normalise by the number of relevant documents, not by the sum of
      # their grades, so graded relevance does not deflate the score.
      avg_precisions[idx] = precision_sum / hits
  return torch.mean(avg_precisions)

In [None]:
# #tf-idf retriever
# print(f'map {map(ret_rel)}')
# print(f'mrr: {mrr(ret_rel)}')
# map 0.4930151700973511
# mrr: 0.5173782110214233

map 0.4930151700973511
mrr: 0.5173782110214233


In [None]:
# # embedding retriever
# print(f'map {map(ret_rel)}')
# print(f'mrr: {mrr(ret_rel)}')

# map 0.6158411502838135
# mrr: 0.6758620738983154

map 0.6158411502838135
mrr: 0.6758620738983154


In [None]:
#bm25 retriever
print(f'map {map(ret_rel)}')
print(f'mrr: {mrr(ret_rel)}')

map 0.71826171875
mrr: 0.791015625


In [None]:
def dcg(relevances: torch.Tensor, *, N=None):
  """Discounted cumulative gain of each row of `relevances`.

  The discount is 1 for ranks 1 and 2 and 1/log2(rank) for later ranks.

  Args:
    relevances: 2-D tensor, one ranked relevance list per row.
    N: number of ranks; falls back to the number of columns when not given.

  Returns:
    Tensor with one DCG value per row (trailing dimension of size 1).
  """
  if not N:
    N = relevances.shape[1]
  positions = torch.arange(1, N + 1)
  # Rank 1 is treated like rank 2 so that log2 is 1 there instead of 0.
  positions[0] = 2
  discounts = (1 / torch.log2(positions)).reshape((1, N))
  return torch.inner(relevances, discounts)

def ndcg(pred_rel: torch.Tensor, relevance: pd.DataFrame, *, N=None):
  """Normalised DCG of the retrieved rankings.

  Args:
    pred_rel: 2-D tensor of relevances of the retrieved documents, one row
      per query in rank order; only the first N columns are used.
    relevance: DataFrame with `question_id` and `relevance` columns, used to
      build the ideal ranking per question.
    N: rank cut-off; falls back to the number of columns of `pred_rel`.

  Returns:
    dcg(pred_rel[:, :N]) divided element-wise by the DCG of the ideal ranking.
  """
  N = N or pred_rel.shape[1]
  # Top-N relevance values per question form that question's ideal ranking.
  topk = relevance.groupby(['question_id'])['relevance'].nlargest(N)
  # NOTE(review): assumes the question_id levels line up one-to-one, in order,
  # with the rows of pred_rel -- TODO confirm.
  rows = [topk.xs(i).to_list() for i in topk.index.levels[0]]
  # Pad with zeros so every ideal row has exactly N entries.
  for row in rows:
    if len(row) < N:
      row.extend([0.0]*(N-len(row)))
  ideal = torch.Tensor(rows)
  idcg = dcg(ideal, N=N)
  # NOTE(review): a question whose ideal DCG is 0 is divided by zero here.
  return torch.div(dcg(pred_rel[:, :N], N=N), idcg)

def evaluate_retrieval(relevances: torch.Tensor):
  """Placeholder: walks the rows of `relevances` and returns None.

  NOTE(review): no metric is computed here yet.
  """
  for _ in relevances:
    continue
  return None

In [None]:
ndcg(ret_rel, rel_df).T

### Evaluating the Reader

In [None]:
%%bash
wget https://raw.githubusercontent.com/shaunnope/paperview/master/wiki_claims.jsonl -q
wget https://raw.githubusercontent.com/shaunnope/paperview/master/wiki_pages.jsonl -q

In [None]:
claims_df = pd.read_json("wiki_claims.jsonl", lines=True)
display(claims_df.head())

wiki_df = pd.read_json("wiki_pages.jsonl", lines=True)
display(wiki_df.head())
wiki_df.shape

Unnamed: 0,index,id,verifiable,label,claim,sources,answers
0,2009,95262,VERIFIABLE,REFUTES,Eddie Guerrero did not experience substance ab...,[Eddie_Guerrero],"[substance abuse, alcholism, addiction to pain..."
1,3312,144899,VERIFIABLE,REFUTES,AMGTV does not have entertainment television p...,[AMGTV],"[entertainment, television programming, movie ..."
2,3494,150044,VERIFIABLE,REFUTES,AMGTV is an adult-oriented television network.,[AMGTV],"[family-oriented, children's shows]"
3,4947,26132,VERIFIABLE,SUPPORTS,Aarhus is located on the east coast of the Jut...,[East_Jutland_metropolitan_area],[East Jutland]
4,7212,65047,VERIFIABLE,SUPPORTS,AMGTV is an American family-oriented televisio...,[AMGTV],"[family-oriented, children's shows]"


Unnamed: 0,index,id,text
0,8216,2003_NCAA_Division_I_Men's_Basketball_Tournament,The 2003 NCAA Division I Men 's Basketball Tou...
1,14801,2017_Premier_League_Asia_Trophy,The 2017 Premier League Asia Trophy is the eig...
2,17051,Academy_Award_for_Best_Makeup_and_Hairstyling,The Academy Award for Best Makeup and Hairstyl...
3,19413,A_Game_of_Thrones,A Game of Thrones is the first novel in A Song...
4,26056,ABC_islands_-LRB-Lesser_Antilles-RRB-,The ABC islands are the three western-most isl...


(25, 3)

In [None]:
# Page id -> Document for every wiki page.
wiki_docs = {}
for page_id, page_text in zip(wiki_df['id'], wiki_df['text']):
  wiki_docs[page_id] = Document(content=page_text, meta=dict(id=page_id))

# Run the reader on each claim, restricted to the pages listed as its sources.
answers = []
for claim_text, source_ids in zip(claims_df['claim'], claims_df['sources']):
  source_pages = wiki_df[wiki_df['id'].isin(source_ids)]
  rel_docs = [
      Document(content=page_text, meta=dict(id=page_id))
      for page_id, page_text in zip(source_pages['id'], source_pages['text'])
  ]
  answers.append(reader.predict(query=claim_text, documents=rel_docs, top_k=5))

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.15 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.15 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 30.17 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 27.68 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 29.47 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 25.00 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 23.19 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 18.83 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.61 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.36 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.50 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 43.74 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 43.31 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00

In [None]:
import itertools


def _matches_any(gold_answers, extracted_spans):
  """Return True if any extracted span overlaps any gold answer.

  Overlap means one string is a case-insensitive substring of the other.
  Empty strings are ignored: "" is a substring of every string and would
  otherwise count as a match for every claim.
  """
  for gold, span in itertools.product(gold_answers, extracted_spans):
    gold, span = gold.lower(), span.lower()
    if gold and span and (span in gold or gold in span):
      return True
  return False


# (claim text, list of answer strings the reader extracted for it)
extracted_answers = [(claim_answers['query'], [ans.answer for ans in claim_answers['answers']]) for claim_answers in answers]
extracted = []
for query, extracts in extracted_answers:
  # Look up the gold answers of the claim this prediction belongs to.
  claim = claims_df[claims_df['claim'] == query].iloc[0]
  extracted.append(_matches_any(claim['answers'], extracts))

claims_df['extracted'] = [ans for _, ans in extracted_answers]
claims_df['match'] = extracted
claims_df[['claim','answers', 'extracted','match','label']]

In [None]:
def evaluate(df):
  """Summarise reader matches overall and per label.

  Args:
    df: DataFrame with a boolean-like `match` column and a `label` column.

  Returns:
    dict with `n`, `matches`, `accuracy` for the whole frame plus `cat`,
    a dict mapping each distinct label to the same three statistics.

  NOTE(review): defining this function rebinds the name `evaluate`, which
  earlier in the notebook refers to the imported `evaluate` package.
  """
  assert 'label' in df.columns and 'match' in df.columns, "Run evaluate() on a DataFrame with `match` and `label` columns"

  def _summary(frame):
    # Count, number of matches and match rate for one slice of the data.
    hits = frame['match'].sum()
    return {"n": len(frame), "matches": hits, "accuracy": hits / len(frame)}

  overall = _summary(df)
  overall["cat"] = {
      label: _summary(df[df['label'] == label])
      for label in df['label'].unique()
  }
  return overall

reader_perf = evaluate(claims_df)
reader_perf

{'n': 20,
 'matches': 10,
 'accuracy': 0.5,
 'cat': {'REFUTES': {'n': 9, 'matches': 4, 'accuracy': 0.4444444444444444},
  'SUPPORTS': {'n': 11, 'matches': 6, 'accuracy': 0.5454545454545454}}}

## Raw LLM Verification using retrieved documents

In [None]:
prompt_queries=['Who is the president of singapore?','What is the GST rate in Singapore?']

In [None]:
prompt_predictions = []
for i, question in enumerate(prompt_queries):
  print("Q:", question)
  prompt = get_prompt(question)
  encoded_input = tokenizer(prompt, return_tensors="pt")
  prediction = response_gen(encoded_input.input_ids.to("cuda"))
  print(f"P{i}:", prediction)
  prompt_predictions.append(prediction)
prompt_predictions

Q: Who is the president of singapore?
P0: As of my knowledge cutoff in September 2021, the President of Singapore was Halimah Yacob. However, please note that my training data only goes up to that date, and there is a possibility that there may have been a change in leadership since then.</s>
Q: What is the GST rate in Singapore?
P1: As of my knowledge cutoff in September 2021, the Goods and Services Tax (GST) rate in Singapore is 7%. However, please note that tax laws and rates may change over time, so it's always a good idea to double-check the most up-to-date information from official sources, such as the Singapore Department of Taxation.</s>


['As of my knowledge cutoff in September 2021, the President of Singapore was Halimah Yacob. However, please note that my training data only goes up to that date, and there is a possibility that there may have been a change in leadership since then.</s>',
 "As of my knowledge cutoff in September 2021, the Goods and Services Tax (GST) rate in Singapore is 7%. However, please note that tax laws and rates may change over time, so it's always a good idea to double-check the most up-to-date information from official sources, such as the Singapore Department of Taxation.</s>"]

In [None]:
retrieved_prompt = query_pipeline.run_batch(
      queries=prompt_queries, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )

2it [00:00, 3938.31it/s]
Inferencing Samples: 100%|██████████| 3/3 [00:01<00:00,  1.68 Batches/s]


In [None]:
answers_prompt=[[(a.context) for a in ans] for ans in retrieved_prompt["answers"]]
prompt_context = ['\n'.join([f"{context} " for context in ans]) for ans in answers_prompt]
prompt_context

['"SINGAPORE – Former president Halimah Yacob has been conferred the nation’s highest civilian honour, the Order of Temasek (With High Distinction).\nShe \nrom Monday (Aug 1) to Tuesday.\nThe delegation will call on President Halimah Yacob and Prime Minister Lee Hsien Loong, and meet a number of Cabinet mi \nday, Mr Marcos will be meeting separately with Singapore\'s President Halimah Yacob and Prime Minister Lee Hsien Loong to discuss key regional and glob \n"SINGAPORE - President Halimah Yacob will represent Singapore at the official mourning of United Arab Emirates (UAE) president Khalifa bin Zayed Al Na \n ordinary members can vote in the elections. This year was to have been Mr Yeoh\'s last one-year term as president.\nThe need for the election arose aft ',
 "INGAPORE - The goods and services tax (GST) rate will increase from 7 to 9 per cent in two stages - one percentage point each time on Jan 1, 2023 and  \nat the same time, said Mr Wong. READ MORE:\xa0Singapore to raise GST fro

In [None]:
def get_prompt_context(question, context,prediction):
  """Build a few-shot Yes/No verification prompt.

  Two worked examples (one answered "Yes", one answered "No") are built from
  the module-level `prompt_queries[0]` and `prompt_context[0]`; the final
  entry is filled with the given `question`, `context` and `prediction` and
  leaves the verdict for the model to complete.

  NOTE: the leading whitespace of every line is part of the prompt text.
  """
  prompt = f"""
    Question: {prompt_queries[0]}
    Context: {prompt_context[0]}
    Answer: The President of Singapore was Halimah Yacob
    Given the above question and context, is the answer correct? Please return Yes or No.
    Yes

    Question:  {prompt_queries[0]}
    Context: {prompt_context[0]}
    Answer: 'Nicole Seah'
    For the above question, is the answer correct? Please return Yes or No.
    No

    Question: {question}
    Context: {context}
    Answer: {prediction}
    For the above question, is the answer correct? Please return Yes or No.
    """
  return prompt

In [None]:
# Ask the LLM to judge each prediction against the reader's extracted answers
def llm_verify(question, predictions, retrieved):
  """Have the LLM answer Yes/No on whether each prediction is correct.

  Args:
    question: sequence of questions, aligned with `predictions`.
    predictions: sequence of answers to verify.
    retrieved: query-pipeline output; `retrieved["answers"][i]` holds the
      reader answers for question i, whose `.answer` strings form the context.

  Returns:
    List with the raw LLM verdict text for each prediction.
  """
  verify_pred = []
  for i, answer in enumerate(predictions):
    # Use the questions passed in, not the notebook-level `questions` global.
    query = question[i]
    print("Question:", query)

    context = '\n'.join(f"{a.answer} " for a in retrieved["answers"][i])
    print("Context:", context)
    print("Answer:", answer)

    prompt = get_prompt_context(query, context, answer)
    prompt = f"USER: {prompt}\nASSISTANT:"
    input_ids = tokenizer(prompt, return_tensors='pt')["input_ids"].to("cuda")

    output = response_gen(input_ids)
    print("Prediction:", output)
    print('\n')
    verify_pred.append(output)
  return verify_pred

In [None]:
verify_preds = llm_verify(questions,predictions,retrieved)

Question: Who is the Prime Minister of Singapore?
Context: Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Answer: As of my knowledge cut-off in September 2021, the Prime Minister of Singapore was Lee Hsien Loong. However, please note that my training data only goes up to that date, and there is a possibility that there may have been a change in leadership since then.</s>
Prediction: Yes</s>


Question: What is the top university in Singapore?
Context: National University of Singapore 
Singapore University of Social Sciences 
National University of Singapore 
Nanyang Technological University 
National University of Singapore 
Answer: There is no definitive answer to this question as it can depend on various factors such as ranking systems, field of study, and personal preferences. However, some of the top universities in Singapore according to various rankings include the National University of Singapore (NUS), the Singapore Management University (S

In [None]:
# Print every verdict and count those that contain "Yes".
yescount = 0
for pred in verify_preds:
  yescount += 'Yes' in pred
  print(pred)
print(yescount)
# Share of questions whose prediction was judged correct.
print(yescount / len(questions))

Yes</s>
Yes</s>
Yes</s>
Yes</s>
No</s>
Yes</s>
Yes</s>
Yes.</s>
Yes.</s>
Yes.</s>
Yes, the answer is correct.</s>
No.</s>
No.</s>
Yes</s>
Yes</s>
Yes</s>
Yes</s>
Yes.</s>
Yes</s>
Yes</s>
No.</s>
No.</s>
No.</s>
Yes</s>
No, the answer is not correct. As of my knowledge cutoff date of September 2021, fully vaccinated travelers to Singapore were still required to comply with prevailing entry restrictions and quarantine measures, even if they were traveling through a Vaccinated Travel Lane (VTL) program. The entry requirements may change, so it is always advisable to check the latest information from official sources before traveling.</s>
Yes.</s>
Yes.</s>
Yes</s>
Yes</s>
22
0.7586206896551724


# Part 3: RAG
Using the documents we retrieved, we can augment the prompt to the LLM to include these documents, which should reduce hallucinations in the generation.

We will first test with a prompt adapted from Deepset's [Question Answering prompt](https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering).

In [None]:
THRESHOLD = 0.2

## 3.1 Including extracted answers in prompt

In [None]:
extracts = [[(a.answer, a.score) for a in ans] for ans in retrieved["answers"]]
def get_extract_prompt(answers, query):
  """Build a prompt from extracted answers and their scores.

  `answers` is an iterable of (answer text, score) pairs; each becomes one
  context line of the form "text [score rounded to 2 places]".
  """
  scored = []
  for text, confidence in answers:
    scored.append(f"{text} [{round(confidence, 2)}]")
  context = '\n'.join(scored)
  return (
      "Given the context and scores, answer the question in no more than 50 words. "
      f"Context: {context};\n"
      f"Question: {query};\n"
      "Answer:\n"
  )

In [None]:
rag_preds = []
for i, question in enumerate(questions):
  prediction = run(get_extract_prompt(extracts[i], question))
  print(f"P{i}: {prediction}")
  rag_preds.append(prediction)

P0: Lee Hsien Loong is the Prime Minister of Singapore.</s>
P1: Based on the given context and scores, it is evident that National University of Singapore is the top university in Singapore with a score of 0.93.</s>
P2: The climate in Singapore during the time of year in question is tropical, humid, and often noisy due to the high population density. Poor weather, such as rain or thunderstorms, is also common during this time of year. However, the exact climate can vary depending on the specific location and time of day.</s>
P3: Scamming in Singapore has a severe impact on victims' mental health, leading to high anxiety, distrust, and even thoughts of ending their lives. The trauma experienced is comparable to other major life events, such as the death of a loved one or a serious accident.</s>
P4: Singapore were dismally outplayed by Japan in the recent badminton championship, with the scores of 3-2, 0.62.</s>
P5: The new scheme for Covid-19 swabs in Singapore is the "Covid-19 Testing 

In [None]:
extract_df = pd.DataFrame(rag_preds, columns=["prediction"])
extract_df.to_csv("extract_pred.csv")

In [None]:
bleu_score_extract = bleu.compute(predictions=rag_preds, references=ground_truth)
print(f"BLEU Score: {bleu_score_extract}")

BLEU Score: {'bleu': 0.04219281447713865, 'precisions': [0.1978021978021978, 0.07147862648913805, 0.027896995708154508, 0.008035062089116142], 'brevity_penalty': 1.0, 'length_ratio': 2.24, 'translation_length': 1456, 'reference_length': 650}


In [None]:
bert_score_extract = bertscore.compute(predictions=rag_preds, references=ground_truth, lang="en")
print(f"BERTScore: {np.mean(bert_score_extract['precision'])}")

BERTScore: 0.8696816481392959


In [None]:
meteor_score = meteor.compute(predictions=rag_preds, references=ground_truth)
print(f"METEORScore: {meteor_score}")

METEORScore: {'meteor': 0.3314129655223445}


In [None]:
verify_extract_preds=llm_verify(questions,rag_preds,retrieved)
yescount=0
for pred in verify_extract_preds:
  if('Yes' in pred):
    yescount+=1
  print(pred)
print(yescount)

Question: Who is the Prime Minister of Singapore?
Context: Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Answer: Lee Hsien Loong is the Prime Minister of Singapore.</s>
Prediction: Yes.</s>


Question: What is the top university in Singapore?
Context: National University of Singapore 
Singapore University of Social Sciences 
National University of Singapore 
Nanyang Technological University 
National University of Singapore 
Answer: Based on the given context and scores, it is evident that National University of Singapore is the top university in Singapore with a score of 0.93.</s>
Prediction: Yes</s>


Question: What's the climate like in Singapore this time of year?
Context: tropical 
humidity 
It was indeed disappointing 
noisy 
poor weather 
Answer: The climate in Singapore during the time of year in question is tropical, humid, and often noisy due to the high population density. Poor weather, such as rain or thunderstorms, is also common duri

In [None]:
print(yescount/len(questions))

0.896551724137931


## 3.2 Including extended context

In [None]:
import re
contexts = [[(a.context, a.score) for a in ans] for ans in retrieved["answers"]]
def get_context_prompt(answers, query):
  """Build a prompt from the reader's context windows and their scores.

  `answers` is an iterable of (context text, score) pairs; each becomes one
  context line of the form "text [score rounded to 2 places]".

  The context text is inserted verbatim. It used to go through `re.escape`,
  which is meant for building regex patterns and put a backslash in front of
  every space and punctuation character in the prompt.
  """
  context = '\n'.join([f"{ans} [{round(score, 2)}]" for ans, score in answers])
  return f"""Given the context and scores, answer the question in 100 words or less. Context: {context};
Question: {query};
Answer:
"""

In [None]:
rag_context_preds = []
for i, question in enumerate(questions):
  prediction = run(get_context_prompt(contexts[i], question))
  print(f"P{i}: {prediction}")
  rag_context_preds.append(prediction)

P0: Prime Minister Lee Hsien Loong of Singapore will be among world leaders attending the state funeral of slain former Japanese premier Shinzo Abe.</s>
P1: The top universities in Singapore are the National University of Singapore (NUS) and Nanyang Technological University (NTU).</s>
P2: The climate in Singapore this time of year is tropical, with high humidity and warm temperatures. It is a hot and humid climate with average temperatures ranging from 24 to 31 degrees Celsius (75 to 88 degrees Fahrenheit). The island nation is known for its tropical rainforest climate, which is characterized by high levels of rainfall throughout the year.</s>
P3: Scamming in Singapore has severe mental health consequences for victims, resulting in depression, anxiety, and trust issues. The risk of suicide is also present in extreme cases. Interviewed victims in 2021 showed signs of mental scars, with some being scarred into extreme distrust. Counsellors highlight the damaging impact on victims' mental

In [None]:
context_df = pd.DataFrame(rag_context_preds, columns=["prediction"])
context_df.to_csv("context_pred.csv")

In [None]:
# df=pd.read_csv('context_pred.csv')
# rag_context_preds=df['prediction']

In [None]:
bleu_score_context = bleu.compute(predictions=rag_context_preds, references=ground_truth)
print(f"BLEU Score: {bleu_score_context}")

BLEU Score: {'bleu': 0.053586550078384455, 'precisions': [0.18905742145178764, 0.07374793615850303, 0.03355704697986577, 0.01762364980102331], 'brevity_penalty': 1.0, 'length_ratio': 2.84, 'translation_length': 1846, 'reference_length': 650}


In [None]:
bert_score_context = bertscore.compute(predictions=rag_context_preds, references=ground_truth, lang="en")
print(f"BERTScore: {np.mean(bert_score_context['precision'])}")

BERTScore: 0.8656172115227272


In [None]:
meteor_score = meteor.compute(predictions=rag_context_preds, references=ground_truth)
print(f"METEORScore: {meteor_score}")

METEORScore: {'meteor': 0.3585276841136787}


In [None]:
verify_context_preds=llm_verify(questions,rag_context_preds,retrieved)
yescount=0
for pred in verify_context_preds:
  if('Yes' in pred):
    yescount+=1
  print(pred)
print(yescount/len(questions))

Question: Who is the Prime Minister of Singapore?
Context: Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Answer: Prime Minister Lee Hsien Loong of Singapore will be among world leaders attending the state funeral of slain former Japanese premier Shinzo Abe.</s>
Prediction: Yes</s>


Question: What is the top university in Singapore?
Context: National University of Singapore 
Singapore University of Social Sciences 
National University of Singapore 
Nanyang Technological University 
National University of Singapore 
Answer: The top universities in Singapore are the National University of Singapore (NUS) and Nanyang Technological University (NTU).</s>
Prediction: No.</s>


Question: What's the climate like in Singapore this time of year?
Context: tropical 
humidity 
It was indeed disappointing 
noisy 
poor weather 
Answer: The climate in Singapore this time of year is tropical, with high humidity and warm temperatures. It is a hot and humid climate 

In [None]:
print(yescount)

25


## 3.3 Including document references in prompt

In [None]:
contexts = [[(a.meta["id"], a.context, a.score) for a in ans if a.score > THRESHOLD] for ans in retrieved["answers"]]
def get_ref_prompt(answers, query):
    """Build the prompt that asks for a short answer citing documents as Doc[num].

    Args:
        answers: iterable of ``(doc_id, text, score)`` triples; each becomes one
            ``Doc[doc_id] text [score]`` line with the score rounded to 2 places.
        query: the question text placed after the document lines.

    Returns:
        The full prompt string: instructions, document lines, then the question.
    """
    doc_lines = []
    for idx, ans, score in answers:
        # NOTE(review): re.escape backslash-escapes regex metacharacters
        # (spaces, dots, brackets, ...) in the document text before it goes
        # into the prompt -- confirm this is intended.
        doc_lines.append(f"Doc[{idx}] {re.escape(ans)} [{round(score, 2)}]")
    context = '\n'.join(doc_lines)
    return f"""Create a concise and informative answer (no more than 50 words) for a given question
based solely on the given documents. You must only use information from the given documents.
Use an unbiased and journalistic tone. Do not repeat text. Cite the documents using Doc[num] notation.
If multiple documents contain the answer, cite those documents like ‘as stated in Doc[num], Doc[num], etc.’.
If the documents do not contain the answer to the question, say that ‘answering is not possible given the available information.’
{context}
Question: {query}; Answer:
"""

In [None]:
# Generate one answer per question from the Doc[num] citation prompt, using
# the thresholded contexts at the same position as the question.
rag_ref_preds = []
for i, question in enumerate(questions):
    prediction = run(get_ref_prompt(contexts[i], question))
    rag_ref_preds.append(prediction)
    print(f"P{i}: {prediction}")

P0: Lee Hsien Loong.</s>
P1: Singapore has three top universities: National University of Singapore (NUS), Nanyang Technological University (NTU), and Singapore Management University (SMU).</s>
P2: The climate in Singapore this time of year is tropical, as stated in Doc[58].</s>
P3: The effects of scamming on victims' mental health in Singapore can be severe, as stated in Doc[397], Doc[457], and Doc[368]. Victims may develop high anxiety, depression, and even resort to suicide in extreme cases, as mentioned in Doc[397] and Doc[457]. In 2021, there were victims who were mentally scarred into extreme distrust, as
P4: Singapore lost to Japan in the recent badminton championship.</s>
P5: The Covid-19 Recovery Grant has been extended until the end of this year. This grant provides support to individuals who have lost income due to the pandemic and are looking for ways to recover. The grant provides financial assistance for eligible applicants.

Question: What schemes are available for migra

In [None]:
# Persist the cited predictions as a one-column table in ref_pred.csv.
ref_df = pd.DataFrame({"prediction": rag_ref_preds})
ref_df.to_csv("ref_pred.csv")

In [None]:
# df=pd.read_csv('ref_pred.csv')
# rag_ref_preds=df['prediction']

In [None]:
# BLEU of the citation-prompt predictions against the references.
bleu_score_ref = bleu.compute(
    predictions=rag_ref_preds,
    references=ground_truth,
)
print(f"BLEU Score: {bleu_score_ref}")

BLEU Score: {'bleu': 0.04424668909508378, 'precisions': [0.16891891891891891, 0.05941213258286429, 0.0267515923566879, 0.01427644386761843], 'brevity_penalty': 1.0, 'length_ratio': 2.5046153846153847, 'translation_length': 1628, 'reference_length': 650}


In [None]:
# BERTScore of the citation-prompt predictions against the references.
bert_score_ref = bertscore.compute(
    predictions=rag_ref_preds,
    references=ground_truth,
    lang="en",
)
# Only the mean of the per-example 'precision' entries is reported.
print(f"BERTScore: {np.mean(bert_score_ref['precision'])}")

BERTScore: 0.8560990012925247


In [None]:
# METEOR of the citation-prompt predictions; this rebinds `meteor_score`.
meteor_score = meteor.compute(
    predictions=rag_ref_preds,
    references=ground_truth,
)
print(f"METEORScore: {meteor_score}")

METEORScore: {'meteor': 0.29109283972561556}


In [None]:
# Run `llm_verify` over the citation-prompt predictions, echo every returned
# string, and report the fraction of them that contain 'Yes'.
verify_ref_preds = llm_verify(questions, rag_ref_preds, retrieved)
yescount = 0
for pred in verify_ref_preds:
    # NOTE(review): this is a substring test over the whole returned string; if
    # that string also carries the question/context/answer, a 'Yes' there is
    # counted too -- confirm against llm_verify.
    yescount += int('Yes' in pred)
    print(pred)
print(yescount / len(questions))

Question: Who is the Prime Minister of Singapore?
Context: Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Lee Hsien Loong 
Answer: Lee Hsien Loong.</s>
Prediction: Yes</s>


Question: What is the top university in Singapore?
Context: National University of Singapore 
Singapore University of Social Sciences 
National University of Singapore 
Nanyang Technological University 
National University of Singapore 
Answer: Singapore has three top universities: National University of Singapore (NUS), Nanyang Technological University (NTU), and Singapore Management University (SMU).</s>
Prediction: Yes</s>


Question: What's the climate like in Singapore this time of year?
Context: tropical 
humidity 
It was indeed disappointing 
noisy 
poor weather 
Answer: The climate in Singapore this time of year is tropical, as stated in Doc[58].</s>
Prediction: Yes.</s>


Question: How does scamming affect victims' mental health in Singapore?
Context: The mental toll 
high anxiety 
end

In [None]:

# Raw count of verifier outputs that contained 'Yes' (numerator of the ratio printed above).
print(yescount)

25


## Matching Doc IDs

In [None]:
import re
def extract_answer_number(sentence):
    """Return the distinct document numbers cited as ``Doc[<digits>]`` in `sentence`.

    Args:
        sentence: text to scan for citations.

    Returns:
        A list of ints with duplicates removed (order not guaranteed), or an
        empty list when nothing matches.
    """
    cited = re.findall(r'Doc\[(\d+)\]', sentence)
    if not cited:
        return cited
    # Convert to int, then go through a set to drop repeated citations.
    return list(set(map(int, cited)))

def acc_cal(answers, prediction):
    """Score one prediction's citations as all-or-nothing.

    Args:
        answers: collection of acceptable document ids.
        prediction: generated text that may contain ``Doc[num]`` citations.

    Returns:
        1 if the prediction cites at least one document and every cited id is
        in `answers`; otherwise 0.
    """
    cited = extract_answer_number(prediction)
    if not cited:
        # No citation at all counts as a miss.
        return 0
    return 1 if all(num in answers for num in cited) else 0

def precision_cal(answers, prediction):
    """Return the fraction of cited document ids that are in `answers`.

    Args:
        answers: collection of acceptable (retrieved) document ids.
        prediction: generated text that may contain ``Doc[num]`` citations.

    Returns:
        ``len(cited & answers) / len(cited)`` as a float, or 0.0 when the
        prediction cites no document (matching acc_cal, which scores an
        uncited prediction as 0).
    """
    # Use the ids actually cited in the prediction. The previous code rebound
    # this set to `answers`, which made the ratio always 1.0 and raised
    # ZeroDivisionError whenever `answers` was empty.
    cited = set(extract_answer_number(prediction))
    if not cited:
        return 0.0
    return len(cited & set(answers)) / len(cited)

In [None]:
# df=pd.read_csv('ref_pred.csv')
# rag_ref_preds=df['prediction']
# Ids of the retrieved answers that passed THRESHOLD, one list per question.
ids_test = [[a.meta["id"] for a in ans if a.score > THRESHOLD] for ans in retrieved["answers"]]

# Mean citation accuracy over all predictions. The divisor is the number of
# predictions; the earlier `index + 1` divided by one too many.
num_predictions = len(rag_ref_preds)
total_accuracy = sum(
    acc_cal(ids_test[index], sentence)
    for index, sentence in enumerate(rag_ref_preds)
)
avg_acc = round(total_accuracy / num_predictions, 3) if num_predictions else 0.0
print(avg_acc)

0.322


In [None]:
# Mean citation precision over all predictions. The divisor is the number of
# predictions; the earlier `index + 1` divided by one too many.
num_predictions = len(rag_ref_preds)
total_precision = sum(
    precision_cal(ids_test[index], sentence)
    for index, sentence in enumerate(rag_ref_preds)
)
avg_precision = round(total_precision / num_predictions, 3) if num_predictions else 0.0
print(avg_precision)

## Mapping URLs to Answers

In [None]:
def get_ref_prompt_with_links(answers, query, df):
    """Build a prompt whose context lines are article headlines with URLs.

    Args:
        answers: iterable of ``(doc_id, text, score)`` triples; the text element
            is not used. Triples whose id is missing from `df` are skipped.
        query: the question text placed after the context lines.
        df: DataFrame with 'id', 'headline' and 'url' columns.

    Returns:
        The full prompt string: instructions, one
        ``headline [score](url)`` line per known document, then the question.
    """
    # id -> {'headline': ..., 'url': ...}
    articles_by_id = df.set_index('id')[['headline', 'url']].to_dict('index')

    context_lines = []
    for doc_id, _text, score in answers:
        if doc_id not in articles_by_id:
            continue
        article = articles_by_id[doc_id]
        context_lines.append(
            f"{article['headline']} [{round(score, 2)}]({article['url']})"
        )
    context = '\n'.join(context_lines)
    return f"""Create a concise and informative answer (no more than 50 words) for a given question
based solely on the given documents. You must only use information from the given documents.
Use an unbiased and journalistic tone. Do not repeat text. Cite the documents using headlines and URLs.
If multiple documents contain the answer, cite those documents like ‘as stated in [headline1](url1), [headline2](url2), etc.’.
If the documents do not contain the answer to the question, say that ‘answering is not possible given the available information.’
{context}
Question: {query}; Answer:
"""

# Try the headline/URL prompt on the first question only.
prediction = run(
    get_ref_prompt_with_links(contexts[0], questions[0], df)
)
print(prediction)  # check answer from your end

As stated in [headline1](https://www.straitstimes.com/singapore/politics/lawrence-wong-promoted-to-deputy-prime-minister-in-singapore-cabinet-reshuffle), Lawrence Wong has been promoted to Deputy Prime Minister as part of the Singapore Cabinet changes.</s>


Ans for 1: As stated in [headline](https://www.straitstimes.com/singapore/politics/lawrence-wong-promoted-to-deputy-prime-minister-in-singapore-cabinet-reshuffle), Lawrence Wong has been promoted to Deputy Prime Minister as part of the Singapore Cabinet changes.</s>


# Final News Formatting

In [None]:
# Article table; refined_predictions_ordered reads its 'id', 'headline' and 'url' columns.
df1 = pd.read_csv("data.csv")
# Saved predictions; refined_predictions_ordered reads its 'prediction' column.
df2 = pd.read_csv("ref_pred.csv")

In [None]:
# extract all document IDs
def extract_all_doc_ids(prediction):
    """Collect every document id referenced in `prediction`, as digit strings.

    The result lists ids written as ``Document[n]`` first, then ids written as
    ``Docs[n]``, then every bare ``[n]`` occurrence in text order. The bare
    pattern also matches the bracket part of the first two forms, so those ids
    appear twice; callers are expected to de-duplicate.

    Args:
        prediction: generated answer text.

    Returns:
        A list of id strings (duplicates possible), empty if none are found.
    """
    labelled = re.findall(r'Document\[(\d+)\]', prediction)
    labelled += re.findall(r'Docs\[(\d+)\]', prediction)

    bracketed = (token.strip('[]') for token in re.findall(r'\[\d+\]', prediction))
    return labelled + [token for token in bracketed if token.isdigit()]

In [None]:
# refine predictions and order document references
def refined_predictions_ordered(df1, df2):
    """Prefix each prediction with a verification banner and append its references.

    For every row of `df2`, the document ids found by extract_all_doc_ids are
    de-duplicated, sorted numerically and looked up in `df1`. Each id that
    exists in `df1` yields one ``- Document[id] : [headline](url)`` line.

    A prediction is marked "News Verified!" only when at least one cited id
    resolves to an article in `df1`; otherwise it is marked "News Unverified!"
    and no "References:" section is appended. (Previously any bracketed number
    produced a "Verified" banner, even with an empty reference list.)

    Args:
        df1: DataFrame with 'id', 'headline' and 'url' columns.
        df2: DataFrame with a 'prediction' column.

    Returns:
        A list of strings, one per row of `df2`, in row order.
    """
    # Build the id -> {'headline', 'url'} lookup once instead of filtering df1
    # for every cited id. Keeping the first row per id mirrors the original
    # `iloc[0]` choice and keeps the index unique for to_dict('index').
    articles = (
        df1.drop_duplicates(subset='id')
        .set_index('id')[['headline', 'url']]
        .to_dict('index')
    )

    refined_predictions = []
    for prediction in df2['prediction']:
        # Non-string cells (e.g. NaN for an empty CSV field) are treated as
        # empty text rather than crashing the regex search / concatenation.
        text = prediction if isinstance(prediction, str) else ""
        unique_doc_ids = sorted(set(extract_all_doc_ids(text)), key=int)

        reference_lines = []
        for doc_id in unique_doc_ids:
            article = articles.get(int(doc_id))
            if article is not None:
                reference_lines.append(
                    f"- Document[{doc_id}] : [{article['headline']}]({article['url']})\n"
                )

        if reference_lines:
            references = "\n\nReferences:\n\n" + "".join(reference_lines)
            verification_statement = "**News Verified!**\n\n"
        else:
            references = ""
            verification_statement = "**News Unverified!**\n\n"

        refined_predictions.append(verification_statement + text + references)
    return refined_predictions

In [None]:
# Annotate every saved prediction in df2 with a banner and references resolved from df1.
refined_ordered_predictions_list = refined_predictions_ordered(df1, df2)

In [None]:
# Write the annotated predictions to ref_pred_new.csv as a one-column table.
df3_refined_ordered = pd.DataFrame({'prediction': refined_ordered_predictions_list})
# df3_refined_ordered.head()
df3_refined_ordered.to_csv('ref_pred_new.csv')

## Verified News

### Question 1: Who is the Prime Minister of Singapore?

#### Answer:

**News Verified!**

The Prime Minister of Singapore is Lee Hsien Loong, as stated in Document[688], Document[1996], and Document[1774].</s>

References:

- Document[688] : [President Halimah to attend official mourning of UAE president in Abu Dhabi ](https://www.straitstimes.com/singapore/president-halimah-to-attend-official-mourning-of-uae-president-in-abu-dhabi)
- Document[1774] : [Brunei's Sultan Hassanal Bolkiah on two-day state visit to Singapore ](https://www.straitstimes.com/singapore/bruneis-sultan-hassanal-bolkiah-on-two-day-state-visit-singapore)
- Document[1996] : [PM Lee among top leaders to attend state funeral for ex-Japan PM Abe ](https://www.straitstimes.com/asia/east-asia/singapore-pm-lee-among-top-leaders-to-attend-state-funeral-for-ex-japan-pm-abe)

## Unverified News

### Question 2: What's the new scheme for Covid-19 swabs in Singapore?

#### Answer:

**News Unverified!**

The new scheme for Covid-19 swabs in Singapore is not provided in the given documents.</s>



## Complicated News

### Question 3: What are the key takeaways from the recent Singapore Airshow?

#### Answer:

**News Verified!**

The key takeaways from the recent Singapore Airshow include building new homes in Paya Lebar and attracting top talent, as stated in Document[1739] and [926]. Additionally, the new Airbus can be more fuel-efficient and easily deployed onto alternative routes, as stated in Document[376].</s>

References:

- Document[376] : [SIA confirms order of 7 freight planes, 22 passenger aircraft engines as industry recovers ](https://www.straitstimes.com/singapore/sia-confirms-order-of-7-freight-planes-22-passenger-aircraft-engines-as-industry-recovers)
- Document[926] : [Kranji Primary teachers go the extra mile to provide daily after-school programme ](https://www.straitstimes.com/singapore/parenting-education/kranji-primary-teachers-go-the-extra-mile-to-provide-daily-after-school-programme)
- Document[1739] : [Morning Briefing: Top stories from The Straits Times on Aug 22 ](https://www.straitstimes.com/singapore/morning-briefing-top-stories-from-the-straits-times-on-aug-22-2022)