In [1]:
# Import necessary modules
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, Chroma
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from langchain.docstore.document import Document
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import torch
import numpy as np
from uuid import uuid4

  from .autonotebook import tqdm as notebook_tqdm


In [150]:
from huggingface_hub import login

login(token="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")

In [151]:
# Dataset is rag hallucinations 1000 

import pandas as pd

splits = {'train': 'data/train-00000-of-00001-44cee39b8c9485bf.parquet', 'test': 'data/test-00000-of-00001-ea85434570966ab6.parquet'}
train_df = pd.read_parquet("hf://datasets/neural-bridge/rag-hallucination-dataset-1000/" + splits["train"])
test_df =  pd.read_parquet("hf://datasets/neural-bridge/rag-hallucination-dataset-1000/" + splits["test"])


In [152]:
train_df= train_df.iloc[0:25, :]
train_df

Unnamed: 0,context,question,answer
0,The Lothian Cycle Campaign\nA new underpass of...,What is the proposed design of the underpass a...,This question cannot be answered.
1,Courage under fire. Farah Naqvi writes.india U...,What was John F Kennedy's stance on the first ...,This question cannot be answered.
2,Organizations take precautions in the midst of...,What is the name of the senior pastor of the L...,This question cannot be answered.
3,Ubuntu delusion: BuildYourUbuntu.com won’t com...,What is the name of the website that was propo...,This question cannot be answered.
4,jaw crushers jaw crushers are used for coarse ...,What is the maximum hardness of materials that...,This question cannot be answered.
5,It was Saturday morning and that i had simply ...,What is the origin of the Clay-Bar and its pur...,This question cannot be answered.
6,"!?php gravity_form_enqueue_scripts(2, true); ?...",What is the name of the charity that Oliver ra...,This question cannot be answered.
7,"Explore the Hausa, Yoruba and Igbo languages w...",What are the specific techniques used in the K...,This question cannot be answered.
8,It is a decision he has to come to on his own ...,What is the content of the sub_title in the gi...,This question cannot be answered.
9,Training Videos (HeinOnline) Search For Home T...,What is the purpose of the Congressional Recor...,This question cannot be answered.


In [153]:
# Convert DataFrame to documents with metadata with source id
documents = [
  Document(
      page_content=row['context'],
      metadata={'id': str(uuid4()), 'question': row['question'], 'answer': row['answer']}
  ) for _, row in train_df.iterrows()
]

documents

[Document(metadata={'id': '387c962f-54d3-4f66-a555-8518b3167383', 'question': 'What is the proposed design of the underpass at the Gogar rail/tram interchange?', 'answer': 'This question cannot be answered.'}, page_content='The Lothian Cycle Campaign\nA new underpass of the A8, at a critical location, will only allow bikes to be pushed – unless you object now!!\nNetwork Rail has submitted a planning application to build the Gogar rail/tram interchange – a project which Spokes fully supports. The interchange will be on the north side of the A8, just on the Edinburgh side of gogar roundabout.\nUnfortunately Network Rail is not taking cycling seriously. They are doing the easy bits, like connecting the new station to the north side of the A8 and providing bike parking. However, a new underpass of the A8, linking the Gyle to the new station will be designed for walkers only, although you will be allowed to push a bike.\nEven if you are short of time, it is quick and easy to make a very sho

## Only Textual matching

In Retrieval-Augmented Generation (RAG), indexing refers to the process of creating a data structure that maps input queries to relevant documents or passages in a large corpus. The goal of indexing is to enable efficient retrieval of relevant information that can be used to generate text.

In TF-IDF based retrieval, indexing involves creating an inverted index of the corpus, where each word or term is associated with a list of documents that contain it. The index is typically represented as a matrix, where each row corresponds to a document and each column corresponds to a term. The cell at row i and column j contains the TF-IDF weight of term j in document i.

When a query is issued, the TF-IDF weights are used to compute a similarity score between the query and each document in the index. The documents with the highest similarity scores are retrieved and used to generate text.

When designing the indexing step, there are a few design choices to make:
- Data processing mode
- Indexing model
- Text splitting method
- Chunking hyperparameters


In [154]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=500,
  chunk_overlap=50,
  length_function=len
)
chunks = text_splitter.split_documents(documents)
chunks

[Document(metadata={'id': '387c962f-54d3-4f66-a555-8518b3167383', 'question': 'What is the proposed design of the underpass at the Gogar rail/tram interchange?', 'answer': 'This question cannot be answered.'}, page_content='The Lothian Cycle Campaign\nA new underpass of the A8, at a critical location, will only allow bikes to be pushed – unless you object now!!\nNetwork Rail has submitted a planning application to build the Gogar rail/tram interchange – a project which Spokes fully supports. The interchange will be on the north side of the A8, just on the Edinburgh side of gogar roundabout.'),
 Document(metadata={'id': '387c962f-54d3-4f66-a555-8518b3167383', 'question': 'What is the proposed design of the underpass at the Gogar rail/tram interchange?', 'answer': 'This question cannot be answered.'}, page_content='Unfortunately Network Rail is not taking cycling seriously. They are doing the easy bits, like connecting the new station to the north side of the A8 and providing bike parkin

In [155]:
# Create TF-IDF encodings
tfidf = TfidfVectorizer()
tfidf_vectors = tfidf.fit_transform([chunk.page_content for chunk in chunks]) 
# fit_transform learns vocabulary from data and then applies tf-idf to convert the data into a matrix with tf-idf features
tfidf_vectors

<242x3695 sparse matrix of type '<class 'numpy.float64'>'
	with 11390 stored elements in Compressed Sparse Row format>

In [156]:
# Create BM25 index
tokenized_corpus = [chunk.page_content.split() for chunk in chunks]
bm25 = BM25Okapi(tokenized_corpus)
bm25

<rank_bm25.BM25Okapi at 0x7efe537c41c0>

In [157]:
# Setup LLAMA model
def init_llama():
  model_name = "meta-llama/Llama-2-7b-chat-hf"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      torch_dtype=torch.float16,
      device_map="auto"
  )
  pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      max_length=512,
      temperature=0.7
  )
  return HuggingFacePipeline(pipeline=pipe)

llm = init_llama()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


There are a few things you would need to think about when designing the retrieval step:
- Retrieval strategy
- Retrieval hyperparameters
- Query transformations


In [158]:
# Create Chunk-level Retrievel Function with BM25
def retrieve_relevant_chunks(query, k=3):
  # Get BM25 scores
  tokenized_query = query.split()
  bm25_scores = bm25.get_scores(tokenized_query)
  
  # Get top k chunks based on BM25 scores
  top_k_indices = bm25_scores.argsort()[-k:][::-1]
  retrieved_chunks = [chunks[i] for i in top_k_indices]
  
  return retrieved_chunks

In [159]:
# Create Response generation Function
def generate_response(query):
  
  # Retrieve relevant chunks
  relevant_chunks = retrieve_relevant_chunks(query)

  # Print the chunks
  print("Retrieved Chunks:")
  for i, chunk in enumerate(relevant_chunks):
      print(f"\nChunk {i+1}:")
      print(chunk.page_content)
  
  # Combine chunks into context
  context = "\n".join([chunk.page_content for chunk in relevant_chunks])
  
  # Create prompt
  prompt = f"""Use the following context to answer the question. If you don't know the answer or if the answer is not in the context, just say "This question cannot be answered." Do not try to make up an answer.

  Context: {context}

  Question: {query}
  Answer:"""
  
  # Generate response
  response = llm(prompt)
  return response

In [160]:
train_df.iloc[5,1]

'What is the origin of the Clay-Bar and its purpose in car detailing?'

In [161]:
# Example usage
query = ' What is clay-bar and where is originated from? mention the purpose of that in car detailing?'
response = generate_response(query)
print(response)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Retrieved Chunks:

Chunk 1:
What do you think of Oliver’s achievement? Let us know in the comments.

Chunk 2:
What I found myelf doing during the reading was subsituting – at times unconciously – ‘the law or statutes’ with the words LOVE and GRACE. For God is LOVE and by his grace we live. And here are a few of the verses that jumped out:

Chunk 3:
I had a short conversation with a dear friend yesterday that stuck with me all day long and through a long drive in which I listened to an audiobook of Richard Rohr’s, thereby inviting him into the conversation going on in my little ‘ol head. My friend and I had raised questions with each other about the motivation behind prayer and what prayer looks like. We talked about the in-dwelling of the Holy Spirit. Transformation vs. performance and where the former comes from, and how. Rohr’s
Use the following context to answer the question. If you don't know the answer or if the answer is not in the context, just say "This question cannot be answe

The most important aspects of the evaluation step are:
- Evaluation protocol
- Evaluator prompts
- Model guidelines


In [162]:
def evaluate_model(df):
  results = []
  print("Results:", results)
  
  for _, row in df.iterrows():
      query = row['question']
      true_answer = row['answer']
      generated_answer = generate_response(query)
      retrieved_chunks = retrieve_relevant_chunks(query)
      
      results.append({
          'question': query,
          'true_answer': true_answer,
          'generated_answer': generated_answer,
          'retrieved_chunks': retrieved_chunks
          
      })
  
  return pd.DataFrame(results)


In [163]:
# Calculate metrics
def calculate_metrics(eval_results):
  total = len(eval_results)
  correct = sum(eval_results['true_answer'] == eval_results['generated_answer'])
  accuracy = correct / total
  hallucination_rate = 1 - accuracy
  
  return {
      'accuracy': accuracy,
      'hallucination_rate': hallucination_rate
  }

In [164]:
# Evaluate model
eval_results = evaluate_model(train_df)
metrics = calculate_metrics(eval_results)

print("Evaluation Results:", eval_results)
print(f"Accuracy: {metrics['accuracy']:.2f}")
print(f"Hallucination Rate: {metrics['hallucination_rate']:.2f}")


Results: []
Retrieved Chunks:

Chunk 1:
The Lothian Cycle Campaign
A new underpass of the A8, at a critical location, will only allow bikes to be pushed – unless you object now!!
Network Rail has submitted a planning application to build the Gogar rail/tram interchange – a project which Spokes fully supports. The interchange will be on the north side of the A8, just on the Edinburgh side of gogar roundabout.

Chunk 2:
b. The path from the underpass should continue through the station site so as to link on to other expected future developments in the area, allowing continuous cycle access between them and the Gyle. At present only a 2m wide footpath is proposed beyond the station to a future ‘International Business Gateway’ development.

Chunk 3:
What I found myelf doing during the reading was subsituting – at times unconciously – ‘the law or statutes’ with the words LOVE and GRACE. For God is LOVE and by his grace we live. And here are a few of the verses that jumped out:
Retrieved Chu

In [165]:
# Assuming your DataFrame is called 'df'
eval_results['extracted_answer'] = eval_results['generated_answer'].str.split('Answer:').str[-1].str.strip()

# If you want to remove the quotation marks at the beginning and end
eval_results['extracted_answer'] = eval_results['extracted_answer'].str.strip("'")

eval_results

Unnamed: 0,question,true_answer,generated_answer,retrieved_chunks,extracted_answer
0,What is the proposed design of the underpass a...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='The Lothian Cycle Campaign\nA n...,This question cannot be answered based on the ...
1,What was John F Kennedy's stance on the first ...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Courage under fire. Farah Naqvi...,This question cannot be answered. The context ...
2,What is the name of the senior pastor of the L...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Rev. Brian Jones, the senior pa...",Rev. Brian Jones
3,What is the name of the website that was propo...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='I can’t really understand if we...,This question cannot be answered based on the ...
4,What is the maximum hardness of materials that...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='jaw crushers jaw crushers are u...,This question cannot be answered based on the ...
5,What is the origin of the Clay-Bar and its pur...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='What do you think of Oliver’s a...,This question cannot be answered based on the ...
6,What is the name of the charity that Oliver ra...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='“On the day I solved it for the...,This question cannot be answered.
7,What are the specific techniques used in the K...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Explore the Hausa, Yoruba and I...",This question cannot be answered based on the ...
8,What is the content of the sub_title in the gi...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='What do you think of Oliver’s a...,This question cannot be answered. The subtitle...
9,What is the purpose of the Congressional Recor...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Resolutions in the U.S. Congres...,This question cannot be answered.


In [166]:
eval_results.loc[10,"question"]

"What is the correlation between a teenager's rebellious behavior and their parents' disciplinary methods?"

In [167]:
eval_results.loc[10,"retrieved_chunks"]

[Document(metadata={'id': 'e9c9fe75-661f-424f-9acb-014cda04ed11', 'question': "What is the correlation between a teenager's rebellious behavior and their parents' disciplinary methods?", 'answer': 'This question cannot be answered.'}, page_content='The problem is that most parents only have the model of discipline that was meted out to them as children and teens. For many, identification with the normal rebellious and defiant aspects of the teenage years has never been normalized by mature growth; thus they either subtly (or not so subtly) stimulate “bad behavior,” only to follow it, in their own anxiety and confusion, by administering punishment to the adolescent. Not surprisingly, this causes the teen perplexity, anxiety, and confusion.'),
 Document(metadata={'id': '4430e44b-e464-49ad-87c7-c071f03a2e14', 'question': 'What is the purpose of the Congressional Record Daily to Bound Locator in HeinOnline?', 'answer': 'This question cannot be answered.'}, page_content='the relationship be

In [168]:
eval_results.loc[10,"extracted_answer"]

'This question cannot be answered.'

Main Points to be remembered from this experiment

- I has used only 25 samples from my train_df.
- For documents Splitting I used "RecursiveCharacterTextSplitter" with chunk_size=500, chunk_overlap=50.
- Used TfidfVectorizer for creating TF-IDF encodings
- For indexing used bm25
- Used meta-llama/Llama-2-7b-chat-hf model for text generation purpose, torch_dtype is float16, max length of the each embeddings is 512, temperature is 0.7 
- Performed Chunk-level Retrievel with by taking BM25 scores and taking top 3 chunks based on the scores
- After Retrieving the relevant chunks, combined all these chunks into context to send to llm as a prompt
- Need to check for relevant evaluation metrics to calculate the hallucination scores and model accuracy.
- When testing the outputs of the model, I have noticed that out of 3 chunks which got retrieved, only one chunk is relevent with relevant context and (question, answer in metadata), and others are not at all relevent for the question asked to llm
- The answer is mostly This question cannot be answered with some additional explanation given by the llm
- The time taken for generating the results for 25 samples is around 7 min


## Only Semantic matching

In [169]:
documents

[Document(metadata={'id': '387c962f-54d3-4f66-a555-8518b3167383', 'question': 'What is the proposed design of the underpass at the Gogar rail/tram interchange?', 'answer': 'This question cannot be answered.'}, page_content='The Lothian Cycle Campaign\nA new underpass of the A8, at a critical location, will only allow bikes to be pushed – unless you object now!!\nNetwork Rail has submitted a planning application to build the Gogar rail/tram interchange – a project which Spokes fully supports. The interchange will be on the north side of the A8, just on the Edinburgh side of gogar roundabout.\nUnfortunately Network Rail is not taking cycling seriously. They are doing the easy bits, like connecting the new station to the north side of the A8 and providing bike parking. However, a new underpass of the A8, linking the Gyle to the new station will be designed for walkers only, although you will be allowed to push a bike.\nEven if you are short of time, it is quick and easy to make a very sho

In [170]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=500,
  chunk_overlap=50,
  length_function=len,
  separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)

In [171]:
# Split documents into chunks
chunks = text_splitter.split_documents(documents)
print(f"Number of documents after splitting: {len(chunks)}")

Number of documents after splitting: 247


In [172]:
# Initialize HuggingFace embeddings
embeddings = HuggingFaceEmbeddings(
  model_name="sentence-transformers/all-MiniLM-L6-v2",
  model_kwargs={'device': 'cuda'} if torch.cuda.is_available() else {'device': 'cpu'}
)

In [173]:
# Create FAISS vector store
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore
# Save the vector store (optional)
# vectorstore.save_local("faiss_index")

<langchain_community.vectorstores.faiss.FAISS at 0x7efe87f0de20>

In [174]:
# Set up LLaMA model
def init_llama():
  model_name = "meta-llama/Llama-2-7b-chat-hf"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      torch_dtype=torch.float16,
      device_map="auto"
  )
  pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      max_length=512,
      temperature=0.7
  )
  return HuggingFacePipeline(pipeline=pipe)

llm = init_llama()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [175]:
# Create semantic retrieval function using cosine similarity

def retrieve_relevant_chunks(query, k=3):
  # Retrieve similar documents using cosine similarity
  relevant_docs = vectorstore.similarity_search(query, k=k)
  return relevant_docs

# # Function to inspect retrieved chunks (for debugging)
# def inspect_retrieved_chunks(chunks):
#   print("\nRetrieved Chunks:")
#   for i, chunk in enumerate(chunks, 1):
#       print(f"\nChunk {i}:")
#       print(f"Content: {chunk.page_content[:200]}...")
#       print(f"Metadata: {chunk.metadata}")

In [176]:
# Create response generation function

def generate_response(query, debug=False):
  # Retrieve relevant chunks
  relevant_chunks = retrieve_relevant_chunks(query)
  
#   if debug:
#       inspect_retrieved_chunks(relevant_chunks)
  
  # Combine chunks into context
  context = "\n".join([chunk.page_content for chunk in relevant_chunks])
  
  # Create prompt
  prompt = f"""Use the following context to answer the question. If you don't know the answer or if the answer is not in the context, just say "This question cannot be answered." Do not try to make up an answer.

  Context: {context}

  Question: {query}
  Answer:"""
  
  # Generate response
  response = llm(prompt)
  
#   if debug:
#       print(f"\nGenerated Response: {response}")
  
  return {"response": response, "retrieved_chunks": relevant_chunks}

In [177]:
# Evaluation functions

def evaluate_model(df):
  results = []
  for _, row in df.iterrows():
      query = row['question']
      true_answer = row['answer']
      
      # Get response and retrieved chunks
      result = generate_response(query)
      generated_answer = result["response"]
      retrieved_chunks = result["retrieved_chunks"]
      
      results.append({
          'question': query,
          'true_answer': true_answer,
          'generated_answer': generated_answer,
          'retrieved_chunks': retrieved_chunks
      })
  
  return pd.DataFrame(results)

def calculate_metrics(eval_results):
  total = len(eval_results)
  correct = sum(eval_results['true_answer'] == eval_results['generated_answer'])
  accuracy = correct / total
  hallucination_rate = 1 - accuracy
  
  return {
      'accuracy': accuracy,
      'hallucination_rate': hallucination_rate
  }

In [178]:
print("Starting evaluation...")
eval_results = evaluate_model(train_df)
metrics = calculate_metrics(eval_results)

print("\nEvaluation Results:")
print(f"Accuracy: {metrics['accuracy']:.2f}")
print(f"Hallucination Rate: {metrics['hallucination_rate']:.2f}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Starting evaluation...

Evaluation Results:
Accuracy: 0.00
Hallucination Rate: 1.00


In [179]:
eval_results

Unnamed: 0,question,true_answer,generated_answer,retrieved_chunks
0,What is the proposed design of the underpass a...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='The Lothian Cycle Campaign\nA n...
1,What was John F Kennedy's stance on the first ...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Even ‘liberal’ John F Kennedy c...
2,What is the name of the senior pastor of the L...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Rev. Brian Jones, the senior pa..."
3,What is the name of the website that was propo...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Ubuntu delusion: BuildYourUbunt...
4,What is the maximum hardness of materials that...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='jaw crushers jaw crushers are u...
5,What is the origin of the Clay-Bar and its pur...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='So what is a new Clay Bar?\nIns...
6,What is the name of the charity that Oliver ra...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='“On the day I solved it for the...
7,What are the specific techniques used in the K...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Explore the Hausa, Yoruba and I..."
8,What is the content of the sub_title in the gi...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='What is it the word means?\nDo ...
9,What is the purpose of the Congressional Recor...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='.S. Congressional Documents dat...


In [180]:
# Assuming your DataFrame is called 'df'
eval_results['extracted_answer'] = eval_results['generated_answer'].str.split('Answer:').str[-1].str.strip()

# If you want to remove the quotation marks at the beginning and end
eval_results['extracted_answer'] = eval_results['extracted_answer'].str.strip("'")

eval_results

Unnamed: 0,question,true_answer,generated_answer,retrieved_chunks,extracted_answer
0,What is the proposed design of the underpass a...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='The Lothian Cycle Campaign\nA n...,a. The underpass will only allow bikes to be p...
1,What was John F Kennedy's stance on the first ...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Even ‘liberal’ John F Kennedy c...,This question cannot be answered based on the ...
2,What is the name of the senior pastor of the L...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Rev. Brian Jones, the senior pa...",This question cannot be answered.
3,What is the name of the website that was propo...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='Ubuntu delusion: BuildYourUbunt...,This question cannot be answered based on the ...
4,What is the maximum hardness of materials that...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='jaw crushers jaw crushers are u...,This question cannot be answered based on the ...
5,What is the origin of the Clay-Bar and its pur...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='So what is a new Clay Bar?\nIns...,This question cannot be answered.
6,What is the name of the charity that Oliver ra...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='“On the day I solved it for the...,This question cannot be answered.
7,What are the specific techniques used in the K...,This question cannot be answered.,Use the following context to answer the questi...,"[page_content='Explore the Hausa, Yoruba and I...",This question cannot be answered.
8,What is the content of the sub_title in the gi...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='What is it the word means?\nDo ...,This question cannot be answered. The sub_titl...
9,What is the purpose of the Congressional Recor...,This question cannot be answered.,Use the following context to answer the questi...,[page_content='.S. Congressional Documents dat...,This question cannot be answered based on the ...


In [181]:
eval_results.loc[10,'question']

"What is the correlation between a teenager's rebellious behavior and their parents' disciplinary methods?"

In [182]:
eval_results.loc[10,"retrieved_chunks"]

[Document(metadata={'id': 'e9c9fe75-661f-424f-9acb-014cda04ed11', 'question': "What is the correlation between a teenager's rebellious behavior and their parents' disciplinary methods?", 'answer': 'This question cannot be answered.'}, page_content='The problem is that most parents only have the model of discipline that was meted out to them as children and teens. For many, identification with the normal rebellious and defiant aspects of the teenage years has never been normalized by mature growth; thus they either subtly (or not so subtly) stimulate “bad behavior,” only to follow it, in their own anxiety and confusion, by administering punishment to the adolescent. Not surprisingly, this causes the teen perplexity, anxiety, and confusion.'),
 Document(metadata={'id': 'e9c9fe75-661f-424f-9acb-014cda04ed11', 'question': "What is the correlation between a teenager's rebellious behavior and their parents' disciplinary methods?", 'answer': 'This question cannot be answered.'}, page_content=

In [183]:
eval_results.loc[10,"extracted_answer"]

'This question cannot be answered based on the provided context.'

Main Points to be remembered from this experiment

- I has used only 25 samples from my train_df.
- For documents Splitting I used "RecursiveCharacterTextSplitter" with chunk_size=500, chunk_overlap=50.
- Used Huggingfaceembeddings for creating vector representations
- Created the FAISS vector store where we indexed and stored all the vector embeddings in this store
- Used meta-llama/Llama-2-7b-chat-hf model for text generation purpose torch_dtype is float16, max length of the each embeddings is 512, temperature is 0.7
- Performed Chunk-level semantic Retrievel with by calculating the cosine similarity scores and taking top 3 chunks based on the scores
- After Retrieving the relevant chunks, combined all these chunks into context to send to llm as a prompt
- ! Need to check for relevant evaluation metrics to calculate the hallucination scores and model accuracy.
- When testing the outputs of the model, I have noticed that out of 3 chunks which got retrieved, most of them got retrieved correctly, but ins few samples 2 chunks are relevent with relevant context and (question, answer in metadata), and one is not at all relevent for the question asked to llm
- The answer is mostly This question cannot be answered with some additional explanation given by the llm
- The time taken for generating the results for 25 samples is around 6 min

## Hybrid matching

In [184]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

In [185]:
# Create TF-IDF encodings
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([chunk.page_content for chunk in chunks])

# Create semantic embeddings
embeddings = HuggingFaceEmbeddings(
  model_name="sentence-transformers/all-MiniLM-L6-v2",
  model_kwargs={'device': 'cuda'} if torch.cuda.is_available() else {'device': 'cpu'}
)

# semantic_embeddings = embeddings.embed_documents([chunk.page_content for chunk in chunks])


In [186]:
# Create FAISS index for semantic search
vectorstore = FAISS.from_documents(chunks, embeddings)

In [187]:
#Retrieval methods using BM25 and vector similarity
def bm25_retrieval(query, top_k=5):
  tokenized_corpus = [chunk.page_content.split() for chunk in chunks]
  bm25 = BM25Okapi(tokenized_corpus)
  tokenized_query = query.split()
  doc_scores = bm25.get_scores(tokenized_query)
  top_indices = np.argsort(doc_scores)[-top_k:][::-1] #top k chunks based on bm25 scores
  return [chunks[i] for i in top_indices]

def semantic_retrieval(query, top_k=5):
  return vectorstore.similarity_search(query, k=top_k)

In [188]:
# reciprocal rank fusion with source id with universal unique identifier
def reciprocal_rank_fusion(results_list, k=60):
  fused_scores = {}
  for rank, results in enumerate(results_list):
      for doc in results:
          doc_id = doc.metadata['id']
          if doc_id not in fused_scores:
              fused_scores[doc_id] = 0
          fused_scores[doc_id] += 1 / (rank + k)
  
  sorted_docs = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
  return [doc for doc in results_list[0] if doc.metadata['id'] in [doc_id for doc_id, _ in sorted_docs]]


def retrieve_and_fuse(query, top_k=5):
  bm25_results = bm25_retrieval(query, top_k)
  semantic_results = semantic_retrieval(query, top_k)
  fused_results = reciprocal_rank_fusion([bm25_results, semantic_results])
  return fused_results[:top_k]

In [189]:
def get_llama_pipeline():
  model_name = "meta-llama/Llama-2-7b-chat-hf"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=512)
  llm = HuggingFacePipeline(pipeline=pipe)
  return llm

In [190]:
def generate_response(query, top_k=3):
  retrieved_chunks = retrieve_and_fuse(query, top_k)
  context = "\n".join([chunk.page_content for chunk in retrieved_chunks])
  
  prompt_template = """
  Use the following pieces of context to answer the question at the end. If you don't know the answer or if the answer is not in the context, just say "This question cannot be answered." Do not try to make up an answer.

  Context:
  {context}

  Question: {question}
  Answer:
  """
  
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
  
  llm = get_llama_pipeline()
  chain = LLMChain(llm=llm, prompt=prompt)
  
  response = chain.run(context=context, question=query)
  return {"response": response,"retrieved_chunks": retrieved_chunks}

In [191]:
# Example usage
query = "What is the maximum hardness of materials that jaw crushers can be used for grinding?"
response = generate_response(query)
print(response)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'response': '\n  Use the following pieces of context to answer the question at the end. If you don\'t know the answer or if the answer is not in the context, just say "This question cannot be answered." Do not try to make up an answer.\n\n  Context:\n  jaw crushers jaw crushers are used for coarse grinding brittle,medium hard to hard materials up to a mohs hardness ofOnline Service \n4 jaw crusher a deflector plate at the top of the moving jaw means that no intrusive cross wall is required in the feed hopper.all of the effectiveOnline Service \ngrinding mill,stone production line,jaw crusher,impact crusher are our main products in jpr machinery.we are commited to serving our best toOnline Service\nsavona equipment offers new and used jaw crushers for sale in the usa,canada,mexico and south america.Online Service \nmbmmllc s jaw crushers are built in the usa provide years of amazing performance.multiple sizes are available,perfect for your job s needs.shop todayOnline Service\npowerscr

In [192]:
train_df.iloc[4,1]

'What is the maximum hardness of materials that jaw crushers can be used for grinding?'

In [193]:
def evaluate_model(df):
  results = []
  for _, row in df.iterrows():
      query = row['question']
      context = row['context']
      true_answer = row['answer']
      # Get response and retrieved chunks
      result = generate_response(query)
      generated_answer = result["response"]
      retrieved_chunks = result["retrieved_chunks"]
      
      results.append({
          'query': query,
          'context': context,
          'true_answer': true_answer,
          'generated_answer': generated_answer,
          'retrieved_chunks': retrieved_chunks
      })
  return pd.DataFrame(results)

In [194]:
def calculate_hallucination_metrics(evaluation_results):
  total = len(evaluation_results)
  correct_responses = sum(evaluation_results['true_answer'] == evaluation_results['generated_answer'])
  hallucinations = total - correct_responses
  
  hallucination_rate = hallucinations / total
  accuracy = correct_responses / total
  
  return {
      'hallucination_rate': hallucination_rate,
      'accuracy': accuracy
  }

In [195]:
# Assuming you have your test dataset in a DataFrame called test_df
evaluation_results = evaluate_model(train_df)
metrics = calculate_hallucination_metrics(evaluation_results)

print("Evaluation Metrics:")
print(f"Hallucination Rate: {metrics['hallucination_rate']:.2f}")
print(f"Accuracy: {metrics['accuracy']:.2f}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely 

Evaluation Metrics:
Hallucination Rate: 1.00
Accuracy: 0.00


In [196]:
# Assuming your DataFrame is called 'df'
evaluation_results['extracted_answer'] = evaluation_results['generated_answer'].str.split('Answer:').str[-1].str.strip()

# If you want to remove the quotation marks at the beginning and end
evaluation_results['extracted_answer'] = evaluation_results['extracted_answer'].str.strip("'")

evaluation_results

Unnamed: 0,query,context,true_answer,generated_answer,retrieved_chunks,extracted_answer
0,What is the proposed design of the underpass a...,The Lothian Cycle Campaign\nA new underpass of...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='The Lothian Cycle Campaign\nA n...,1. This question cannot be answered.\n 2. The...
1,What was John F Kennedy's stance on the first ...,Courage under fire. Farah Naqvi writes.india U...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='Courage under fire. Farah Naqvi...,This question cannot be answered.
2,What is the name of the senior pastor of the L...,Organizations take precautions in the midst of...,This question cannot be answered.,\n Use the following pieces of context to ans...,"[page_content='Rev. Brian Jones, the senior pa...",
3,What is the name of the website that was propo...,Ubuntu delusion: BuildYourUbuntu.com won’t com...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='I can’t really understand if we...,This question cannot be answered.
4,What is the maximum hardness of materials that...,jaw crushers jaw crushers are used for coarse ...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='jaw crushers jaw crushers are u...,This question cannot be answered based on the ...
5,What is the origin of the Clay-Bar and its pur...,It was Saturday morning and that i had simply ...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='What do you think of Oliver’s a...,This question cannot be answered based on the ...
6,What is the name of the charity that Oliver ra...,"!?php gravity_form_enqueue_scripts(2, true); ?...",This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='“On the day I solved it for the...,This question cannot be answered.
7,What are the specific techniques used in the K...,"Explore the Hausa, Yoruba and Igbo languages w...",This question cannot be answered.,\n Use the following pieces of context to ans...,"[page_content='Explore the Hausa, Yoruba and I...",This question cannot be answered. The context ...
8,What is the content of the sub_title in the gi...,It is a decision he has to come to on his own ...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='What do you think of Oliver’s a...,This question cannot be answered.
9,What is the purpose of the Congressional Recor...,Training Videos (HeinOnline) Search For Home T...,This question cannot be answered.,\n Use the following pieces of context to ans...,[page_content='Resolutions in the U.S. Congres...,This question cannot be answered. The purpose ...


In [201]:
evaluation_results.loc[19,"query"]

'What is the name of the breast cancer education program for Hispanic women at Moffitt Cancer Center?'

In [202]:
evaluation_results.loc[19,"retrieved_chunks"]

[Document(metadata={'id': '911d0e33-b48c-45e3-bb4c-eaecc8df835d', 'question': 'What is the name of the breast cancer education program for Hispanic women at Moffitt Cancer Center?', 'answer': 'This question cannot be answered.'}, page_content='Hispanic women historically have lower rates of breast cancer than do white, black and other minority women, according to the American Cancer Society. But breast cancer is still the leading cause of cancer death in Hispanic women, possibly because it is detected at a more advanced stage.'),
 Document(metadata={'id': '911d0e33-b48c-45e3-bb4c-eaecc8df835d', 'question': 'What is the name of the breast cancer education program for Hispanic women at Moffitt Cancer Center?', 'answer': 'This question cannot be answered.'}, page_content='And while breast cancer rates among African-American women are lower than among white women, black women are 41 percent more likely to die of the disease. Lack of early detection may be contributing to this disparity, ma

In [204]:
evaluation_results.loc[19,"generated_answer"]

'\n  Use the following pieces of context to answer the question at the end. If you don\'t know the answer or if the answer is not in the context, just say "This question cannot be answered." Do not try to make up an answer.\n\n  Context:\n  Hispanic women historically have lower rates of breast cancer than do white, black and other minority women, according to the American Cancer Society. But breast cancer is still the leading cause of cancer death in Hispanic women, possibly because it is detected at a more advanced stage.\nAnd while breast cancer rates among African-American women are lower than among white women, black women are 41 percent more likely to die of the disease. Lack of early detection may be contributing to this disparity, many experts think.\n"That\'s why mammography screening is still so important for women in minority groups," said B. Lee Green, vice president for diversity at Moffitt Cancer Center and a senior member of the Health Outcomes and Behavior Program.\nTob

Main Points to be remembered from this experiment

- I has used only 25 samples from my train_df.
- For documents Splitting I used "RecursiveCharacterTextSplitter" with chunk_size=500, chunk_overlap=50.
- Used Huggingfaceembeddings for creating vector embeddings and tfidf for creating encodings
- Created the FAISS vector store where we indexed and stored all the vector embeddings in this store and used bm25 for indexing the encodings
- Used meta-llama/Llama-2-7b-chat-hf model for text generation purpose torch_dtype is float16, max length of the each embeddings is 512, temperature is 0.7
- Performed Chunk-level semantic Retrievel by calculating the cosine similarity scores for semantic embeddings and bm25 scores for text embeddings taking top 3 chunks based on the scores
- After Retrieving the relevant chunks, combined all these chunks and ranked them accordingly using reciprocal rank fusion into context to send to llm as a prompt
- ! Need to check for relevant evaluation metrics to calculate the hallucination scores and model accuracy.
- When testing the outputs of the model, I have noticed that out of 3 chunks which got retrieved, most of them got retrieved correctly, but ins few samples 2 chunks are relevent with relevant context and (question, answer in metadata), and one is not at all relevent for the question asked to llm
- The answer is mostly This question cannot be answered with some additional explanation given by the llm
- The time taken for generating the results for 25 samples is around 14 min