In [1]:
import os
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PDFPlumberLoader
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
import gzip

In [2]:
def clean_html(raw_html):
    """Remove HTML tags from a string using BeautifulSoup."""
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text()

def load_nq_data_gz(filepath):
    """Load the Natural Questions dataset from a gzipped JSONL file and extract questions, long answers, and documents."""
    data = []
    i=0
    count=0
    
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:  # 'rt' mode for reading text
        for line in tqdm(f):
            entry = json.loads(line)
            #print(entry)
            question = entry["question_text"]
            document_text = entry["document_text"]
            annotations = entry.get("annotations", [])
            long_answer_text = None
            short_answers = []
            
            # Extract long answer from token spans
            if annotations:
                long_answer = annotations[0].get("long_answer", {})
                start_idx, end_idx = long_answer.get("start_token"), long_answer.get("end_token")
                
                if start_idx is not None and end_idx is not None:
                    tokenized_text = document_text.split()  # Basic tokenization
                    long_answer_text = " ".join(tokenized_text[start_idx:end_idx])
            
            # Extract short answers
            if annotations and "short_answers" in annotations[0]:
                short_answers = [
                    " ".join(document_text.split()[ans["start_token"]:ans["end_token"]])
                    for ans in annotations[0]["short_answers"]
                ]

            # Ignore entries without a long answer
            if long_answer_text:
                data.append({
                    "question": question,
                    "long_answer": clean_html(long_answer_text),
                    "document_text": clean_html(document_text)
                })
                i+=1
                if i==200:break
    
    return data

# Load dataset from gzipped file
file_path = "v1.0-simplified_simplified-nq-train.jsonl.gz"
nq_data = load_nq_data_gz(file_path)

361it [00:20, 17.81it/s]


In [16]:
def create_chunks_and_map_questions(documents, chunk_size=2024, chunk_overlap=200):
    """Splits documents into chunks and maps questions to relevant chunks simultaneously, avoiding duplicate documents."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n\n", "\n\n", "\n",  "."],
        add_start_index=True
    )
    
    all_chunks = []
    question_chunk_map = []
    chunk_indices = {}
    seen_documents = set()
    
    for doc_idx, doc in enumerate(tqdm(documents, desc="Processing documents")):
        doc_text = doc["document_text"]
        
        if doc_text in seen_documents:
            # If the document is already processed, only map the question
            question = doc['question']
            long_answer = doc['long_answer']
            for chunk_idx, chunk in chunk_indices.items():
                if long_answer in chunk:
                    question_chunk_map.append({
                        "question": question,
                        "chunk": chunk_idx
                    })
            continue
        
        seen_documents.add(doc_text)
        chunks = text_splitter.split_text(doc_text)
        start_index = len(all_chunks)  # Track the starting index of the new chunks
        temp_chunks = []
        temp_indices = {}
        
        for idx, chunk in enumerate(chunks):
            temp_chunks.append(chunk)
            temp_indices[start_index + idx] = chunk  # Store temporarily
        
        question = doc['question']
        long_answer = doc['long_answer']
        found = any(long_answer in chunk for chunk in temp_chunks)
        
        if found:
            all_chunks.extend(temp_chunks)
            chunk_indices.update(temp_indices)
            for chunk_idx, chunk in temp_indices.items():
                if long_answer in chunk:
                    question_chunk_map.append({
                        "question": question,
                        "chunk": chunk_idx
                    })
    
    return all_chunks, question_chunk_map

# Example usage
chunks, question_map = create_chunks_and_map_questions(nq_data[:139])


Processing documents: 100%|█████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 617.25it/s]


In [18]:
len(chunks)

1990

In [19]:
len(question_map)

100

Graph Evaluation

In [30]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

# Initialize embedding model
EMBEDDING_MODEL = OllamaEmbeddings(
    model="mxbai-embed-large",
    base_url="http://127.0.0.1:11434"
)

# Initialize in-memory vector store
DOCUMENT_VECTOR_DB = InMemoryVectorStore(EMBEDDING_MODEL)

# Convert each chunk into a Document object
for i in tqdm(range(len(chunks))):
    doc = [Document(page_content=chunks[i], metadata={"source": f"chunk{i}"})]
    
    # Add documents to vector database
    DOCUMENT_VECTOR_DB.add_documents(doc)


100%|███████████████████████████████████████████████████████████████████████████| 2090/2090 [19:28:50<00:00, 33.56s/it]


In [24]:
# Create graph
from MiniGraph import MiniRAGGraph

rag_graph = MiniRAGGraph(similarity_threshold=0.8)

# Construct Graph
rag_graph.construct_graph(chunks)  

2090it [3:02:08,  5.23s/it] 


In [100]:
rag_graph.visualize_graph()

KeyboardInterrupt: 

<Figure size 1400x1000 with 0 Axes>

In [59]:
%%time
x = DOCUMENT_VECTOR_DB.search("which is the most common use of opt-in e-mail marketing", search_type="similarity", k=3)

CPU times: total: 141 ms
Wall time: 358 ms


In [70]:
%%time
x = rag_graph.retrieve("which is the most common use of opt-in e-mail marketing", top_k=3)  

CPU times: total: 1.84 s
Wall time: 1.26 s


In [88]:
def recall(chunk_ids, k=5):
    """Evaluate Recall@k for document retrieval."""
    total = 0
    retrieved_correct = 0

    for i in range(len(question_map)):
        ground_truth = question_map[i]["chunk"]
        chunk_id = chunk_ids[i]
        if ground_truth and ground_truth in chunk_id:
            retrieved_correct += 1
        total += 1

    recall_k = retrieved_correct / total if total > 0 else 0
    return recall_k



In [74]:
retrieved_vec = [DOCUMENT_VECTOR_DB.search(entry["question"], search_type="similarity", k=20) for entry in question_map]
retreived_graph = [rag_graph.retrieve(entry["question"], top_k=20) for entry in question_map]

In [85]:
[doc["chunk_id"] for doc in retreived_graph[0]][:k]

[5]

In [99]:
for k in range(1, 21):
    vec_k_ids = []
    graph_k_ids = []
    # Select k ids
    for i in range(len(retrieved_vec)):
        vec_k_ids.append([int(doc.metadata["source"][5:]) for doc in retrieved_vec[i]][:k])
        graph_k_ids.append([doc["chunk_id"] for doc in retreived_graph[i]][:k])
    
    #print("k:", k)
    #print("vector:", recall(vec_k_ids))
    print(f"k= {k}, recall=", recall(graph_k_ids))
    #print()

k= 1, recall= 0.43
k= 2, recall= 0.56
k= 3, recall= 0.65
k= 4, recall= 0.7
k= 5, recall= 0.71
k= 6, recall= 0.75
k= 7, recall= 0.75
k= 8, recall= 0.79
k= 9, recall= 0.81
k= 10, recall= 0.82
k= 11, recall= 0.82
k= 12, recall= 0.83
k= 13, recall= 0.85
k= 14, recall= 0.86
k= 15, recall= 0.87
k= 16, recall= 0.89
k= 17, recall= 0.89
k= 18, recall= 0.9
k= 19, recall= 0.9
k= 20, recall= 0.9


Different Similaity thresholds