In [26]:
# Ollama model name; passed to both `Ollama` and `OllamaEmbeddings` in the next cell.
MODEL = 'llama3.2'
# Model name handed to `SentenceTransformer` when the page embeddings are built.
TRANSFORMER = 'all-MiniLM-L6-v2'

In [12]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# LLM client; used as the middle step of `chain` (prompt | model | parser) further down.
model = Ollama(model=MODEL)
# NOTE(review): this OllamaEmbeddings instance is not used by the retrieval code visible
# in this notebook, which embeds with SentenceTransformer instead — confirm it is needed.
embeddings = OllamaEmbeddings(model=MODEL)

In [36]:
from langchain_community.document_loaders import PyPDFLoader

# Load the book PDF. `pages` holds the documents returned by `load_and_split()`;
# `format_pages` later reads each one through `.metadata` and `.page_content`.
loader = PyPDFLoader('dataset/book.pdf')
pages = loader.load_and_split()

In [42]:
import re

def clean_text(text):
    """
    Clean and preprocess the text:
    - Rejoin words that were hyphenated across a line break
    - Collapse every run of whitespace (newlines included) into a single space
    - Trim leading and trailing whitespace

    Args:
        text: Raw text string to clean.

    Returns:
        The cleaned, single-line string.
    """
    # `\r?` makes the de-hyphenation also work on Windows (CRLF) line endings;
    # with a bare `-\n` the pattern misses "-\r\n" and leaves "exam- ple" behind.
    text = re.sub(r'-\r?\n', '', text)
    # `\s+` already covers newlines, so one pass replaces the separate `\n+` step.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def format_pages(pages):
    """Build one `{"page": ..., "text": ...}` record per entry of `pages`.

    The page number comes from each entry's `metadata` (0 when the 'page' key is
    absent) and the text is the entry's `page_content` passed through `clean_text`.
    """
    return [
        {
            "page": doc.metadata.get('page', 0),
            "text": clean_text(doc.page_content),
        }
        for doc in pages
    ]

# One {"page", "text"} dict per loaded document; feeds the embedding step and `page_metadata`.
book_data = format_pages(pages)

In [44]:
from datasets import Dataset

# Wrap the cleaned page records in a `datasets.Dataset`.
# NOTE(review): `dataset` is not referenced by any later cell visible here — confirm it is still needed.
dataset = Dataset.from_list(book_data)

In [45]:
from sentence_transformers import SentenceTransformer
import faiss

# Embed every cleaned page record and store the vectors in a flat L2 FAISS index.
# Row i of the index corresponds to book_data[i].
embedding_model = SentenceTransformer(TRANSFORMER)
page_texts = [item['text'] for item in book_data]
# Deliberately NOT named `embeddings`: that name is already bound to the
# OllamaEmbeddings instance created earlier, and rebinding it to an array
# would silently change its type depending on which cells have been run.
page_embeddings = embedding_model.encode(page_texts, show_progress_bar=True)
dimension = page_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(page_embeddings)

Batches: 100%|██████████| 28/28 [00:07<00:00,  3.83it/s]


In [46]:
# Map each row position (0..n-1) to its book_data record so search hits can be looked up by index.
page_metadata = dict(enumerate(book_data))

In [57]:
def retrieve_sections(query, k=3):
    """Retrieve the top-k relevant sections based on the query.

    Args:
        query: Question text; encoded with `embedding_model`.
        k: Number of nearest neighbours requested from `index`.

    Returns:
        dict with
        - "pages": the 'page' value of each hit, in the order returned by the search,
        - "distances": the distance of each hit as a plain Python float,
        - "context": the hit texts joined with single spaces.
        Fewer than `k` hits are returned when the search yields fewer valid results.
    """
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)

    hits = []
    for distance, position in zip(distances[0], indices[0]):
        position = int(position)
        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; looking that up in page_metadata would raise KeyError.
        if position < 0:
            continue
        record = page_metadata[position]
        hits.append({
            # float() turns the numpy scalar into a plain (JSON-serialisable) number.
            "distance": float(distance),
            "page": record['page'],
            "text": record['text'],
        })

    return {
        "pages": [hit["page"] for hit in hits],
        "distances": [hit["distance"] for hit in hits],
        "context": " ".join(hit["text"] for hit in hits),
    }

In [59]:
from langchain.prompts import PromptTemplate

# Prompt sent to the LLM: {context} and {question} are filled from the dict passed to `chain.invoke`.
# NOTE(review): "Donot" below is a typo for "Do not"; left untouched because the string is runtime text.
template = """
Answer the question based on the context below. If you can't answer the question, reply "I don't know". 
Answer in complete sentences and in paragraph form. Donot break paragraph and donot use newline character.

Context: {context}

Question: {question}
"""

# Build the PromptTemplate used as the first step of `chain`.
prompt = PromptTemplate.from_template(template)

In [60]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

# Output parser placed as the last step of the chain.
parser = StrOutputParser()

# Compose prompt -> model -> parser; invoked later with a dict holding 'context' and 'question'.
chain = prompt | model | parser

In [61]:
import json

# Answer every query in dataset/queries.json and collect one result dict per query.
submission_results = []

# Read all queries up front so the file handle is not held open during the
# (slow) retrieval + LLM calls. JSON is read as UTF-8 rather than with the
# platform default encoding.
with open('dataset/queries.json', encoding='utf-8') as f:
    queries = json.load(f)

for query in queries:
    query_id = query['query_id']
    question = query['question']

    retrieval = retrieve_sections(question)
    context = retrieval['context']
    # Use a distinct name here: assigning to the notebook-level `pages` would
    # replace the PDF documents loaded earlier and break any re-run of
    # `format_pages(pages)`.
    retrieved_pages = retrieval['pages']

    answer = chain.invoke({'context': context, 'question': question})

    submission_results.append({
        'ID': query_id,
        'question': question,
        'answer': answer,
        'context': context,
        # Hand-built string kept exactly as before: {'pages': '<list repr>'}
        'references': "{'pages': '" + str(retrieved_pages) + "'}"
    })

In [65]:
import csv
import re

def save_json_to_csv(json_data, csv_filename):
    """Write result records to a CSV file with columns ID, context, answer, references.

    For each entry the 'question' key is dropped, whitespace runs in 'answer'
    are collapsed to single spaces (and the ends trimmed), and every other
    value is passed to the writer unchanged.

    Args:
        json_data: Iterable of dicts, one per result row.
        csv_filename: Path of the CSV file to create or overwrite (UTF-8).
    """
    columns = ['ID', 'context', 'answer', 'references']
    whitespace_run = re.compile(r'\s+')

    # Build every row before touching the file, as the original did.
    rows = []
    for entry in json_data:
        row = {}
        for key, value in entry.items():
            if key == 'question':
                continue
            if key == 'answer':
                value = whitespace_run.sub(' ', value).strip()
            row[key] = value
        rows.append(row)

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

# Write the collected answers to submission.csv (the 'question' field is dropped by the writer function).
save_json_to_csv(submission_results, 'submission.csv')