In [1]:
# Model tag passed to the Ollama clients created below.
MODEL = 'llama3.2'
# Sentence-Transformers checkpoint name used to embed the book text and the queries.
TRANSFORMER = 'all-MiniLM-L6-v2'

## Initialise the Llama 3.2 Model via Ollama

In [None]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# LLM client for the model named by MODEL; it is piped into the answer chain later in the notebook.
model = Ollama(model=MODEL)
# NOTE(review): this Ollama embeddings client is never used — the name `embeddings` is
# rebound to the Sentence-Transformers vectors further down. Consider removing or renaming it.
embeddings = OllamaEmbeddings(model=MODEL)

In [7]:
# Input files, relative to the notebook's working directory:
# the source book PDF and the JSON list of queries to answer.
pdf_path = "dataset/book.pdf"
queries_path = "dataset/queries.json"

## Preprocess the PDF and Index It with a Sentence Transformer

In [8]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into LangChain documents; each document carries `page_content` and `metadata`.
# NOTE(review): `load_and_split` presumably runs a text splitter as well, so a single PDF
# page may yield several entries — confirm against the LangChain loader documentation.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [9]:
import re

def clean_text(text):
    """
    Normalise raw text extracted from a PDF page.

    The substitutions run in this order:
    1. a hyphen directly followed by a newline is removed, re-joining words
       that were split across a line break;
    2. each run of newlines becomes a single space;
    3. each run of whitespace becomes a single space.

    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'-\n', ''),
        (r'\n+', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def format_pages(pages):
    """
    Build one ``{"page": ..., "text": ...}`` record per entry of `pages`.

    "page" is read from the entry's ``metadata['page']`` (0 when the key is
    missing) and "text" is the entry's ``page_content`` passed through
    ``clean_text``. Input order is preserved.
    """
    return [
        {"page": doc.metadata.get('page', 0), "text": clean_text(doc.page_content)}
        for doc in pages
    ]

# Cleaned records ({"page", "text"}) for every loaded document, in load order.
book_data = format_pages(pages)

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# No explicit device: Sentence-Transformers then uses a GPU when one is available and
# falls back to CPU otherwise, so the notebook no longer crashes on machines without CUDA.
embedding_model = SentenceTransformer(TRANSFORMER)
# One vector per book_data entry, in the same order, so FAISS row i corresponds to book_data[i].
# NOTE: this rebinds `embeddings`, which earlier held the (unused) Ollama embeddings client.
embeddings = embedding_model.encode([item['text'] for item in book_data], show_progress_bar=True)
dimension = embeddings.shape[1]
# Flat L2 index: search returns distances where smaller means more similar.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [11]:
# Position in book_data -> its record (same mapping as enumerating book_data).
page_metadata = dict(enumerate(book_data))

## Generate Prompt Template

In [12]:
from langchain.prompts import PromptTemplate

# Prompt text sent to the LLM. `{context}` and `{question}` are the two template
# variables filled in when the chain is invoked.
template = """
Answer the question based on the context below. Ensure your answer directly reflects the context.
Provide supporting details explicitly from the context in a complete, single-paragraph form.

Context: {context}

Question: {question}
"""

# Template object built from the text above; it is the first stage of the answer chain.
prompt = PromptTemplate.from_template(template)

## Generate Output Parser

In [13]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

# Final stage of the chain; presumably reduces the model output to a plain string.
parser = StrOutputParser()

# Answer pipeline: fill the prompt template -> call the Ollama model -> parse the reply.
chain = prompt | model | parser

## Read the Queries JSON File and Retrieve Answers

In [14]:
# Table of contents of the book.
# Each top-level key is a chapter; its value holds the chapter's first page ("start_page")
# and a mapping of section key -> {"start_page": ...}, listed in ascending page order.
# find_location_by_page reads this table to turn a page number into "chapter/section".
# NOTE(review): these look like 1-based page numbers, while the `page` value read from the
# loader metadata in format_pages may be a 0-based PDF index — confirm the two line up.
page_to_structure = {
    "preface": {"start_page": 1, "sections": {}},
    "introduction_to_psychology": {
        "start_page": 7,
        "sections": {
            "introduction": {"start_page": 7},
            "what_is_psychology": {"start_page": 8},
            "history_of_psychology": {"start_page": 9},
            "contemporary_psychology": {"start_page": 18},
            "careers_in_psychology": {"start_page": 26},
            "key_terms": {"start_page": 30},
            "summary": {"start_page": 30},
            "review_questions": {"start_page": 32},
            "critical_thinking_questions": {"start_page": 34},
            "personal_application_questions": {"start_page": 34},
        },
    },
    "psychological_research": {
        "start_page": 35,
        "sections": {
            "introduction": {"start_page": 35},
            "why_is_research_important": {"start_page": 36},
            "approaches_to_research": {"start_page": 41},
            "analyzing_findings": {"start_page": 48},
            "ethics": {"start_page": 59},
            "key_terms": {"start_page": 63},
            "summary": {"start_page": 64},
            "review_questions": {"start_page": 66},
            "critical_thinking_questions": {"start_page": 69},
            "personal_application_questions": {"start_page": 70},
        },
    },
    "biopsychology": {
        "start_page": 71,
        "sections": {
            "introduction": {"start_page": 71},
            "human_genetics": {"start_page": 72},
            "cells_of_the_nervous_system": {"start_page": 78},
            "parts_of_the_nervous_system": {"start_page": 84},
            "the_brain_and_spinal_cord": {"start_page": 86},
            "the_endocrine_system": {"start_page": 97},
            "key_terms": {"start_page": 100},
            "summary": {"start_page": 102},
            "review_questions": {"start_page": 103},
            "critical_thinking_questions": {"start_page": 106},
            "personal_application_questions": {"start_page": 106},
        },
    },
    "states_of_consciousness": {
        "start_page": 109,
        "sections": {
            "introduction": {"start_page": 109},
            "what_is_consciousness": {"start_page": 110},
            "sleep_and_why_we_sleep": {"start_page": 114},
            "stages_of_sleep": {"start_page": 117},
            "sleep_problems_and_disorders": {"start_page": 121},
            "substance_use_and_abuse": {"start_page": 126},
            "other_states_of_consciousness": {"start_page": 134},
            "key_terms": {"start_page": 137},
            "summary": {"start_page": 139},
            "review_questions": {"start_page": 140},
            "critical_thinking_questions": {"start_page": 143},
            "personal_application_questions": {"start_page": 143},
        },
    },
    "sensation_and_perception": {
        "start_page": 145,
        "sections": {
            "introduction": {"start_page": 145},
            "sensation_versus_perception": {"start_page": 146},
            "waves_and_wavelengths": {"start_page": 149},
            "vision": {"start_page": 153},
            "hearing": {"start_page": 161},
            "the_other_senses": {"start_page": 164},
            "gestalt_principles_of_perception": {"start_page": 168},
            "key_terms": {"start_page": 172},
            "summary": {"start_page": 174},
            "review_questions": {"start_page": 175},
            "critical_thinking_questions": {"start_page": 178},
            "personal_application_questions": {"start_page": 179},
        },
    },
    "learning": {
        "start_page": 181,
        "sections": {
            "introduction": {"start_page": 181},
            "what_is_learning": {"start_page": 182},
            "classical_conditioning": {"start_page": 183},
            "operant_conditioning": {"start_page": 192},
            "observational_learning_modeling": {"start_page": 203},
            "key_terms": {"start_page": 207},
            "summary": {"start_page": 208},
            "review_questions": {"start_page": 208},
            "critical_thinking_questions": {"start_page": 210},
            "personal_application_questions": {"start_page": 211},
        },
    },
    "thinking_and_intelligence": {
        "start_page": 213,
        "sections": {
            "introduction": {"start_page": 213},
            "what_is_cognition": {"start_page": 214},
            "language": {"start_page": 218},
            "problem_solving": {"start_page": 222},
            "what_are_intelligence_and_creativity": {"start_page": 228},
            "measures_of_intelligence": {"start_page": 231},
            "the_source_of_intelligence": {"start_page": 237},
            "key_terms": {"start_page": 241},
            "summary": {"start_page": 242},
            "review_questions": {"start_page": 243},
            "critical_thinking_questions": {"start_page": 246},
            "personal_application_questions": {"start_page": 246},
        },
    },
    "memory": {
        "start_page": 247,
        "sections": {
            "introduction": {"start_page": 247},
            "how_memory_functions": {"start_page": 248},
            "parts_of_the_brain_involved_with_memory": {"start_page": 255},
            "problems_with_memory": {"start_page": 259},
            "ways_to_enhance_memory": {"start_page": 269},
            "key_terms": {"start_page": 273},
            "summary": {"start_page": 274},
            "review_questions": {"start_page": 275},
            "critical_thinking_questions": {"start_page": 276},
            "personal_application_questions": {"start_page": 277},
        },
    },
    "lifespan_development": {
        "start_page": 279,
        "sections": {
            "introduction": {"start_page": 279},
            "what_is_lifespan_development": {"start_page": 280},
            "lifespan_theories": {"start_page": 284},
            "stages_of_development": {"start_page": 292},
            "death_and_dying": {"start_page": 313},
            "key_terms": {"start_page": 315},
            "summary": {"start_page": 316},
            "review_questions": {"start_page": 317},
            "critical_thinking_questions": {"start_page": 319},
            "personal_application_questions": {"start_page": 320},
        },
    },
    "emotion_and_motivation": {
        "start_page": 321,
        "sections": {
            "introduction": {"start_page": 321},
            "motivation": {"start_page": 322},
            "hunger_and_eating": {"start_page": 328},
            "sexual_behavior": {"start_page": 334},
            "emotion": {"start_page": 342},
            "key_terms": {"start_page": 353},
            "summary": {"start_page": 354},
            "review_questions": {"start_page": 355},
            "critical_thinking_questions": {"start_page": 357},
            "personal_application_questions": {"start_page": 357},
        },
    },
    "personality": {
        "start_page": 359,
        "sections": {
            "introduction": {"start_page": 359},
            "what_is_personality": {"start_page": 360},
            "freud_and_the_psychodynamic_perspective": {"start_page": 362},
            "neo_freudians_adler_erikson_jung_and_horney": {"start_page": 368},
            "learning_approaches": {"start_page": 373},
            "humanistic_approaches": {"start_page": 377},
            "biological_approaches": {"start_page": 378},
            "trait_theorists": {"start_page": 379},
            "cultural_understandings_of_personality": {"start_page": 384},
            "personality_assessment": {"start_page": 386},
            "key_terms": {"start_page": 391},
            "summary": {"start_page": 392},
            "review_questions": {"start_page": 394},
            "critical_thinking_questions": {"start_page": 397},
            "personal_application_questions": {"start_page": 397},
        },
    },
    "social_psychology": {
        "start_page": 399,
        "sections": {
            "introduction": {"start_page": 399},
            "what_is_social_psychology": {"start_page": 400},
            "self_presentation": {"start_page": 406},
            "attitudes_and_persuasion": {"start_page": 409},
            "conformity_compliance_and_obedience": {"start_page": 415},
            "prejudice_and_discrimination": {"start_page": 422},
            "aggression": {"start_page": 429},
            "prosocial_behavior": {"start_page": 432},
            "key_terms": {"start_page": 437},
            "summary": {"start_page": 439},
            "review_questions": {"start_page": 440},
            "critical_thinking_questions": {"start_page": 444},
            "personal_application_questions": {"start_page": 444},
        },
    },
    "industrial_organizational_psychology": {
        "start_page": 447,
        "sections": {
            "introduction": {"start_page": 447},
            "what_is_industrial_and_organizational_psychology": {"start_page": 448},
            "industrial_psychology_selecting_and_evaluating_employees": {"start_page": 456},
            "organizational_psychology_the_social_dimension_of_work": {"start_page": 467},
            "human_factors_psychology_and_workplace_design": {"start_page": 477},
            "key_terms": {"start_page": 480},
            "summary": {"start_page": 481},
            "review_questions": {"start_page": 481},
            "critical_thinking_questions": {"start_page": 483},
            "personal_application_questions": {"start_page": 484},
        },
    },
    "stress_lifestyle_and_health": {
        "start_page": 485,
        "sections": {
            "introduction": {"start_page": 485},
            "what_is_stress": {"start_page": 486},
            "stressors": {"start_page": 496},
            "stress_and_illness": {"start_page": 502},
            "regulation_of_stress": {"start_page": 514},
            "the_pursuit_of_happiness": {"start_page": 521},
            "key_terms": {"start_page": 529},
            "summary": {"start_page": 530},
            "review_questions": {"start_page": 531},
            "critical_thinking_questions": {"start_page": 534},
            "personal_application_questions": {"start_page": 535},
        },
    },
    "psychological_disorders": {
        "start_page": 537,
        "sections": {
            "introduction": {"start_page": 537},
            "what_are_psychological_disorders": {"start_page": 538},
            "diagnosing_and_classifying_psychological_disorders": {"start_page": 542},
            "perspectives_on_psychological_disorders": {"start_page": 545},
            "anxiety_disorders": {"start_page": 548},
            "obsessive_compulsive_and_related_disorders": {"start_page": 554},
            "posttraumatic_stress_disorder": {"start_page": 558},
            "mood_and_related_disorders": {"start_page": 560},
            "schizophrenia": {"start_page": 570},
            "dissociative_disorders": {"start_page": 574},
            "disorders_in_childhood": {"start_page": 576},
            "personality_disorders": {"start_page": 582},
            "key_terms": {"start_page": 589},
            "summary": {"start_page": 591},
            "review_questions": {"start_page": 594},
            "critical_thinking_questions": {"start_page": 597},
            "personal_application_questions": {"start_page": 598},
        },
    },
    "therapy_and_treatment": {
        "start_page": 599,
        "sections": {
            "introduction": {"start_page": 599},
            "mental_health_treatment_past_and_present": {"start_page": 600},
            "types_of_treatment": {"start_page": 605},
            "treatment_modalities": {"start_page": 617},
            "substance_related_and_addictive_disorders_a_special_case": {"start_page": 621},
            "the_sociocultural_model_and_therapy_utilization": {"start_page": 623},
            "key_terms": {"start_page": 627},
            "summary": {"start_page": 628},
            "review_questions": {"start_page": 630},
            "critical_thinking_questions": {"start_page": 632},
            "personal_application_questions": {"start_page": 632},
        },
    },
    "references": {
        "start_page": 633, 
        "sections": {
            "": {"start_page": 633}
        }
    },
    "index": {
        "start_page": 733, 
        "sections": {
            "": {"start_page": 733}
        }
    }
}

In [15]:
def find_location_by_page(page_number, structure=None):
    """
    Map a page number to a "chapter/section" location string.

    The chapter is the one with the greatest ``start_page`` that is not after
    `page_number`; the section is chosen the same way inside that chapter.
    When several entries share a start page, the one listed last wins.

    Args:
        page_number: Page number to locate.
        structure: Table of contents shaped like ``page_to_structure``
            (chapter -> {"start_page", "sections": {section -> {"start_page"}}}).
            Defaults to the notebook-level ``page_to_structure``.

    Returns:
        ``"<chapter>/<section>"``. The section part is empty when the chapter
        has no sections (or its only section key is ""), e.g. ``"preface/"``.
        ``"index/"`` is returned when the page lies before every chapter.
    """
    if structure is None:
        structure = page_to_structure

    # Pick the chapter containing the page. The previous implementation only returned
    # from inside a chapter when it met a *later* section, so pages in a chapter's last
    # section (and in chapters without sections) all fell through to "index/".
    chapter_name = None
    chapter_details = None
    for name, details in structure.items():
        start = details["start_page"]
        if start <= page_number and (
            chapter_details is None or start >= chapter_details["start_page"]
        ):
            chapter_name, chapter_details = name, details

    if chapter_name is None:
        # Page precedes the first chapter: keep the historical fallback value.
        return "index/"

    # Same rule for the section within the chosen chapter.
    section_name = ""
    section_start = None
    for name, section_details in chapter_details.get("sections", {}).items():
        start = section_details["start_page"]
        if start <= page_number and (section_start is None or start >= section_start):
            section_name, section_start = name, start

    return f"{chapter_name}/{section_name}"

In [16]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """
    Lower-case `text` and split it into tokens with NLTK's ``word_tokenize``.

    NOTE(review): ``word_tokenize`` needs NLTK tokenizer data to be installed;
    nothing in this notebook downloads it — confirm the environment provides it.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

# Single-entry cache: the BM25 index is built once per corpus object instead of once per query.
_BM25_CACHE = {"corpus": None, "size": -1, "index": None}


def _bm25_index_for(book_data):
    """Return the BM25 index for `book_data`, rebuilding it only when the corpus object or its length changes."""
    if _BM25_CACHE["corpus"] is not book_data or _BM25_CACHE["size"] != len(book_data):
        tokenized_book_data = [tokenize_text(item['text']) for item in book_data]
        _BM25_CACHE["corpus"] = book_data
        _BM25_CACHE["size"] = len(book_data)
        _BM25_CACHE["index"] = BM25Okapi(tokenized_book_data)
    return _BM25_CACHE["index"]


def keyword_search(query, book_data, k=5):
    """
    Perform keyword-based search using BM25 ranking.

    Args:
        query: Free-text query; it is lower-cased and tokenized by ``tokenize_text``.
        book_data: List of ``{"page": ..., "text": ...}`` records to search.
        k: Maximum number of results to return (default 5, the previous fixed value).

    Returns:
        Up to `k` dicts ``{"page", "text", "score"}`` ordered by descending BM25
        score. An empty list is returned for an empty corpus or a non-positive `k`.

    Note:
        The BM25 index is cached per `book_data` list object. Editing the text of
        existing records in place does not invalidate the cache; pass a new list
        (or change its length) to force a rebuild.
    """
    # Building BM25 over an empty corpus fails, so answer directly.
    if not book_data or k <= 0:
        return []

    bm25 = _bm25_index_for(book_data)
    # tokenize_text already lower-cases, so the query is not lower-cased a second time.
    scores = bm25.get_scores(tokenize_text(query))
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [
        {"page": book_data[i]['page'], "text": book_data[i]['text'], "score": scores[i]}
        for i in top_k_indices
    ]

In [17]:
def combine_rankings(faiss_distances, faiss_indices, keyword_results, book_data, rrf_k=60):
    """
    Combine results from FAISS and keyword search based on ranking.

    The two retrievers score on incompatible scales: FAISS L2 distances are
    better when *smaller*, BM25 scores are better when *larger*. Sorting the raw
    numbers together (as this function used to do) ranks the closest FAISS hits
    last. The lists are therefore merged with reciprocal rank fusion: every
    result earns ``1 / (rrf_k + rank)`` from each list it appears in, with rank
    1 being that list's best hit.

    Args:
        faiss_distances: 2-D array of distances from ``index.search``; row 0 is used.
        faiss_indices: 2-D array of ``book_data`` positions from ``index.search``;
            row 0 is used. Negative entries (FAISS padding when fewer than k
            neighbours exist) are skipped.
        keyword_results: List of ``{"page", "text", "score"}`` dicts where a higher
            score means a better match.
        book_data: List of ``{"page", "text"}`` records the FAISS positions refer to.
        rrf_k: Rank-fusion damping constant; larger values flatten the gap
            between ranks.

    Returns:
        List of ``{"page", "text", "score"}`` dicts, best first, where "score" is
        the fused rank score. A record found by both retrievers appears once.
    """
    fused = {}

    def _credit(page, text, rank):
        # Records are identified by (page, text): several records may share a page number.
        entry = fused.setdefault((page, text), {"page": page, "text": text, "score": 0.0})
        entry["score"] += 1.0 / (rrf_k + rank)

    # Rank FAISS hits by ascending distance (closest first).
    hit_count = faiss_indices.shape[1]
    by_distance = sorted(range(hit_count), key=lambda i: faiss_distances[0][i])
    rank = 0
    for position in by_distance:
        faiss_index = int(faiss_indices[0][position])
        if faiss_index < 0:
            # Padding entry, not a real neighbour; book_data[-1] would be a wrong hit.
            continue
        rank += 1
        _credit(book_data[faiss_index]["page"], book_data[faiss_index]["text"], rank)

    # Rank keyword hits by descending score (best first).
    by_score = sorted(keyword_results, key=lambda result: result["score"], reverse=True)
    for rank, keyword_result in enumerate(by_score, start=1):
        _credit(keyword_result["page"], keyword_result["text"], rank)

    return sorted(fused.values(), key=lambda x: x["score"], reverse=True)

In [18]:
from sentence_transformers import util
import numpy as np

def hybrid_search(index, query, book_data, k=5):
    """
    Run vector search and keyword search for `query` and merge the two result lists.

    The query is embedded with the notebook-level ``embedding_model``, the `k`
    nearest entries are fetched from `index`, keyword hits come from
    ``keyword_search``, and both are handed to ``combine_rankings``.

    Returns:
        Whatever ``combine_rankings`` returns: a list of ``{"page", "text", "score"}`` dicts.
    """
    # index.search is given a 2-D batch, so the single query vector becomes shape (1, dim).
    query_batch = np.expand_dims(embedding_model.encode(query), axis=0)
    distances, neighbour_ids = index.search(query_batch, k)
    return combine_rankings(
        distances,
        neighbour_ids,
        keyword_search(query, book_data),
        book_data,
    )

def retrieve_sections(query, k=5):
    """
    Retrieve the passages relevant to `query` and describe where they come from.

    Runs ``hybrid_search`` over the notebook-level ``index`` and ``book_data``
    (requesting `k` vector neighbours) and returns a dict with:

    - "pages": page number of every hit, in result order;
    - "sections": the ``find_location_by_page`` string for each of those pages;
    - "context": the hit texts joined with single spaces.
    """
    hits = hybrid_search(index, query, book_data, k)

    page_numbers = []
    locations = []
    texts = []
    for hit in hits:
        page_numbers.append(hit["page"])
        locations.append(find_location_by_page(hit["page"]))
        texts.append(hit["text"])

    return {"pages": page_numbers, "sections": locations, "context": " ".join(texts)}

In [None]:
import json

# One record per query, in file order; written to CSV by the next section.
submission_results = []

# Read every query up front so the file is closed before the slow retrieval/LLM loop.
# JSON text is UTF-8, so the encoding is stated instead of relying on the platform default.
with open(queries_path, encoding='utf-8') as f:
    queries = json.load(f)

for query in queries:
    query_id = query['query_id']
    question = query['question']

    results = retrieve_sections(question)

    context = results['context']
    # Not named `pages`: that notebook-level name holds the documents loaded from the
    # PDF, and overwriting it here broke re-running the cell that calls format_pages(pages).
    result_pages = results['pages']
    sections = results['sections']

    answer = chain.invoke({'context': context, 'question': question})

    submission_results.append({
        'ID': query_id,
        'question': question,
        'answer': answer,
        'context': context,
        # String layout kept exactly as before.
        'references': "{'sections': '" + str(sections) + "', 'pages': '" + str(result_pages) + "'}"
    })
    # Progress indicator: one line per answered query.
    print(query_id)

## Save Answers as CSV

In [21]:
import csv
import re

def save_json_to_csv(json_data, csv_filename):
    """
    Write answer records to a CSV file with the columns ID, context, answer, references.

    For every entry of `json_data` the 'question' key is dropped and the
    'answer' value has its whitespace runs collapsed to single spaces and its
    ends trimmed; all other values are written unchanged. The file is
    created or overwritten as UTF-8 with a header row.
    """
    columns = ['ID', 'context', 'answer', 'references']

    def squeeze_whitespace(text):
        return re.sub(r'\s+', ' ', text).strip()

    rows = []
    for entry in json_data:
        row = {}
        for key, value in entry.items():
            if key == 'question':
                continue
            row[key] = squeeze_whitespace(value) if key == 'answer' else value
        rows.append(row)

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

# Write the collected answers to submission.csv (save_json_to_csv leaves out the 'question' field).
save_json_to_csv(submission_results, 'submission.csv')