# Installing the dependencies

In [6]:
!pip install langchain chromadb faiss-cpu sentence-transformers pypdf --quiet

In [7]:
!pip install PyPDF2



In [8]:
!pip install -U langchain_community



# Importing the Libraries

In [9]:
import os
import re
from typing import List, Dict
import PyPDF2
from google.colab import files
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# HuggingFace API token.
# Credentials must never be hardcoded in a notebook: anyone who can read the file (or its
# history) can use them. The token is taken from the environment when it is already set;
# otherwise it is requested interactively without being echoed or saved in the notebook.
# NOTE(review): the token that used to be written on this line has been exposed and should
# be revoked in the Hugging Face account settings.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    from getpass import getpass

    _hf_token = getpass("HuggingFace API token (press Enter to skip): ").strip()
    if _hf_token:
        os.environ['HUGGINGFACEHUB_API_TOKEN'] = _hf_token

# Upload & Process PDF

In [10]:
# Upload the policy PDF through the Colab file picker.
# The returned mapping is keyed by the name the file was saved under; the first key is used
# as the path of the PDF to process.
uploaded = files.upload()
if not uploaded:
    # A cancelled upload yields nothing to iterate over; fail with a clear message instead of
    # the bare StopIteration that next(iter(...)) would raise.
    raise ValueError("No file was uploaded; please select a PDF to continue.")
pdf_path = next(iter(uploaded))

def extract_pdf_pages(pdf_path: str) -> List[Dict]:
    """Read a PDF and return the text of every page that yields any.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page whose extracted text is non-empty, in document order, with the
        keys 'content' (the page text) and 'page_num' (1-based page number). Pages whose
        extract_text() result is empty are left out, so page numbers may have gaps.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Text has to be pulled while the file handle is still open.
        numbered_text = [
            (number, page.extract_text())
            for number, page in enumerate(PyPDF2.PdfReader(pdf_file).pages, start=1)
        ]
    return [
        {'content': text, 'page_num': number}
        for number, text in numbered_text
        if text
    ]

# Extract the text of the uploaded PDF page by page and report how many pages produced text
# (pages with no extractable text are not included in the count).
pages = extract_pdf_pages(pdf_path)
print(f"Loaded {len(pages)} pages from {pdf_path}")

Saving For Task - Policy file.pdf to For Task - Policy file (1).pdf
Loaded 6 pages from For Task - Policy file (1).pdf


# sentence extraction

In [11]:
import re
from typing import List, Dict, Optional

def extract_complete_sentences(text: str) -> List[str]:
    """Break a block of text into individual sentences.

    Every run of whitespace (line breaks included) is first collapsed to a single space so
    that sentences wrapped over several lines are rejoined. The text is then cut at
    whitespace that follows '.', '?' or '!', except after patterns such as "e.g." or a
    capitalised two-letter abbreviation like "Mr.".

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        The non-empty sentences, stripped of surrounding whitespace, in original order.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s+')
    flattened = re.sub(r'\s+', ' ', text)

    sentences = []
    for piece in boundary.split(flattened):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def extract_table(lines: List[str]) -> Dict:
    """Turn raw table lines into rows of cells.

    Each line is stripped and cut into cells wherever two or more whitespace characters, or
    one or more tabs, occur. Lines that produce no non-empty cell are skipped.

    Args:
        lines: The text lines that make up the table.

    Returns:
        A dict {"type": "table", "rows": [...]} where every row is the list of stripped,
        non-empty cells of one line.
    """
    cell_separator = re.compile(r'\s{2,}|\t+')

    rows = []
    for raw_line in lines:
        pieces = cell_separator.split(raw_line.strip())
        if not any(pieces):
            continue
        rows.append([piece.strip() for piece in pieces if piece.strip()])
    return {"type": "table", "rows": rows}

def extract_sections_from_text(text: str) -> List[Dict]:
    """Split the raw text of one page into typed items.

    The text is processed line by line (blank lines are ignored) and every line is
    classified, in this order of precedence:

    * table title ("Table 1.2.3 ...") - starts a table; following lines are collected as
      table rows until the next section header, the next table title or the end of the text;
    * section header (an all-caps line, optionally prefixed by a number such as "1.2") -
      becomes the current section;
    * bullet (line starting with a bullet character, '-' or '*');
    * anything else is sentence text. Lines that do not end in '.', '?' or '!' are treated
      as fragments of a wrapped sentence and buffered until a line with closing
      punctuation arrives.

    Pending sentence fragments are emitted whenever a header, bullet or table title is
    reached, so text is never merged across a structural boundary or attributed to a later
    section. A table that is still open when another table title arrives is emitted before
    the new one starts, and a header that closes a table is still registered as a section.

    Args:
        text: Raw text of a page, lines separated by '\n'.

    Returns:
        A list of dicts in document order. Every dict has 'type' and 'content':
        - 'section': content is the header line (no 'section' key);
        - 'bullet': content is the bullet text without its marker, plus 'section';
        - 'sentence': content is one sentence (or an unterminated fragment), plus 'section';
        - 'table': content is the title and row lines joined by '\n', plus 'section' and
          'table_struct' (the result of extract_table on those lines).
        'section' is the most recent header line, or None before the first header.
    """
    header_pattern = re.compile(r'^(\d+(\.\d+)*\s+)?[A-Z][A-Z\s]+$')
    bullet_pattern = re.compile(r'^[\u2022•\-\*]\s*(.+)')
    table_title_pattern = re.compile(r'^Table\s+\d+\.\d+\.\d+')

    results = []
    section_title = None
    table_buffer = []
    sentence_buffer = []
    is_table = False

    def flush_table():
        # Emit the buffered table (title line plus rows), if any, and empty the buffer.
        if table_buffer:
            results.append({
                'type': 'table',
                'content': '\n'.join(table_buffer),
                'section': section_title,
                'table_struct': extract_table(table_buffer),
            })
            table_buffer.clear()

    def flush_fragments():
        # Emit buffered sentence fragments as one unterminated sentence and empty the buffer.
        if sentence_buffer:
            results.append({
                'type': 'sentence',
                'content': ' '.join(sentence_buffer),
                'section': section_title,
            })
            sentence_buffer.clear()

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # A table title always opens a new table; close whatever was pending first so that
        # neither buffered text nor a previous table is lost.
        if table_title_pattern.match(line):
            flush_fragments()
            flush_table()
            is_table = True
            table_buffer.append(line)
            continue

        if is_table:
            if not header_pattern.match(line):
                table_buffer.append(line)
                continue
            # A section header ends the table. Fall through so the header itself is
            # processed below instead of being discarded.
            flush_table()
            is_table = False

        if header_pattern.match(line):
            # Flush before switching sections so fragments keep the section they belong to.
            flush_fragments()
            section_title = line
            results.append({'type': 'section', 'content': line})
            continue

        bullet_match = bullet_pattern.match(line)
        if bullet_match:
            flush_fragments()
            results.append({
                'type': 'bullet',
                'content': bullet_match.group(1),
                'section': section_title,
            })
            continue

        # Sentence text: keep buffering until a line closes the sentence.
        if line[-1] not in {'.', '?', '!'}:
            sentence_buffer.append(line)
            continue

        complete_text = ' '.join(sentence_buffer + [line])
        sentence_buffer.clear()
        for sentence in extract_complete_sentences(complete_text):
            results.append({'type': 'sentence', 'content': sentence, 'section': section_title})

    # Emit whatever is still pending at the end of the text.
    flush_fragments()
    flush_table()

    return results

# Keywords that mark a sentence or bullet as financially relevant. They are matched as whole
# words / phrases against the lower-cased text.
financial_keywords = [
    'budget', 'debt', 'infrastructure', 'expenditure', 'revenue', 'investment',
    'deficit', 'surplus', 'credit rating', 'taxation', 'superannuation', 'assets'
]

# One combined whole-word pattern, compiled once instead of once per keyword per item.
financial_keyword_pattern = re.compile(
    r'\b(?:{})\b'.format('|'.join(re.escape(kw) for kw in financial_keywords))
)

key_points = []
table_points = []
for pg in pages:
    page_results = extract_sections_from_text(pg['content'])
    for item in page_results:
        # Items found before the first header carry section=None. The key is present, so
        # .get('section', "N/A") would never fall back; use `or` to apply the default.
        section = item.get('section') or "N/A"
        # For sentences and bullets: filter by keywords
        if item['type'] in ('sentence', 'bullet'):
            if financial_keyword_pattern.search(item['content'].lower()):
                key_points.append({
                    'type': item['type'],
                    'text': item['content'],
                    'page_num': pg['page_num'],
                    'section': section
                })
        # For tables: always keep (or filter if you want)
        elif item['type'] == 'table':
            table_points.append({
                'type': 'table',
                'raw': item['content'],
                'table_struct': item['table_struct'],
                'page_num': pg['page_num'],
                'section': section
            })

print(f"Extracted {len(key_points)} key points and {len(table_points)} tables.\n")
for i, kp in enumerate(key_points):
    print(f"{i+1}: [{kp['type'].upper()}] {kp['text']} (Page {kp['page_num']}, Section: {kp['section']})")

print("\nSample Table Extraction:")
for i, tbl in enumerate(table_points[:2]):
    print(f"Table {i+1}: (Page {tbl['page_num']}, Section: {tbl['section']})")
    for row in tbl['table_struct']['rows']:
        print(row)
    print()

Extracted 20 key points and 4 tables.

1: [SENTENCE] The presentation and preparation of the Territory’s Budget is provided for in sections 11 and 11A of the Financial Management Act 1996 (the Act). (Page 1, Section: ST ATEMENT)
2: [SENTENCE] Strategic Priorities and Financial Policy In this budget, the Governm ent continues its commitment to the principles of responsible financial m anagem ent. (Page 1, Section: ST ATEMENT)
3: [BULLET] maintain a balanced budget over the econom ic cycle; (Page 1, Section: ST ATEMENT)
4: [BULLET] maintain low levels of  debt; (Page 1, Section: ST ATEMENT)
5: [BULLET] maintain a triple A credit rating; and (Page 1, Section: ST ATEMENT)
6: [SENTENCE] Strategic priorities, as they relate to the Territory’s Budget, are sum marised as: sustainability of service delivery. (Page 1, Section: ST ATEMENT)
7: [SENTENCE] The 2005-06 Budget and Forward Estim ates have b een prepared taking into account the need to provide sustainable social and econom ic serv ices 

# Vector Database Setup

In [12]:
class FinancialPolicyVectorDB:
    """Chroma-backed semantic index over the key points and tables extracted from a document.

    Texts are embedded with the 'sentence-transformers/all-MiniLM-L6-v2' model (on CPU, with
    normalised embeddings) and stored in a Chroma collection persisted under `persist_dir`.
    """

    def __init__(self, persist_dir: str = "financial_policy_db"):
        """Create the embedding model; the database itself is built or loaded later.

        Args:
            persist_dir: Directory in which the Chroma collection is persisted.
        """
        self.embedding_model = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.persist_dir = persist_dir
        # None until initialize_from_keypoints() or load() has run.
        self.db = None

    def initialize_from_keypoints(self, key_points: List[Dict], table_points: Optional[List[Dict]] = None):
        """Embed the extracted items and store them in the Chroma collection.

        Args:
            key_points: Sentence/bullet dicts with the keys 'text', 'page_num', 'section'
                and 'type'.
            table_points: Optional table dicts with the keys 'raw', 'page_num' and
                'section'; the raw table text is what gets embedded.

        Returns:
            This instance, so the call can be chained.
        """
        texts = []
        metadatas = []
        ids = []

        # Add key points (sentences/bullets)
        for index, kp in enumerate(key_points):
            texts.append(kp['text'])
            metadatas.append({
                'page_num': kp['page_num'],
                # Items found before the first header have no section; store a placeholder
                # rather than None (presumably not accepted as a metadata value by every
                # Chroma version - TODO confirm).
                'section': kp['section'] or "N/A",
                'type': kp['type'],
                'source': 'text'
            })
            ids.append(f"text-{index}")

        # Add tables
        for index, tbl in enumerate(table_points or []):
            texts.append(tbl['raw'])
            metadatas.append({
                'page_num': tbl['page_num'],
                'section': tbl['section'] or "N/A",
                'type': 'table',
                'source': 'table',
            })
            ids.append(f"table-{index}")

        # Stable, position-based ids: re-running this cell against the same persist
        # directory is meant to overwrite the existing entries instead of appending a second
        # copy of every text (which shows up as duplicated search hits).
        # NOTE(review): relies on the Chroma wrapper upserting by id - confirm for the
        # installed version. Entries written by earlier runs without ids are not removed;
        # delete the persist directory once to clear them.
        self.db = Chroma.from_texts(
            texts=texts,
            embedding=self.embedding_model,
            metadatas=metadatas,
            ids=ids,
            persist_directory=self.persist_dir,
            collection_metadata={"hnsw:space": "cosine"}
        )
        return self

    def semantic_search(self, query: str, filter_by: Optional[Dict] = None, k: int = 5) -> List[Dict]:
        """Return the `k` stored items most similar to `query`.

        Args:
            query: Free-text question or phrase.
            filter_by: Optional metadata filter passed to the store as `filter`.
            k: Number of results to return.

        Returns:
            Whatever the store's similarity_search() returns (callers in this notebook read
            `.page_content` and `.metadata` from each result).

        Raises:
            ValueError: If the database has not been initialised or loaded yet.
        """
        # Compare against None explicitly: truthiness of the store object could reflect
        # whether the collection is empty rather than whether it exists.
        if self.db is None:
            raise ValueError("Database not initialized. Call initialize_from_keypoints() first.")

        return self.db.similarity_search(
            query=query,
            k=k,
            filter=filter_by
        )

    def save(self):
        """Persist database to disk (no-op when no database has been created yet)."""
        if self.db is not None:
            self.db.persist()

    @classmethod
    def load(cls, persist_dir: str = "financial_policy_db"):
        """Load existing database.

        Args:
            persist_dir: Directory of a previously persisted collection.

        Returns:
            A new instance whose `db` is opened on `persist_dir` with the same embedding
            model configuration.
        """
        instance = cls(persist_dir)
        instance.db = Chroma(
            persist_directory=persist_dir,
            embedding_function=instance.embedding_model
        )
        return instance

# Build the vector index from the key points and tables extracted above
vector_db = FinancialPolicyVectorDB(persist_dir="policy_chroma_db")
vector_db.initialize_from_keypoints(key_points, table_points)

# Smoke test: report what was indexed, then show the metadata of the best hit for "budget"
print(f"✅ VectorDB initialized with {len(key_points)} text entries and {len(table_points)} tables")
sample_hits = vector_db.semantic_search("budget", k=1)
print("Sample metadata:", sample_hits[0].metadata)

✅ VectorDB initialized with 20 text entries and 4 tables
Sample metadata: {'type': 'sentence', 'source': 'text', 'page_num': 1, 'section': 'ST ATEMENT'}


# LLM Answer Generator (using transformers pipeline)


In [13]:
# Text-to-text generation pipeline (google/flan-t5-base) that generate_answer() calls to turn
# a prompt into an answer; max_length=256 is passed to the pipeline as the length cap.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256)

def format_table(text):
    """Render whitespace-aligned tabular text as a Markdown table.

    Blank lines are dropped and the remaining lines are stripped. The first line supplies
    the header cells and every further line one row; cells are separated wherever two or
    more whitespace characters occur. Text with fewer than two non-blank lines is returned
    unchanged.

    Args:
        text: Raw table text, one table row per line.

    Returns:
        The Markdown table (header, '---' separator, rows) joined by '\n', or `text` itself
        when there is nothing to tabulate.
    """
    column_gap = re.compile(r'\s{2,}')

    table_lines = []
    for raw in text.split('\n'):
        stripped = raw.strip()
        if stripped:
            table_lines.append(stripped)
    if len(table_lines) < 2:
        return text

    def as_markdown_row(cells):
        return "| " + " | ".join(cells) + " |"

    header_cells = column_gap.split(table_lines[0])
    rendered = [as_markdown_row(header_cells), as_markdown_row(['---'] * len(header_cells))]
    rendered.extend(as_markdown_row(column_gap.split(row)) for row in table_lines[1:])
    return "\n".join(rendered)

def format_answer(docs):
    """Format retrieved documents as a readable answer with source references.

    Each document is rendered according to its kind:
    - tables (metadata type 'table', or text containing "Table" with a digit among its
      first ten characters) go through format_table();
    - bullets (metadata type 'bullet', or text containing a bullet character or '-'
      followed by whitespace) are split on those markers and listed as "- item" lines;
    - everything else is shown as is.
    A "*Source: Page ..., Section: ...*" line is appended to every document ('N/A' when the
    metadata lacks the value).

    Args:
        docs: Iterable of objects exposing `.page_content` (str) and `.metadata` (dict).

    Returns:
        The rendered documents joined by a '---' separator line; '' for no documents.
    """
    def render(doc):
        content = doc.page_content.strip()
        dtype = doc.metadata.get('type', 'sentence')

        is_table = dtype == 'table' or ("Table" in content and re.search(r"\d", content[:10]))
        if is_table:
            formatted = format_table(content)
        elif dtype == 'bullet' or re.search(r"([\u2022•\-]\s+)", content):
            entries = (piece.strip() for piece in re.split(r"[\u2022••\-]\s+", content))
            formatted = "\n".join(f"- {entry}" for entry in entries if entry)
        else:
            formatted = content

        page = doc.metadata.get('page_num', 'N/A')
        section = doc.metadata.get('section', 'N/A')
        return f"{formatted}\n\n*Source: Page {page}, Section: {section}*"

    return "\n---\n".join(render(doc) for doc in docs)

def generate_answer(context, question):
    """Ask the text-generation pipeline to answer `question` from `context` only.

    Args:
        context: Text the answer has to be based on.
        question: The user's question.

    Returns:
        The 'generated_text' of the first pipeline result, stripped of surrounding
        whitespace.
    """
    prompt = f"Use ONLY the following context to answer the question.\nContext:\n{context}\nQuestion: {question}\nAnswer:"
    outputs = qa_pipeline(prompt)
    first_output = outputs[0]
    return first_output['generated_text'].strip()

Device set to use cuda:0


# Chatbot with Memory and Formatted Answer
The basic implementation gives vague or unclear answers, so I added some basic formatting to the answers, though the output could still be formatted further.

In [14]:
class PolicyChatbot:
    """Retrieval chatbot with a one-turn conversational memory.

    Every question is answered with the formatted passages retrieved from the vector
    database. When a question looks like a follow-up, the previous question is prepended to
    the search query so the retrieval keeps the earlier topic in view.
    """

    # Cue words that mark a question as a follow-up to the previous one.
    _FOLLOW_UP_CUES = re.compile(r"\bwhat\b|\babout\b|\btell me more\b")

    def __init__(self, vector_db, qa_fn, format_fn):
        """Store the collaborators and start with an empty memory.

        Args:
            vector_db: Object providing semantic_search(query, k=...).
            qa_fn: Answer-generation callable; stored on the instance but not called by
                ask().
            format_fn: Callable turning the retrieved documents into the reply text.
        """
        self.vector_db = vector_db
        self.qa_fn = qa_fn
        self.format_fn = format_fn
        # The previous question exactly as the user typed it (None before the first one).
        self.last_topic = None
        # One {'question', 'context'} record per call to ask().
        self.memory = []

    def ask(self, query):
        """Retrieve and format the passages that best match `query`.

        Args:
            query: The user's question.

        Returns:
            The formatted passages (the output of format_fn) for the five best matches.
        """
        # Adding context for follow-up questions
        search_query = query
        if self.last_topic and self._FOLLOW_UP_CUES.search(query.lower()):
            search_query = f"{self.last_topic} {query}"

        docs = self.vector_db.semantic_search(search_query, k=5)
        formatted_context = self.format_fn(docs)

        # Remember the question as asked, not the augmented search string; otherwise every
        # follow-up would drag along all earlier questions and the query would grow without
        # bound and drift away from the current topic.
        self.last_topic = query
        self.memory.append({'question': search_query, 'context': formatted_context})
        return formatted_context

# Wire the chatbot to the vector index, the answer-generation function and the answer formatter.
chatbot = PolicyChatbot(vector_db, generate_answer, format_answer)

In [16]:
# Interactive chatbot demo (Colab): keep answering questions until the user types 'quit'.
print("Type financial policy questions below (type 'quit' to quit):")
while True:
    try:
        user_q = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: end the demo without a traceback.
        break
    user_q = user_q.strip()
    if not user_q:
        # Nothing to search for; ask again instead of querying the index with an empty string.
        continue
    if user_q.lower() in ['quit']:
        break
    response = chatbot.ask(user_q)
    print("Bot:\n", response)

Type financial policy questions below (type 'quit' to quit):
You: what are the Principles of Responsible Financial Management
Bot:
 The presentation and preparation of the Territory’s Budget is provided for in sections 11 and 11A of the Financial Management Act 1996  (the Act).

*Source: Page 1, Section: ST ATEMENT*
---
The presentation and preparation of the Territory’s Budget is provided for in sections 11 and 11A of the Financial Management Act 1996 (the Act).

*Source: Page 1, Section: ST ATEMENT*
---
| Table 1.2.7 |
| --- |
| Percentage funding of Superannuation Liabilities |
| Assets Liabilities % Funded |
| 30 June $'000 $'000 |
| 2005 | 1 | 447 | 094 | 2 | 480 | 943 58% |
| 2006 | 1 | 626 | 868 | 2 | 707 | 023 60% |
| 2007 | 1 | 829 | 509 | 2 | 927 | 773 62% |
| 2008 | 2 | 042 | 190 | 3 | 146 | 890 65% |
| 2009 2 | 266 | 537 3 | 365 | 107 67% |
| Principles of Responsible Financial Managem ent |
| The key financial m easures established by the Governm ent satisfy various princi

## How the Chatbot Remembers and Why I Designed the Search This Way

**Making the Chatbot "Remember":**  
When I built this chatbot, I added a simple memory feature: after each question, the chatbot keeps track of what I asked and the document pieces it found. If I ask a follow-up like “What about debt?” right after “Tell me about infrastructure?”, it looks for cues in my wording (“what,” “about,” “tell me more”) and automatically links my new question to the last thing I was discussing. This lets the chatbot connect the dots and keep the conversation flowing, so I don’t have to repeat myself or restate the whole topic each time.

**Why I Set Up the Search This Way:**  
For the search, I wanted the answers to be both accurate and easy to trace back to the source. Instead of just matching keywords, the chatbot uses what’s called semantic search: it compares my question to all the sentences, bullet points, and tables pulled from the document, and finds the ones that really match what I mean, even if I phrase things differently. Each answer comes with a page and section reference, so I know exactly where it came from. This means I can trust the replies, and it’s especially helpful when I’m digging into long, complex documents where details matter.