In [228]:
from bs4 import BeautifulSoup
import requests
import re
import chromadb
import uuid
from ollama import chat, ChatResponse, show

In [3]:
# Host of the Chroma server; passed as `host` to chromadb.HttpClient (an address, not a full URL).
CHROMA_URL = "10.10.2.8"
# Port of the Chroma server; passed as `port` to chromadb.HttpClient.
CHROMA_PORT = 8000
# Name of the Ollama model that get_ollama_response sends chat requests to.
OLLAMA_MODEL = "gemma3"
# Page that scrape_congress_bill downloads and converts to plain text.
BILL_URL = "https://www.congress.gov/119/bills/hr1/generated/BILLS-119hr1eas.html"

In [64]:

def extract_bill_text_from_html(html):
    """Return the visible text of an HTML document as newline-joined, non-empty lines.

    nav, header, footer, script and style elements are removed, every anchor is
    replaced by its plain text, and each resulting line is stripped; lines that
    are empty after stripping are dropped.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Discard page chrome and non-content elements before extracting text.
    for unwanted in document.select("nav, header, footer, script, style"):
        unwanted.decompose()

    # Unwrap hyperlinks: the link text stays, the <a> element goes.
    for anchor in document.find_all('a'):
        anchor.replace_with(anchor.get_text())

    raw_text = document.get_text(separator='\n')

    # Trim every line and keep only those that still contain something.
    trimmed_lines = (raw_line.strip() for raw_line in raw_text.splitlines())
    return '\n'.join(line for line in trimmed_lines if line)

def scrape_congress_bill(url, timeout=30):
    """Download a bill page and return its visible text.

    Args:
        url: Address of the bill's HTML page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled connection
            would hang the notebook cell.

    Returns:
        The page text as produced by extract_bill_text_from_html.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 decoding of the body instead of relying on the guessed encoding.
    response.encoding = 'utf-8'
    return extract_bill_text_from_html(response.text)

def has_toc(body: str) -> bool:
    """Report whether the text looks like it contains a table of contents.

    Heuristic: headings of the form "TITLE <roman numeral>—" are counted and
    the answer is True when more than one is found.
    """
    title_headings = re.finditer(r'TITLE\s+[IVXLCDM]+\s*—', body)
    return sum(1 for _ in title_headings) > 1
def validate_bill_text(body: str) -> bool:
    """Return True when the text contains at least one "TITLE <roman numeral>—" heading."""
    first_heading = re.search(r'TITLE\s+[IVXLCDM]+\s*—', body)
    return first_heading is not None

def extract_body(body: str) -> str:
    """Return the bill text with a leading table of contents removed.

    A table of contents is assumed when the literal heading "TITLE I—" occurs
    at least twice: once in the contents and once where the body starts. In
    that case the text from the second occurrence onward is returned. When
    there is no second occurrence the whole text is returned.

    Args:
        body: Plain text of the bill.

    Returns:
        The stripped body text.
    """
    marker = 'TITLE I—'
    toc_start = body.find(marker)
    if toc_start == -1:
        return body.strip()

    body_start = body.find(marker, toc_start + 1)
    if body_start == -1:
        # Several titles but only one "TITLE I—": there is no table of contents.
        # Slicing from -1 here would return just the last character of the bill.
        return body.strip()

    return body[body_start:].strip()

In [None]:
def get_sections(body: str) -> list:
    """Split the bill body into sections on the "SEC." token.

    Each returned item starts at a "SEC." that is followed by whitespace and
    runs up to (not including) the next "SEC." or the end of the text. Items
    are stripped; empty ones are omitted. Text before the first section is not
    returned.
    """
    raw_chunks = re.findall(r'(SEC\.\s+.*?)(?=SEC\.|$)', body, flags=re.DOTALL)
    trimmed_chunks = (chunk.strip() for chunk in raw_chunks)
    return [chunk for chunk in trimmed_chunks if chunk]
def get_subsections(section: str) -> list:
    """Split a section into subsections marked by a line starting with "(a) ", "(b) ", etc.

    A subsection begins at a newline followed by a single lowercase letter in
    parentheses and whitespace, and ends just before the next such marker or at
    the end of the text. Items are stripped; text before the first marker is
    not returned.
    """
    raw_pieces = re.findall(
        r'\n\([a-z]\)\s+.*?(?=\n\([a-z]\)|$)', section, flags=re.DOTALL
    )
    trimmed_pieces = (piece.strip() for piece in raw_pieces)
    return [piece for piece in trimmed_pieces if piece]
def get_divisions(subsection: str) -> list:
    """Split a subsection into divisions marked by a line starting with “(1) , “(2) , etc.

    A division begins at a newline followed by an opening curly quote, a number
    in parentheses and whitespace, and ends just before the next such marker or
    at the end of the text. Items are stripped; text before the first marker is
    not returned.
    """
    raw_parts = re.findall(
        r'\n“\(\d+\)\s+.*?(?=\n“\(\d+\)|$)', subsection, flags=re.DOTALL
    )
    trimmed_parts = (part.strip() for part in raw_parts)
    return [part for part in trimmed_parts if part]

def extract_section_title(section: str) -> str:
    """Return the heading line of a section, starting at the first "SEC".

    Args:
        section: Text of one section.

    Returns:
        The text from "SEC" up to the next newline, stripped. When the section
        has no newline after "SEC", everything from "SEC" onward is returned.

    Raises:
        ValueError: If the text does not contain "SEC".
    """
    sec_index = section.index("SEC")
    # str.find reports a missing newline as -1; str.index would raise instead,
    # which made the single-line fallback below unreachable.
    newline_index = section.find("\n", sec_index)
    if newline_index == -1:
        return section[sec_index:].strip()
    return section[sec_index:newline_index].strip()
def extract_section_number(section: str) -> str:
    """Return the section number that follows "SEC." in a section heading.

    Args:
        section: Text of one section.

    Returns:
        The text between "SEC." and the next period, stripped. When no period
        follows, the stripped remainder after "SEC." is returned.

    Raises:
        ValueError: If the text does not contain "SEC.".
    """
    sec_index = section.index("SEC.")
    number_start = sec_index + 4  # skip past the "SEC." token itself
    # str.find reports a missing period as -1; str.index would raise instead,
    # which made the fallback below unreachable.
    end_index = section.find(".", number_start)
    if end_index == -1:
        return section[number_start:].strip()
    return section[number_start:end_index].strip()

def collapse_whitespace(text: str) -> str:
    """Replace every run of whitespace with a single space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()
def number_to_letter(number: int) -> str:
    """Convert a 1-based number to a lowercase letter label.

    1..26 map to 'a'..'z'; 27..702 map to two-letter labels 'aa'..'zz'.

    Raises:
        ValueError: If the number is outside 1..702.
    """
    if not (1 <= number <= 702):  # 702 is 'zz'
        raise ValueError("Number must be between 1 and 702 inclusive.")

    base = ord('a')
    # Zero-based position split into "which block of 26" and "offset inside it".
    block, offset = divmod(number - 1, 26)
    last_letter = chr(base + offset)
    if block == 0:
        return last_letter
    return chr(base + block - 1) + last_letter
    
def get_section_by_number(sections: list, section_number: str) -> str:
    """Return the first section whose extracted number equals `section_number`.

    Sections are examined in list order; None is returned when none matches.
    """
    matching = (
        candidate
        for candidate in sections
        if extract_section_number(candidate) == section_number
    )
    return next(matching, None)
    
def build_messages(user: str, system: str = None) -> list:
    """Build a chat message list: an optional system message followed by the user message.

    The system message is included only when `system` is truthy.
    """
    user_message = {"role": "user", "content": user}
    if not system:
        return [user_message]
    return [{"role": "system", "content": system}, user_message]

def get_ollama_response(messages: list) -> str:
    """Send the messages to the model named by OLLAMA_MODEL and return the reply's content."""
    reply = chat(messages=messages, model=OLLAMA_MODEL)
    return reply.message.content

In [None]:
# Download the page at BILL_URL and reduce it to plain text (network call).
scraped_text = scrape_congress_bill(BILL_URL)

In [69]:
# Drop the table of contents, if one is detected, so only the bill body remains.
body = extract_body(scraped_text)

In [176]:
# Split the body into "SEC." sections; query_bill later looks sections up in this list.
sections = get_sections(body)

In [198]:
# Flatten the bill into one record per (section, subsection, division) chunk.
bill_data = []
for section in sections:
    number = extract_section_number(section)
    print(f"Processing section: {number}")
    title = extract_section_title(section)

    # A section without subsection markers is treated as a single subsection.
    subsections = get_subsections(section) or [section]
    for subsection_count, subsection in enumerate(subsections, start=1):
        # A subsection without division markers is treated as a single division.
        divisions = get_divisions(subsection) or [subsection]
        for division_count, division in enumerate(divisions, start=1):
            record = {
                'section_title': title,
                'section_number': number,
                # The letter comes from the running position, not from the text itself.
                'subsection_letter': number_to_letter(subsection_count),
                'division_number': division_count,
                'division_text': collapse_whitespace(division)
            }
            bill_data.append(record)

Processing section: 10101
Processing section: 10102
Processing section: 10103
Processing section: 10104
Processing section: 10105
Processing section: 10106
Processing section: 10107
Processing section: 10108
Processing section: 10201
Processing section: 10301
Processing section: 10302
Processing section: 10303
Processing section: 10304
Processing section: 10305
Processing section: 10306
Processing section: 10307
Processing section: 10308
Processing section: 10309
Processing section: 10310
Processing section: 10311
Processing section: 10312
Processing section: 10313
Processing section: 10314
Processing section: 10401
Processing section: 10501
Processing section: 10502
Processing section: 10503
Processing section: 10504
Processing section: 10505
Processing section: 10506
Processing section: 10507
Processing section: 10601
Processing section: 10602
Processing section: 10603
Processing section: 10604
Processing section: 10605
Processing section: 10606
Processing section: 10607
Processing s

In [None]:
# Connect to the Chroma server over HTTP using the configured host and port.
chroma_client = chromadb.HttpClient(host=CHROMA_URL, port=CHROMA_PORT)
# Ping the server; the returned value is displayed as the cell output.
chroma_client.heartbeat()

1751580523031337715

In [None]:
# Uncomment the next line to drop the existing "Bill" collection before re-ingesting.
#chroma_client.delete_collection("Bill")
# Open the "Bill" collection, creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection("Bill")

In [207]:
# Store every chunk of the bill in the collection together with its metadata.
# Ids are derived from the chunk's content and upserted, so re-running this
# cell overwrites the same entries instead of adding duplicates (random uuid4
# ids with add() inserted a fresh copy of every chunk on each run).
# NOTE: entries written earlier under random ids are not replaced; drop the
# collection once before switching to this cell if it was already populated.
for d in bill_data:
    metadata = {
        'section_title': d['section_title'],
        'section_number': d['section_number'],
        'subsection_letter': d['subsection_letter'],
        'division_number': d['division_number']
    }
    # The text is part of the key so chunks that share a section number,
    # letter and division number but differ in content never overwrite each other.
    chunk_key = "|".join([
        BILL_URL,
        str(d['section_number']),
        str(d['subsection_letter']),
        str(d['division_number']),
        d['division_text'],
    ])
    collection.upsert(
        documents=[d['division_text']],
        metadatas=[metadata],
        ids=[str(uuid.uuid5(uuid.NAMESPACE_URL, chunk_key))]
    )

In [236]:
def query_vector_store(query: str, n_results: int = 10):
    """Query the bill collection for the chunks closest to `query`.

    Args:
        query: Free-text question or search phrase.
        n_results: Maximum number of chunks to request; defaults to 10, the
            value that was previously hard-coded.

    Returns:
        The result object returned by `collection.query`.
    """
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    return results

def query_bill(query: str) -> dict:
    """Answer a question about the bill using the sections retrieved for it.

    The vector store is queried, the full text of every section referenced by
    the hits is looked up in `sections`, and the model is asked the question
    with those sections as context.

    Args:
        query: The question to ask about the bill.

    Returns:
        A dict with:
            "relevant_sections": text of the matched sections, each followed
                by a blank line.
            "relevant_section_numbers": section numbers of the hits, without
                duplicates, in the order the vector store returned them.
            "response": the model's answer.
    """
    results = query_vector_store(query)

    # dict.fromkeys removes duplicates but keeps first-seen order; a set would
    # discard the retrieval order and make the prompt differ from run to run.
    rel_sections = list(dict.fromkeys(
        r["section_number"] for r in results['metadatas'][0]
    ))

    found_sections = (get_section_by_number(sections, s) for s in rel_sections)
    rel_sections_str = "".join(
        f"{section}\n\n" for section in found_sections if section
    )

    system_prompt = "You are a helpful assistant that provides information about US Congress bills. You will be given a question about a bill and the relevant sections of the bill. Use the sections to answer the question. If the question is not related to the bill, respond with something like 'There's nothing in the bill related to that query'."
    user_prompt = ""
    if rel_sections_str:
        user_prompt = f"Relevant sections of the bill:\n\n{rel_sections_str}\n\n"
    user_prompt += f"Question: {query}"

    messages = build_messages(user_prompt, system_prompt)
    response = get_ollama_response(messages)
    return {
        "relevant_sections": rel_sections_str,
        "relevant_section_numbers": rel_sections,
        "response": response
    }
    

In [1]:
# Example question to ask about the bill.
question = "What, if anything, does this bill say about items covered by NFA?"

In [None]:
#investigation = query_bill(question)

In [None]:
#print(investigation["response"])


This bill doesn't explicitly mention or address items covered by the National Firearms Act (NFA). It focuses on various healthcare-related initiatives, including:

*   FEHB (Federal Employees Health Benefits) program improvements
*   Trump accounts contribution pilot program
*   Rescissions of funding for specific programs related to clean vehicles and the Clean Air Act.

It does not contain any provisions directly relating to NFA items.
