<a href="https://colab.research.google.com/github/fwdbyte/Compliance-Sample-RAG/blob/main/Financial_Services_Compliance_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview
This example leverages the Gemini API to understand and classify internal regulations (in Polish), using a few sample regulations supplied as few-shot prompts.
Internal documents are then imported and used with Gemini to create embeddings.
Finally, sample prompts are generated to check whether an internal document fulfils the external regulations, together with suggestions for mitigating any inconsistencies.




# Setup
Install **tools**

In [None]:
# Environment preparation: each line below is a notebook shell escape.
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
# Google Gen AI SDK (imported later as `from google import genai`), pinned to 1.7.0.
!pip install -U -q "google-genai==1.7.0"
# Chroma, used later as the store for the document embeddings.
!pip install chromadb
# LangChain packages; the notebook uses their PyPDFLoader document loader.
!pip install langchain langchain-community
# PDF parsing library -- presumably the backend PyPDFLoader relies on; TODO confirm.
!pip install PyPDF

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_2

## Setup libs
* Import basic libraries



In [None]:
from google import genai
from google.genai import types

* Get the Google API key
* Initialize the client
* Version check

## Define Models

In [None]:
# Model identifiers shared by the rest of the notebook.
# EMBEDDING_MODEL_ID: model passed to embed_content to turn text into vectors.
EMBEDDING_MODEL_ID = "models/embedding-001"
# ANSWER_MODEL_ID: model passed to generate_content to write the final answer.
ANSWER_MODEL_ID = "gemini-2.5-pro-exp-03-25"

In [None]:
import sys
import os

def check_environment():
    """Identify which notebook host the code is executing on.

    Returns:
        str: 'colab' when the ``google.colab`` module has been loaded,
        'kaggle' when the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable
        is present, and 'other' in every remaining case. Colab is checked
        first, so it wins if both signals are present.
    """
    # Ordered (label, detected) pairs; the first positive probe decides.
    probes = (
        ('colab', 'google.colab' in sys.modules),
        ('kaggle', 'KAGGLE_KERNEL_RUN_TYPE' in os.environ),
    )
    return next((label for label, detected in probes if detected), 'other')


environment = check_environment()

Get the API key secret, then initialize the API.

In [None]:
# Resolve GOOGLE_API_KEY from whichever secret source the detected host offers.
if environment == 'colab':
    print("Running in Google Colab.")

    # Imported lazily: the module only exists inside Colab.
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
elif environment == 'kaggle':
    print("Running in Kaggle.")

    # Imported lazily: the module only exists inside Kaggle kernels.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")
else:
    print("Running in another environment (not Colab or Kaggle).")

    # UserSecretsClient is only imported in the Kaggle branch, so calling it
    # here always raised NameError. Outside the hosted notebooks, read the key
    # from the process environment instead.
    import os
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set. Export it as an environment variable "
            "before running this notebook outside Colab or Kaggle."
        )


In [None]:
import requests
from bs4 import BeautifulSoup
import os # Import os for potential future use, though not directly used in the function's core logic

def find_pdf_links_on_page(url, timeout=30):
    """
    Fetch a web page and return the unique PDF links found on it.

    Args:
        url (str): The URL of the web page to search.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        list: Unique absolute URLs, in order of first appearance on the page,
              for every anchor whose href contains '.pdf' (case-insensitive).
              Returns an empty list if no links are found or if an error
              occurs (the error is printed, not raised).
    """
    from urllib.parse import urljoin

    print(f"Attempting to fetch page: {url}")

    try:
        # Fetch the page; raise_for_status turns 4xx/5xx into an HTTPError.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        print("Successfully fetched the page.")

        # Parse the HTML and collect every anchor element.
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')

        print(f"Found {len(links)} links on the page. Checking for PDF links...")

        # Resolve against the final (post-redirect) page URL so that relative
        # hrefs become directly downloadable absolute URLs.
        base_url = response.url or url

        # Simple substring check; a stricter test would parse the URL path.
        pdf_links = [
            urljoin(base_url, href)
            for href in (link.get('href') for link in links)
            if href and isinstance(href, str) and '.pdf' in href.lower()
        ]

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is deterministic (a set would return an arbitrary order).
        return list(dict.fromkeys(pdf_links))

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return []  # Return empty list on error
    except Exception as e:
        # Deliberate best-effort: callers treat [] as "nothing usable found".
        print(f"An unexpected error occurred: {e}")
        return []  # Return empty list on error




In [None]:
# Page that lists the internal regulations.
target_url = 'https://github.com/fwdbyte/host-sample-pdf/wiki'

# Collect the PDF links advertised on that page.
regulation_links = find_pdf_links_on_page(target_url)

# Report the outcome: handle the empty case first, otherwise list each link.
if not regulation_links:
    print("\nNo PDF links found or an error occurred.")
else:
    print("\nFound the following unique PDF links:")
    for pdf_link in regulation_links:
        print(pdf_link)

In [None]:
# PyPDFLoader lives in langchain_community (installed in the setup cell);
# the old `langchain.document_loaders` path is a legacy alias.
from langchain_community.document_loaders import PyPDFLoader
import os

# Accumulate the loaded pages of every regulation PDF into one list.
documents = []
for url in regulation_links:
    loader = PyPDFLoader(url)  # The loader is given the PDF's URL
    # extend, not assign: assigning here discarded every PDF but the last one.
    documents.extend(loader.load())

print(f"Loaded {len(documents)} document chunks.")


In [None]:
from google import genai
from google.genai import types
from typing import List, Dict, Any
import numpy as np
import pandas as pd

from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings

# Shared Gemini API client, authenticated with GOOGLE_API_KEY; the embedding
# function and the answer generation below both call through this object.
client = genai.Client(api_key=GOOGLE_API_KEY)



## Store the embeddings
Use ChromaDB.

In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

from google.genai import types


def is_retriable(e):
    """Retry predicate used when the per-minute quota is reached.

    True only for ``genai.errors.APIError`` instances whose ``code`` is
    429 or 503; every other exception is not retried.
    """
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


class GeminiEmbeddingFunction(EmbeddingFunction):
    """Embedding function that delegates to the Gemini embedding model."""

    # True  -> embed inputs as documents ("retrieval_document")
    # False -> embed inputs as queries   ("retrieval_query")
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed *input* and return one vector per embedding in the response.

        The task type sent to the API depends on ``document_mode``. The call
        is retried while ``is_retriable`` accepts the raised error.
        """
        task = "retrieval_document" if self.document_mode else "retrieval_query"
        embed_config = types.EmbedContentConfig(task_type=task)

        result = client.models.embed_content(
            model=EMBEDDING_MODEL_ID,
            contents=input,
            config=embed_config,
        )
        return [item.values for item in result.embeddings]

In [None]:
import chromadb

# Name of the Chroma collection that holds the regulation embeddings.
# It was referenced below without ever being defined, which raised NameError.
DB_NAME = "internal_regulations"

chroma_client = chromadb.Client()

# Documents are being indexed here, so embed them in document mode.
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)

# Store each loaded document under its positional index as the ID.
for i, d in enumerate(documents):
    db.add(
        documents=[d.page_content],
        ids=[str(i)],
    )

# Last expression of the cell: shows how many records were stored.
db.count()

## Confirm that there's content (optional)

Getting the relevant document:
`db` is a Chroma collection object. You can call `query` on it to perform a nearest-neighbors search to find similar embeddings or documents.




In [None]:
# Fetch the stored records together with their vectors for a quick sanity check.
sample_data = db.get(include=['documents', 'embeddings'])

# Preview table of the first three records; each embedding is rendered as
# text and cut to its first 50 characters so the table stays readable.
df = pd.DataFrame(
    dict(
        IDs=sample_data['ids'][:3],
        Documents=sample_data['documents'][:3],
        Embeddings=[f"{str(emb)[:50]}..." for emb in sample_data['embeddings'][:3]],
    )
)

print(df)

# Document Retrieval

In [None]:
def get_relevant_passage(query, db):
    """Return the top-ranked stored document for *query*.

    Args:
        query: Question text, sent as the only entry of ``query_texts``.
        db: Object exposing ``query(query_texts=..., n_results=...)`` and
            returning a mapping with a ``'documents'`` entry.

    Returns:
        The first document of the first result list.
    """
    hits = db.query(query_texts=[query], n_results=1)
    documents_per_query = hits['documents']
    # One query was sent: take its result list, then the best match in it.
    return documents_per_query[0][0]

## Perform test embedding search (optional)
Uncomment the following to test sample retrieval.

In [None]:
# Perform test embedding search
# Uncomment to check the results
# passage = get_relevant_passage("Jak należy zarządzać uprawnieniami?", db)
# Markdown(passage) # uncomment to check the embedding passage selected


# Generate answer


Prompt to get info with the right tone, format etc.

In [None]:
def make_prompt(query, relevant_passage):
    """Build the instruction prompt for the answer model.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        relevant_passage: Reference text; single quotes and double quotes are
            removed and newlines become spaces before it is inserted after
            ``PASSAGE:``.

    Returns:
        The filled-in prompt string.
    """
    # One pass over the passage: delete both quote characters and flatten
    # newlines so the passage sits on a single quoted line in the prompt.
    cleanup_table = str.maketrans({"'": None, '"': None, "\n": " "})
    cleaned = relevant_passage.translate(cleanup_table)

    template = """
    You are a helpful and informative bot that answers questions using
    text from the reference passage included below.
    Be sure to respond in a complete sentence, being comprehensive,
    including all relevant background information.
    However, you are talking to a non-technical audience, so be sure to
    break down complicated concepts and strike a friendly
    and converstional tone. If the passage is irrelevant to the answer,
    you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

    ANSWER:
  """
    return template.format(query=query, relevant_passage=cleaned)

In [None]:
def get_answer(query, db, model_id="gemini-2.5-pro-exp-03-25"):
    """Answer *query* from the most relevant stored passage.

    Args:
        query: The user's question.
        db: Collection searched by ``get_relevant_passage``.
        model_id: Name of the generation model to call. This argument was
            previously ignored in favour of the global ANSWER_MODEL_ID; it
            is now the model actually used.

    Returns:
        An IPython ``Markdown`` object wrapping the model's answer text.
    """
    # 1. Get relevant passage using the query
    relevant_passage = get_relevant_passage(query, db)

    # 2. Create the prompt with query and passage
    prompt = make_prompt(query, relevant_passage)

    # 3. Get the answer from the LLM, honouring the caller's model choice
    answer = client.models.generate_content(
        model=model_id,
        contents=prompt,
    )

    return Markdown(answer.text)

In [None]:
# Sample compliance question (Polish: "How should permissions be managed?").
# `query` was never defined anywhere in the notebook.
query = "Jak należy zarządzać uprawnieniami?"

# MODEL_ID does not exist; the answer model constant is ANSWER_MODEL_ID.
get_answer(query, db, ANSWER_MODEL_ID)