##### Copyright 2025 Google LLC.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Customer Financial Profiler

<a target="_blank" href="https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Customer_financial_profiler.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=30/></a>

<!-- Community Contributor Badge -->
<table>
  <tr>
    <!-- Author Avatar Cell -->
    <td bgcolor="#d7e6ff">
      <a href="https://github.com/sharathrushi" target="_blank" title="View Sharath Rushi's profile on GitHub">
        <img src="https://github.com/sharathrushi.png?size=100"
             alt="Sharath's GitHub avatar"
             width="100"
             height="100">
      </a>
    </td>
    <!-- Text Content Cell -->
    <td bgcolor="#d7e6ff">
      <h2><font color='black'>This notebook was contributed by <a href="https://github.com/sharathrushi" target="_blank"><font color='#217bfe'><strong>Sharath Rushi</strong></font></a>.</font></h2>
      <h5><font color='black'><a href="https://github.com/sharathrushi" target="_blank">GitHub</a></font></h5>
      <!-- Footer -->
      <font color='black'><small><em>Have a cool Gemini example? Feel free to <a href="https://github.com/google-gemini/cookbook/blob/main/CONTRIBUTING.md" target="_blank"><font color="#078efb">share it too</font></a>!</em></small></font>
    </td>
  </tr>
</table>

# Overview
This tutorial demonstrates how to estimate a customer's financial status with Gemini.

Given a customer's financial documents, such as payslips, rental agreements, house valuations, and share certificates, the notebook estimates the customer's monthly income, movable assets, and immovable assets.

This is helpful for financial institutions that need to check loan eligibility or estimate a customer's value.

## Setup

### Install SDK

In [None]:
%pip install -U -q "google-genai>=1.0.0"  # Install the Python SDK

# Always set at least 1.0.0 as the minimum version, as there were breaking
# changes in the earlier versions.
# Of course, if your notebook uses a newer feature and needs a more recent
# version, set the minimum version to the one in which that feature was
# introduced.
# Always test your notebook with that exact version pinned (e.g. '==1.0.0') to
# make sure it really is the minimum version.


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.1/719.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m234.9/234.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires google-auth==2.43.0, but you have google-auth 2.47.0 which is incompatible.[0m[31m
[0m

In [None]:
# This step might ask you to restart the session for the installed packages to be reflected

%pip install -U -q pymupdf nougat-ocr tools "albumentations==1.3.1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.7/125.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.5/82.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.5/431.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m846.0/846.0 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Set up your API key

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see the [Authentication ![image](https://storage.googleapis.com/generativeai-downloads/images/colab_icon16.png)](../quickstarts/Authentication.ipynb) quickstart for an example.

In [None]:
from google.colab import userdata
from google import genai
from google.genai import types

# Read the Gemini API key from the Colab secret named GOOGLE_API_KEY and create
# the client that the later cells use for their generate_content requests.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

Now select the model you want to use in this guide, either by picking one from the list or by typing its name. Keep in mind that some models, like the 2.5 ones, are thinking models and thus take slightly more time to respond (see the [thinking notebook](./Get_started_thinking.ipynb) for more details, in particular to learn how to switch thinking off).

In [None]:
MODEL_ID = "gemini-3-flash-preview" # @param ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro", "gemini-3-flash-preview", "gemini-3-pro-preview"] {"allow-input":true, isTemplate: true}

# Ideally order the models by "capability", i.e. by generation, then within a
# generation: 8b/flash-lite, then flash, then pro.

# Configurable params

In [None]:
# Name of the customer to profile. Change this to the person you are interested
# in and upload their financial documents: later cells pick up the files whose
# names start with this string and also insert the name into the prompts.
person_of_interest = "Jesse Nathan"

# Loading necessary sample files
##### The code below downloads the sample input files to session storage.
##### Note: These files are deleted when the session expires.

In [None]:
# Replace the code below to load your own customer's financial documents.
# The sample set consists of numbered PDFs that share a single URL prefix.
file_numbers = [1, 2, 3]
base_github_url = "https://github.com/sharathrushi/generative-ai/blob/587bddd11aea294f9ae512e1a860465452449e87/Jesse%20Nathan_"

for num in file_numbers:
    github_url = f"{base_github_url}{num}.pdf"
    # Rewrite the GitHub "blob" URL into its raw.githubusercontent.com form,
    # which is the address wget downloads the PDF from.
    raw_github_url = github_url.replace("github.com", "raw.githubusercontent.com").replace("/blob", "")
    # Decode the URL-encoded space so the local file name starts with the
    # person's name exactly as written (e.g. "Jesse Nathan_1.pdf").
    file_name = raw_github_url.split('/')[-1].replace('%20', ' ')
    print(f"Downloading '{file_name}'...")
    !wget -O "{file_name}" "{raw_github_url}"
    print(f"Downloaded '{file_name}' to local storage.\n")

Downloading 'Jesse Nathan_1.pdf'...
--2026-01-21 10:59:11--  https://raw.githubusercontent.com/sharathrushi/generative-ai/587bddd11aea294f9ae512e1a860465452449e87/Jesse%20Nathan_1.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 909521 (888K) [application/octet-stream]
Saving to: ‘Jesse Nathan_1.pdf’


2026-01-21 10:59:11 (14.2 MB/s) - ‘Jesse Nathan_1.pdf’ saved [909521/909521]

Downloaded 'Jesse Nathan_1.pdf' to local storage.

Downloading 'Jesse Nathan_2.pdf'...
--2026-01-21 10:59:11--  https://raw.githubusercontent.com/sharathrushi/generative-ai/587bddd11aea294f9ae512e1a860465452449e87/Jesse%20Nathan_2.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubus

# Imports and files

In [None]:
import io
import os
import json
import re
import pymupdf as fitz
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq



In [None]:
# Enable Google Search grounding: the search tool is wrapped in a generation
# config, which the AML screening request passes to generate_content.
grounding_tool = types.Tool(google_search=types.GoogleSearch())
config = types.GenerateContentConfig(tools=[grounding_tool])

In [None]:
# Collect the person's documents in a stable, sorted order. os.listdir() returns
# entries in arbitrary order, and the combined text is truncated before it is
# sent to the model, so the order decides which documents survive truncation.
all_files = sorted(os.listdir('.'))
person_of_interest_files = [f for f in all_files if f.startswith(person_of_interest)]
# NOTE(review): `results` and `local_model_dir` are not referenced by the cells
# visible in this notebook; kept in case other cells rely on them.
results = []

# Load the Nougat OCR model, used by pdf_to_text as a fallback for pages from
# which no text can be extracted directly.
model_id = "facebook/nougat-small"
local_model_dir = "./nougat_local_model"
processor = AutoProcessor.from_pretrained(model_id)
model_base = AutoModelForVision2Seq.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

# AML (Anti-Money Laundering) Checks
Checks whether the person is linked to illegal activities, terrorism financing, or money laundering.

In [None]:
def add_citations(response):
    """Return the response text with inline markdown citations inserted.

    For every grounding support, links of the form ``[n](uri)`` (joined with
    ``", "``) are inserted at the end of the text segment the support refers
    to, where ``n`` is the 1-based index of the grounding chunk.

    Args:
        response: A generate_content response exposing ``text`` and
            ``candidates[0].grounding_metadata`` (with ``grounding_supports``
            and ``grounding_chunks``).

    Returns:
        The annotated text. The original ``response.text`` is returned
        unchanged when there is no grounding metadata, no supports/chunks,
        or no text to annotate.
    """
    text = response.text

    # Check if grounding metadata exists and has supports/chunks
    if not response.candidates or not response.candidates[0].grounding_metadata:
        print("No grounding metadata available. Returning original text.")
        return text

    grounding_metadata = response.candidates[0].grounding_metadata
    supports = grounding_metadata.grounding_supports
    chunks = grounding_metadata.grounding_chunks

    if not supports or not chunks:
        print("No grounding supports or chunks available. Returning original text.")
        return text

    if not text:
        # Nothing to annotate (e.g. the response carried no text part).
        return text

    # Ignore supports that do not carry a usable insertion position.
    positioned_supports = [
        support for support in supports
        if getattr(support, "segment", None) is not None
        and support.segment.end_index is not None
    ]

    # Sort supports by end_index in descending order to avoid shifting issues when inserting.
    sorted_supports = sorted(
        positioned_supports, key=lambda s: s.segment.end_index, reverse=True
    )

    # Segment indices are documented as byte offsets, so insert into the UTF-8
    # encoding; slicing the str directly misplaces citations after non-ASCII text.
    encoded = text.encode("utf-8")

    for support in sorted_supports:
        end_index = support.segment.end_index

        # Build citation links like "[1](link1), [2](link2)".
        citation_links = []
        for i in support.grounding_chunk_indices or []:
            if not 0 <= i < len(chunks):
                continue
            # Chunks that are not web sources have no `web.uri`; skip them.
            web = getattr(chunks[i], "web", None)
            uri = getattr(web, "uri", None)
            if uri:
                citation_links.append(f"[{i + 1}]({uri})")

        if citation_links:
            citation_bytes = ", ".join(citation_links).encode("utf-8")
            encoded = encoded[:end_index] + citation_bytes + encoded[end_index:]

    # errors="replace" keeps the function from raising if an offset ever falls
    # inside a multi-byte character.
    return encoded.decode("utf-8", errors="replace")

In [None]:
def _parse_true_false(answer):
    """Return True only if the first standalone 'true'/'false' word in `answer` is 'true'."""
    if not answer:
        return False
    # Word-boundary match: a plain substring test would also fire on "untrue"
    # or on a later "true" in a reply that starts with "False".
    verdict = re.search(r"\b(true|false)\b", answer.lower())
    return verdict is not None and verdict.group(1) == "true"


def aml_screening_with_grounding(name):
    """Ask the model, with Google Search grounding, whether `name` shows AML red flags.

    Sends a True/False question about money laundering, terrorism financing
    and other illegal activities, prints the raw reply and the reply with
    citations, and converts the reply to a boolean.

    Args:
        name: Full name of the person to screen.

    Returns:
        True if the model's verdict is 'True'; False if it is 'False', or if
        the reply is empty or contains neither word.
    """
    prompt = f"""
    Based on recent news and publicly available information, is there any credible indication that {name} is involved in money laundering, terrorism financing, or other illegal activities?
    Return 'True' if there are clear red flags or strong suspicious mentions; otherwise, return 'False'. Respond with only 'True' or 'False'.
    """

    response = client.models.generate_content(
        model=MODEL_ID,
        contents=prompt,
        config=config,
    )
    print(f"Raw response from Google: {response.text}")

    # Add citations for transparency
    text_with_citations = add_citations(response)
    print(f"Grounding analysis for {name}:\n{text_with_citations}")

    # Convert model's response to boolean
    return _parse_true_false(response.text)

In [None]:
# Run the grounded AML screening for the configured person and report the
# boolean verdict returned by aml_screening_with_grounding.
red_flags = aml_screening_with_grounding(person_of_interest)
print(f"Red flags for {person_of_interest}: {red_flags}")

Raw response from Google: False
No grounding supports or chunks available. Returning original text.
Grounding analysis for Jesse Nathan:
False
Red flags for Jesse Nathan: None


# Processing Source of Wealth Files
##### The code below loads each PDF, extracts its text, and generates the financial profile of the customer.
##### Note: It also supports image-based or scanned PDF documents.

In [None]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF, falling back to OCR when needed.

    Pages with an extractable text layer are read directly with PyMuPDF.
    Pages that yield no text are rendered to an image and transcribed with
    the Nougat model (`processor` / `model_base` loaded earlier in the notebook).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, each page followed by a blank line ("\\n\\n").
    """
    page_texts = []

    # Open the document as a context manager so it is closed even if OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text()

            # If fitz extracts no text, assume it's an image-based page and use Nougat.
            if not page_text.strip():
                # Render the page at 2x resolution; a larger matrix can improve OCR accuracy.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                # Process image with Nougat
                pixel_values = processor(images=img, return_tensors="pt").pixel_values

                # No decoder_input_ids are passed: generation starts from the image alone.
                outputs = model_base.generate(
                    pixel_values,
                    min_length=1,
                    max_new_tokens=4096,
                    bad_words_ids=[[processor.tokenizer.unk_token_id]],
                )
                page_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

            # Add some separation between pages
            page_texts.append(page_text + "\n\n")

    return "".join(page_texts)

In [None]:
def verify_and_extract(text, target_name, max_chars=10000, schema=None):
    """Ask the model to verify the documents' subject and extract a financial profile.

    Args:
        text: Combined text of all source documents.
        target_name: Person the documents are expected to be about.
        max_chars: Maximum number of characters of `text` sent to the model
            (token-limit guard). Text beyond this limit is dropped, so later
            documents may be cut off; pass None to disable truncation.
        schema: Optional description of the desired JSON layout. When given,
            it is appended to the formatting instruction.

    Returns:
        The model's reply text, expected to contain JSON.
    """
    # Truncate outside the prompt literal: a "#" inside an f-string is not a
    # comment and would be sent to the model as part of the prompt.
    truncated_text = text[:max_chars]
    schema_hint = f" Use exactly this structure:\n{schema}" if schema else ""

    prompt = f"""
    Analyze the following text from multiple sources regarding {target_name}.
    1. Verify if all sources document is primarily about {target_name}. (Yes/No)
    2. Aggregate the values to retrieve the following information:
        - Monthly Income
        - Immovable Assets (Real Estate)
        - Movable Assets (Cash/Stocks)
    3. Format as JSON.{schema_hint}
    Text: {truncated_text}
    """
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=prompt,
    )
    return response.text

In [None]:
# Target layout for the extracted financial profile, written as an informal
# JSON template: each value is a placeholder naming the expected type, not
# formal JSON Schema syntax.
# NOTE(review): no cell visible in this notebook passes json_schema to the
# model, and the sample output shown further down uses different keys; confirm
# whether it is meant to be included in the extraction prompt.
json_schema = """
    {
      "all_sources_primarily_about_target_name": "Yes/No",
      "aggregated_values": {
        "monthly_income": {
          "value": "number",
          "currency": "string",
          "period": "string"
        },
        "immovable_assets": [
          {
            "description": "string",
            "value": "number",
            "currency": "string",
            "type": "string",
            "valuation_basis": "string",
            "age_of_property_years": "number"
          }
        ],
        "movable_assets": {
          "stocks": [
            {
              "company_name": "string",
              "number_of_shares": "number",
              "par_value_per_share": "number",
              "currency_per_share": "string",
              "total_value": "number",
              "total_value_currency": "string",
              "owner": "string"
            }
          ],
          "cash": {
            "value": "number or 0",
            "currency": "string or null",
            "note": "string"
          }
        }
      }
    }
    """

In [None]:
# Fail early when nothing matches: sending an empty document text to the model
# would invite a fabricated profile.
if not person_of_interest_files:
    raise FileNotFoundError(
        f"No documents found for '{person_of_interest}' in the current directory."
    )

combined_text = ""
for file in person_of_interest_files:
    # Announce the file before the (possibly slow, OCR-backed) extraction so the
    # last printed name identifies the file being processed if a step fails.
    print(f"Extracting information from {file}")
    text = pdf_to_text(file)
    combined_text += text + "\n"  # Separate documents with a newline

# One request over all documents so the model can aggregate across sources.
result = verify_and_extract(combined_text, person_of_interest)
print(result)

Extracting information from Jesse Nathan_2.pdf
Extracting information from Jesse Nathan_3.pdf
Extracting information from Jesse Nathan_1.pdf
```json
{
  "is_primarily_about_jesse_nathan": "Yes",
  "extracted_data": {
    "monthly_income": {
      "net_pay": 8000,
      "gross_earnings": 8800,
      "currency": "Unspecified",
      "source": "Payslip (Zoonodle Inc)"
    },
    "immovable_assets": [
      {
        "property_type": "Real Estate (Flat/House/Land/Villa/Farm/Plot/Bungalow)",
        "address": "Florence 1, Embassy Road, Global Marker, VISA Arena, Across India, India",
        "market_value": 10000000,
        "currency": "INR",
        "owner": "Mr. Jesse Nathan"
      }
    ],
    "movable_assets": [
      {
        "asset_type": "Stocks",
        "corporate_name": "Everest Financial Group Inc.",
        "number_of_shares": 250,
        "par_value_per_share": 1.00,
        "total_estimated_value": 250.00,
        "currency": "USD",
        "owner": "Jesse Nathan"
      }
 

In [None]:
# Extract the JSON payload from the model reply. The reply is usually wrapped in
# a markdown code fence (```json ... ```), but the model may also answer with
# bare JSON, so fall back to the whole reply when no fence is found.
json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', result or "", re.DOTALL)
json_string = json_match.group(1) if json_match else (result or "").strip()

try:
    result_json = json.loads(json_string)
    print("Successfully parsed JSON output.")
except json.JSONDecodeError as err:
    # Keep the notebook running with an explicit "no result" instead of a traceback.
    print("Could not find JSON output in the expected format.")
    print(f"JSON parsing error: {err}")
    result_json = None

display(result_json)

Successfully parsed JSON output.


{'is_primarily_about_jesse_nathan': 'Yes',
 'extracted_data': {'monthly_income': {'net_pay': 8000,
   'gross_earnings': 8800,
   'currency': 'Unspecified',
   'source': 'Payslip (Zoonodle Inc)'},
  'immovable_assets': [{'property_type': 'Real Estate (Flat/House/Land/Villa/Farm/Plot/Bungalow)',
    'address': 'Florence 1, Embassy Road, Global Marker, VISA Arena, Across India, India',
    'market_value': 10000000,
    'currency': 'INR',
    'owner': 'Mr. Jesse Nathan'}],
  'movable_assets': [{'asset_type': 'Stocks',
    'corporate_name': 'Everest Financial Group Inc.',
    'number_of_shares': 250,
    'par_value_per_share': 1.0,
    'total_estimated_value': 250.0,
    'currency': 'USD',
    'owner': 'Jesse Nathan'}]}}

## Next steps
1. Token & Cost Management

    Token Counts: Use `client.models.count_tokens(model=MODEL_ID, contents=prompt)` before calling the API to check the size of the request.

    Billing: Gemini Flash models are significantly cheaper than the Pro models. For high-volume PDF parsing, use Flash.

    Reduction: Implement Semantic Chunking. Instead of sending a 100-page PDF, use a lightweight NLP tool (like spaCy) to find pages containing "Balance Sheet" or "Assets" and only send those pages to the LLM.

2. Metrics & Benchmarking

    | Metric | Description |
    |---|---|
    | Extraction Accuracy | Compare LLM output against a manually labeled "Golden Dataset." |
    | Latency | Time taken from PDF upload to final report (Target: < 30s). |
    | F1 Score | For NER (Named Entity Recognition) of asset values. |

3. Hallucination Check & Monitoring

    Self-Reflection: Ask the model to "Provide the exact quote from the text where you found this asset value." If it can't, flag it as a potential hallucination.

    NLI (Natural Language Inference): Use a smaller model to check if the generated "Summary" is logically entailed by the "Source Text."

    Monitoring: Use tools like Arize Phoenix or LangSmith to track drift and "faithfulness" scores in production.

4. Human-in-the-Loop (HITL)

In financial compliance, AI should never make the final "Reject" decision.

    Confidence Thresholds: If the LLM confidence is <0.85, route the case to a compliance officer.

    UI Annotation: Highlight the source PDF text in a dashboard so the human can quickly verify the AI's extraction.

5. NLP Alternatives (Non-GenAI)

To reduce costs or increase speed, use:

    Entity Extraction: spaCy or Hugging Face FinBERT (fine-tuned for finance).

    Pattern Matching: Regular Expressions (Regex) for specific formats like CIBIL scores or Currency amounts.

    Classification: XGBoost on TF-IDF vectors to categorize documents before they reach the LLM.