In [15]:
!pip install fastapi pydantic pypdf pymupdf python-docx unstructured google-generativeai elasticsearch opencv-python numpy pytesseract pdf2image uvicorn nest-asyncio

import os
import pytesseract
import fitz  # PyMuPDF for PDF processing
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
from unstructured.partition.email import partition_email
from elasticsearch import Elasticsearch
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import google.generativeai as genai
import uvicorn
import nest_asyncio


genai.configure(api_key="")

app = FastAPI()

class EmailRequest(BaseModel):
    file_path: str
    sender: str
    subject: str
    predefined_categories: dict = None
    priority_rules: dict = None

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_eml(eml_path):
    elements = partition_email(filename=eml_path)
    return "\n".join(str(element) for element in elements)

def extract_text_with_ocr(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image) + "\n"
    return text.strip()

def extract_email_content(file_path):
    ext = file_path.split(".")[-1].lower()
    if ext == "pdf":
        return extract_text_from_pdf(file_path) + "\n" + extract_text_with_ocr(file_path)
    elif ext == "docx":
        return extract_text_from_docx(file_path)
    elif ext == "eml":
        return extract_text_from_eml(file_path)
    return ""

# Classification using Google Gemini API
def classify_with_gemini(email_text, sender, subject, priority_rules, predefined_categories):
    model = genai.GenerativeModel("gemini-pro")

    prompt = f"""
    Classify the following email into predefined categories:

    Email:
    Subject: {subject}
    Sender: {sender}
    Content: {email_text}

    Categories:
    {predefined_categories}

    Priority Rules:
    {priority_rules}

    Return a JSON with `primary_category`, `categories`, and `confidence_score`.
    """

    response = model.generate_content(prompt)
    return response.text  # Ensure correct JSON parsing


@app.post("/classify_email")
async def classify_email(request: EmailRequest):
    try:
        file_path = request.file_path
        sender = request.sender
        subject = request.subject

        if not file_path or not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="File not found")

        # Extract email text
        email_text = extract_email_content(file_path)

        # Default categories if not provided
        predefined_categories = request.predefined_categories or {
        }

        # Default priority rules if not provided
        priority_rules = request.priority_rules or {
            "content_weightage": 0.7,
            "attachment_weightage": 0.3,
            "keywords_priority": {}
        }

        # Classify email
        classification = classify_with_gemini(email_text, sender, subject, priority_rules, predefined_categories)

        response = {
            "classification": classification,
            "primary_category": classification.get("primary_category")
        }

        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))



ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-10' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=SystemExit(1)>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 163, in startup
    server = await loop.create_server(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/base_events.py", line 1536, in create_server
    raise OSError(err.errno, msg) from None
OSError: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8001): address already in use

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-14-9918d5e3f5a5>", line 162, in ru



INFO:     Started server process [377]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)


https://muyr3qi6pk7-496ff2e9c6d22116-8002-colab.googleusercontent.com/


In [16]:
import requests

url = "https://muyr3qi6pk7-496ff2e9c6d22116-8002-colab.googleusercontent.com/classify_email"
data = {
    "file_path": "sample_email.pdf",
    "sender": "test@example.com",
    "subject": "Billing Issue"
}

response = requests.post(url, json=data)
print(response.json())


JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [12]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(8001)"))


https://tw3vmdam0t-496ff2e9c6d22116-8001-colab.googleusercontent.com/


In [1]:
pip install google-genai

Collecting google-genai
  Downloading google_genai-1.7.0-py3-none-any.whl.metadata (32 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Downloading websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading google_genai-1.7.0-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl (173 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.3/173.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: websockets, google-genai
Successfully installed google-genai-1.7.0 websockets-15.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install

In [7]:
import base64
import os
from google import genai
from google.genai import types


def generate(emailContent):
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-2.0-flash"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(
                    text=f"""Classify This Email, give top 3 results
                    ```{emailContent}```
                    """
                ),
            ],
        )
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.2,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="text/plain",
        system_instruction=[
            types.Part.from_text(
                text="""You are part of commercial bank lending service team, you have to classify the email/service request into the following request and sub request types.  There can be multiple classifications also. Output should be in a json array format like this. return in order of the confidence_score in descending order, the first should be the primary result having more score than others. Return 3 results if not user does not mention the number of results.
```
[{
\"request_type\":\"\",
\"sub_request_type\":\"\",
\"confidence_score\":\"\"
}]
```

Request and sub request types
```
I. Loan Adjustments & Changes:
* Adjustment:
    * Interest Rate Adjustment (Floating to Fixed, etc.)
    * Payment Schedule Adjustment
    * Covenant Adjustment
    * Loan Term Adjustment
* Commitment Change:
    * Commitment Increase
    * Commitment Decrease
    * Cashless Roll
    * Reallocation Principal
* Fee Adjustments:
    * Amendment Fee
    * Reallocation Fee
    * Late Payment Fee Waiver/Adjustment
    * Fee Dispute/Resolution
II. Account User (AU) & Access Management:
* AU Transfer:
    * Transfer of Account Ownership/Responsibility
    * Change of Authorized Signatories
    * User Access Modification (Add/Remove)
    * Online Banking Access Management
III. Loan Closure & Termination:
* Closing Notice:
    * Loan Payoff Notification
    * Account Closure Confirmation
    * Final Statement Generation
    * Sub Fee breakdowns.
        * reallocation fee
        * amendment fee
* Loan Termination:
    * Early Loan Termination.
    * Scheduled Loan Termination.
IV. Fee Management:
* Fee Payment:
    * Ongoing Fee Payment (e.g., annual maintenance)
    * Letter of Credit Fee Payment
    * Wire Transfer Fee Payment
    * Document Preparation Fee Payment
    * Loan syndication Fee.
V. Money Movement (Funds Transfer):
* Money Movement: Inbound:
    * Principal Payment
    * Interest Payment
    * Principal + Interest Payment
    * Principal + Interest + Fee Payment
    * Loan draw down.
* Money Movement: Outbound:
    * Time-Bound Transfer
    * Foreign Currency Transfer
    * Disbursement to Vendor/Third Party
    * Loan proceeds dispersal.
    * Escrow disbursement.
VI. Loan Documentation & Reporting:
* Document Requests:
    * Loan Agreement Copies
    * Security Agreement Copies
    * Compliance Certificates
    * Financial Statement Submissions
* Reporting:
    * Loan Balance Reports
    * Interest Accrual Reports
    * Payment History Reports
    * Custom Report Generation

VII. Specialized Requests:
* Letter of Credit (LC) Requests:
    * LC Issuance
    * LC Amendment
    * LC Payment
    * LC Cancellation.
* Trade Finance Requests:
    * Import Financing.
    * Export Financing.
    * Documentary Collections.
* Syndicated loan requests.
    * syndicate participant change.
    * syndicate information request.
    * syndicate payment distribution.
```"""
            ),
        ],
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        print(chunk.text, end="")


if __name__ == "__main__":
    generate(
        """Date: 10-Nov-2025
To: WELLS FARGO BANK, NATIONAL ASSOCIATION
Attention: AGENT DEFAULT
Fax: 877-606-9426
Re: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2
Description: User Access Modification
BORROWER: ABTB MID-ATLANTIC LLC
DEAL NAME: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022
Effective 10-Nov-2025, the user access modification has been processed.
For: WELLS FARGO BANK, NA
Reference: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022
If you have any questions, please call the undersigned.
***********************************************
COMMENT
PLEASE ACKNOWLEDGE THE CHANGE
***********************************************
Citizens Bank, N.A.
Loan Agency Services

Regards,
SCOTT WALLACE
Telephone #:
Fax #:
Citizens Commercial Banking is a brand name of Citizens Bank, N.A. Member FDIC
November 10, 2025 - 9:00:00 AM"""
    )

```json
[
  {
    "request_type": "Account User (AU) & Access Management",
    "sub_request_type": "User Access Modification (Add/Remove)",
    "confidence_score": "0.95"
  },
  {
    "request_type": "Account User (AU) & Access Management",
    "sub_request_type": "Online Banking Access Management",
    "confidence_score": "0.75"
  },
  {
    "request_type": "Loan Documentation & Reporting",
    "sub_request_type": "Loan Balance Reports",
    "confidence_score": "0.3"
  }
]
```