# Email & document classification for triage

## Email Classification
- Uses GPT to classify emails into Request Type and Sub Request Type.
- Returns a confidence score for classification.

In [22]:
!pip install --no-cache-dir openai langchain langchain_openai pdfplumber pdf2image easyocr python-docx fpdf pymupdf chromadb sentence-transformers gradio pandas extract_msg

Collecting extract_msg
  Downloading extract_msg-0.54.0-py3-none-any.whl.metadata (15 kB)
Collecting olefile==0.47 (from extract_msg)
  Downloading olefile-0.47-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting compressed-rtf<2,>=1.0.6 (from extract_msg)
  Downloading compressed_rtf-1.0.6.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ebcdic<2,>=1.1.1 (from extract_msg)
  Downloading ebcdic-1.1.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting RTFDE<0.2,>=0.1.1 (from extract_msg)
  Downloading RTFDE-0.1.2-py3-none-any.whl.metadata (3.9 kB)
Collecting red-black-tree-mod<=1.23,>=1.20 (from extract_msg)
  Downloading red-black-tree-mod-1.22.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lark~=1.1.8 (from RTFDE<0.2,>=0.1.1->extract_msg)
  Downloading lark-1.1.9-py3-none-any.whl.metadata (1.9 kB)
Collecting oletools>=0.56 (from RTFDE<0.2,>=0.1.1->extract_msg)
  Downloading oletools-0.60.2-py2.py3-none-any.whl.metadata (16 kB

## Import the Required Libraries

In [2]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

import chromadb
from sentence_transformers import SentenceTransformer

import numpy as np
import os
import openai
import json
import requests
import pandas as pd
from docx import Document
import email
import fitz  # PyMuPDF for PDFs
import docx
import pytesseract
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError
import mimetypes
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import uuid
import base64

import gradio as gr


## Mount the google drive

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Setup the Api Key for using Open Api

In [4]:
# Set the API key
folder_path = "/content/drive/MyDrive/EmailClassification/"

# Read the text file containing the API key
with open(folder_path + 'OpenAI_API_Key.txt', 'r') as f:
  openai.api_key = ' '.join(f.readlines())

# Update the OpenAI API key by updating the environment variable
os.environ["OPENAI_API_KEY"] = openai.api_key

### Apis for extracting texts and attachments from the email (eml files with attachments as pdf, doc, images or another eml file)

In [23]:
import os
import email
import easyocr
import pdfplumber
import extract_msg  # Library for .msg file support
import traceback
import numpy as np
from email import policy
from email.parser import BytesParser
from io import BytesIO
from PIL import Image
from docx import Document
from bs4 import BeautifulSoup  # For HTML parsing

# Initialize EasyOCR Reader
ocr_reader = easyocr.Reader(["en"])  # Specify language (English)

def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract text from a PDF file."""
    text = ""
    try:
        with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
            for page in pdf.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
    except Exception as e:
        print(f"❌ PDF Extraction Error: {str(e)}")
    return text.strip()

def extract_text_from_docx_bytes(doc_bytes):
    """Extract text from a DOCX file."""
    text = ""
    try:
        doc_stream = BytesIO(doc_bytes)
        doc = Document(doc_stream)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"❌ DOC/DOCX Extraction Error: {str(e)}")
    return text.strip()

def extract_text_from_image(image_bytes):
    """Extract text from images using EasyOCR."""
    text = ""
    try:
        img = Image.open(BytesIO(image_bytes))
        results = ocr_reader.readtext(np.array(img))  # Convert Image to NumPy array
        text = "\n".join([res[1] for res in results])
    except Exception as e:
        print(f"❌ Image Extraction Error: {str(e)}")
    return text.strip()

def process_eml_bytes(eml_bytes, is_nested=False):
    """Processes an EML file given as bytes, handling nested emails properly."""
    try:
        msg = BytesParser(policy=policy.default).parse(BytesIO(eml_bytes))

        email_data = {
            "subject": msg["subject"],
            "from": msg["from"],
            "to": msg["to"],
            "date": msg["date"],
            "body": "",
            "attachments": []
        }

        # Extract email body (only for non-nested emails)
        if not is_nested:
            body_text = []
            for part in msg.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition", ""))

                # Extract text/plain parts, but exclude nested emails
                if content_type == "text/plain" and "attachment" not in content_disposition:
                    decoded_text = part.get_payload(decode=True).decode(errors="ignore").strip()
                    if decoded_text:
                        body_text.append(decoded_text)

                # Extract HTML content if plain text is empty
                elif content_type == "text/html" and not body_text:
                    soup = BeautifulSoup(part.get_payload(decode=True), "html.parser")
                    decoded_text = soup.get_text().strip()
                    if decoded_text:
                        body_text.append(decoded_text)

            email_data["body"] = "\n".join(body_text).strip()

        # Process attachments
        for part in msg.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition", ""))
            file_name = part.get_filename()
            payload = part.get_payload(decode=True)  # Decode base64 content
            extracted_text = ""

            if not file_name and content_type == "message/rfc822":
                # Gmail often stores nested EMLs without a filename
                file_name = "nested_email.eml"

            if not file_name:
                continue  # Ignore if no filename is found

            # Handle PDFs
            if content_type in ["application/pdf", "application/octet-stream"] and file_name.lower().endswith(".pdf"):
                extracted_text = extract_text_from_pdf_bytes(payload)

            # Handle Word Documents
            elif content_type in ["application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] or file_name.lower().endswith((".doc", ".docx")):
                extracted_text = extract_text_from_docx_bytes(payload)

            # Handle Images
            elif content_type.startswith("image/") or (content_type == "application/octet-stream" and file_name.lower().endswith(("png", "jpg", "jpeg"))):
                extracted_text = extract_text_from_image(payload)

            # Handle nested EML files (Gmail nested emails included)
            elif content_type == "message/rfc822" or (content_type == "application/octet-stream" and file_name.lower().endswith(".eml")):
                nested_email_data = process_eml_bytes(payload, is_nested=True)  # Process as nested email

                email_data["attachments"].append({
                    "file_name": file_name,
                    "content_type": content_type,
                    "nested_email": nested_email_data  # Store full nested email structure instead of text
                })
                continue  # Skip normal processing for nested emails

            email_data["attachments"].append({
                "file_name": file_name,
                "content_type": content_type,
                "extracted_text": extracted_text
            })

        return email_data

    except Exception as e:
        print(f"❌ Email Parsing Error: {traceback.format_exc()}")
        return None

def process_eml_file(eml_path):
    """Processes an EML file from a given file path and extracts email content and attachments."""
    try:
        with open(eml_path, "rb") as f:
            eml_bytes = f.read()
        return process_eml_bytes(eml_bytes)
    except Exception as e:
        print(f"❌ Error processing EML file ({eml_path}): {str(e)}")
        return None

def process_msg_bytes(msg_bytes):
    """Processes a .msg file given as bytes and extracts email content and attachments."""
    try:
        with BytesIO(msg_bytes) as msg_stream:
            msg = extract_msg.Message(msg_stream)

        msg_data = {
            "subject": msg.subject,
            "from": msg.sender,
            "to": msg.to,
            "date": msg.date,
            "body": msg.body,
            "attachments": []
        }

        # Process attachments
        for attachment in msg.attachments:
            extracted_text = ""
            file_name = attachment.longFilename or attachment.shortFilename

            if file_name.lower().endswith(".pdf"):
                extracted_text = extract_text_from_pdf_bytes(attachment.data)
            elif file_name.lower().endswith((".doc", ".docx")):
                extracted_text = extract_text_from_docx_bytes(attachment.data)
            elif file_name.lower().endswith((".png", ".jpg", ".jpeg")):
                extracted_text = extract_text_from_image(attachment.data)
            elif file_name.lower().endswith(".eml"):
                extracted_text = process_eml_bytes(attachment.data, is_nested=True)

            msg_data["attachments"].append({
                "file_name": file_name,
                "extracted_text": extracted_text
            })

        return msg_data

    except Exception as e:
        print(f"❌ Error processing MSG file: {str(e)}")
        return None

def process_msg_file(msg_path):
    """Processes a .msg file from a given file path and extracts email content and attachments."""
    try:
        with open(msg_path, "rb") as f:
            msg_bytes = f.read()
        return process_msg_bytes(msg_bytes)
    except Exception as e:
        print(f"❌ Error processing MSG file ({msg_path}): {str(e)}")
        return None




Only for Testing - should be removed

In [24]:
EMAIL_DIR = "/content/drive/MyDrive/EmailClassification/emails/"
for eml_file in os.listdir(EMAIL_DIR):
    if eml_file.endswith(".eml"):
        # Step 1: Process email and extract content
        processed_email = process_eml_file(os.path.join(EMAIL_DIR, eml_file))
        print("📩 Processed Email:", processed_email)

📩 Processed Email: {'subject': 'Facility Lender Share Adjustment', 'from': 'scott.wallace@citizensbank.com', 'to': 'ramakrishna.kunchala@wellsfargo.com', 'date': None, 'body': 'Citizens Bank, N.A.\nLoan Agency Services\n\nDate: 05-Feb-2025\n\nTO: WELLS FARGO BANK, NATIONAL ASSOCIATION\nATTN: RAMAKRISHNA KUNCHALA\nFax: 877-606-9426\n\nRe: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2\n\nDescription: Facility Lender Share Adjustment\nBORROWER: ABTB MID-ATLANTIC LLC\nDEAL NAME: ABB MID-ATLANTIC LLC $171.3MM 11-4-2022\n\nEffective 04-Feb-2025, the Lender Shares of facility TERM LOAN A-2 have been adjusted.\nYour share of the commitment was USD 5,518,249.19. It has been Increased to USD 5,542,963.55.\n\nFor: WELLS FARGO BANK, NA\nReference: ABIB MID-ATLANTIC LIC $171.3MM 11-4-2022\n\nIf you have any questions, please call the undersigned.\n\n** COMMENT *\nPLEASE FUND YOUR SHARE OF $24,714.36\n\nBank Name: Citizens Bank NA\nABA # 011500120\nAccount #: 0026693011\nAccount Name: LIQ 

### Function Calling Api to Extract Request & sub request in json format
Using the Function Calling API to create a function schema to extract data directly into the defined JSON format

In [7]:
functions = [
    {
        "name": "classify_email",
        "description": "Classifies the email into a request type and sub-request type.",
        "parameters": {
            "type": "object",
            "properties": {
                "request_type": {
                    "type": "string",
                    "description": "The high-level category of the request based on the primary intent of the email."
                },
                "sub_request_type": {
                    "type": "string",
                    "description": "The specific sub-category under the request type based on the primary intent of the email."
                },
                "duplicate_flag": {
                    "type": "boolean",
                    "description": "Flag to indicate if the email is a duplicate."
                },
                "confidence_score": {
                    "type": "number",
                    "description": "Confidence score between 0 and 1."
                },
                "reason": {
                    "type": "string",
                    "description": "Explanation for classification and confidence score."
                }
            },
            "required": [
                "request_type",
                "sub_request_type",
                "duplicate_flag",
                "confidence_score",
                "reason"
            ]
        }
    },
    {
        "name": "extract_metadata_fields",
        "description": "Extracts metadata fields from the email based on the identified request type and sub-request type.",
        "parameters": {
            "type": "object",
            "properties": {
                "request_type": {
                    "type": "string",
                    "description": "The request type determined from email classification."
                },
                "sub_request_type": {
                    "type": "string",
                    "description": "The sub-request type determined from email classification."
                },
                "key_metadata_fields": {
                    "type": "object",
                    "description": "A key-value pair map containing extracted metadata fields relevant to the request type and sub-request type.",
                    "minProperties": 1  # 🚨 Ensures at least one metadata field is always present
                }
            },
            "required": [
                "request_type",
                "sub_request_type",
                "key_metadata_fields"
            ]
        }
    }
]


### Create Pormpt for email classification with request type, sub request type, confidence score, reason for classification and duplicate check flag

In [8]:
def create_email_classification_prompt(email_text, duplicate_flag):
    predefined_request_types = """
    Request Types:

    - Adjustment: (Subtypes: Fee Adjustment, Principal Adjustment, Interest Adjustment)
    - AU Transfer: (Subtypes: Intra-Bank Transfer, Inter-Bank Transfer, Scheduled Transfer)
    - Closing Notice: (Subtypes: Reallocation Fees, Amendment Fees, Reallocation Principal)
    - Commitment Change: (Subtypes: Cashless Roll, Decrease, Increase)
    - Fee Payment: (Subtypes: Ongoing Fee, Letter of Credit Fee)
    - Money Movement-Inbound: (Subtypes: Principal, Interest, Principal + Interest, Principal+Interest+Fee)
    - Money Movement-Outbound: (Subtypes: Timebound, Foreign Currency)
    - Loan Origination: (Subtypes: Application Submission, Documentation Provision, Credit Evaluation, Approval Notification)
    - Loan Disbursement: (Subtypes: Fund Transfer, Disbursement Schedule, Disbursement Confirmation)
    - Loan Repayment: (Subtypes: Repayment Schedule Setup, Early Repayment, Payment Rescheduling, Payment Confirmation)
    - Loan Information: (Subtypes: Balance Inquiry, Amortization Schedule, Interest Statement, Tax Certificate)
    - Loan Closure: (Subtypes: Closure Statement, No Dues Certificate, Security Release)
    - Loan Service: (Subtypes: Statement Requests, Document Retrieval, Account Linking)
    - Loan Grievance: (Subtypes: Dispute Resolution, Complaint Registration, Feedback Submission)
    """

    classification_prompt = f"""
    You are a subject matter expert in Commercial Bank Lending Services, responsible for classifying emails into predefined **Request Types** and **Sub Request Types**.

    **Priority Considerations:**
    1. **Email body takes priority** over attachments for classification.
    2. **Primary intent of the customer is the key focus**, even when multiple requests are mentioned.
    3. If an email contains **both a discussion and an explicit ask**, prioritize the **ask** as the primary intent.
    4. **Money movement-related requests take priority** in case of conflicts.
    5. Mark **duplicate emails** based on the flag extracted from similarity search of earlier emails, which is **{duplicate_flag}**.

    **Predefined Request Categories & Subtypes:**
    {predefined_request_types}

    **Task:**
    - Analyze the email and classify it into the most relevant **Request Type** and **Sub Request Type**.
    - Provide a **confidence score** (0.0 to 1.0) indicating the likelihood of correct classification.
    - Explain the **reasoning** for classification.

    **Email Content for Classification:**
    "{email_text}"

    **Output Format (JSON):**
    ```json
    {{
        "request_type": "Determined request type based on primary intent",
        "sub_request_type": "Determined sub-request type",
        "duplicate_flag": {duplicate_flag},
        "confidence_score": confidence_value (between 0-1),
        "reason": "Explanation for classification and confidence score"
    }}
    ```
    """

    return [
        {"role": "system", "content": "You are a subject matter expert in Commercial Bank Lending Services. Your task is to classify emails and determine the request type and sub-request type with high accuracy."},
        {"role": "user", "content": classification_prompt}
    ]


### Create Pormpt for extracting metadata fields from the email based on the extracted request type and sub request type

In [9]:
def create_metadata_fields_extraction_prompt(email_text, request_type, sub_request_type):
  metadata_fields_mapping = {
      "Adjustment - Fee Adjustment": ["deal_name", "adjustment_amount", "effective_date", "lender_name", "reason"],
      "Adjustment - Principal Adjustment": ["deal_name", "principal_amount", "effective_date", "lender_name"],
      "Adjustment - Interest Adjustment": ["deal_name", "interest_amount", "effective_date", "lender_name"],

      "AU Transfer - Intra-Bank Transfer": ["transfer_id", "source_account_number", "destination_account_number", "transfer_amount", "transfer_date"],
      "AU Transfer - Inter-Bank Transfer": ["transfer_id", "source_bank", "destination_bank", "transfer_amount", "transfer_date"],
      "AU Transfer - Scheduled Transfer": ["transfer_id", "source_account_number", "destination_account_number", "transfer_amount", "scheduled_date"],

      "Closing Notice - Reallocation Fees": ["deal_name", "fee_type", "reallocation_amount", "effective_date", "lender_name"],
      "Closing Notice - Amendment Fees": ["deal_name", "fee_type", "amendment_amount", "effective_date", "lender_name"],
      "Closing Notice - Reallocation Principal": ["deal_name", "principal_amount", "effective_date", "lender_name"],

      "Commitment Change - Cashless Roll": ["deal_name", "lender_name", "commitment_type", "amount_rolled", "effective_date"],
      "Commitment Change - Decrease": ["deal_name", "lender_name", "commitment_decrease", "new_commitment_amount", "effective_date"],
      "Commitment Change - Increase": ["deal_name", "lender_name", "commitment_increase", "new_commitment_amount", "effective_date"],

      "Fee Payment - Ongoing Fee": ["fee_type", "due_date", "amount_paid", "outstanding_amount", "payment_date", "reference_number"],
      "Fee Payment - Letter of Credit Fee": ["fee_type", "amount_paid", "payment_date", "credit_reference_number"],

      "Money Movement - Inbound - Principal": ["amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Inbound - Interest": ["amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Inbound - Principal + Interest": ["total_amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Outbound - Timebound": ["amount", "currency", "transaction_id", "receiver_bank", "transfer_date", "time_constraint"],
      "Money Movement - Outbound - Foreign Currency": ["amount", "currency", "exchange_rate", "transaction_id", "receiver_bank", "transfer_date"],

      "Loan Origination - Application Submission": ["borrower_name", "loan_application_id", "requested_amount", "submission_date"],
      "Loan Origination - Documentation Provision": ["borrower_name", "document_type", "submission_date"],
      "Loan Origination - Credit Evaluation": ["borrower_name", "credit_score", "evaluation_date"],
      "Loan Origination - Approval Notification": ["borrower_name", "loan_application_id", "approval_status", "approval_date"],

      "Loan Disbursement - Fund Transfer": ["fund_transfer_id", "disbursement_amount", "disbursement_date", "beneficiary_account"],
      "Loan Disbursement - Disbursement Schedule": ["loan_id", "disbursement_plan", "schedule_dates"],
      "Loan Disbursement - Disbursement Confirmation": ["loan_id", "confirmation_date", "amount_disbursed"],

      "Loan Repayment - Repayment Schedule Setup": ["loan_id", "installment_amount", "payment_due_date", "repayment_term"],
      "Loan Repayment - Early Repayment": ["loan_id", "remaining_balance", "early_repayment_date"],
      "Loan Repayment - Payment Rescheduling": ["loan_id", "new_payment_schedule", "reschedule_reason"],
      "Loan Repayment - Payment Confirmation": ["loan_id", "payment_date", "amount_paid"],

      "Loan Information - Balance Inquiry": ["account_number", "current_balance", "last_transaction_date"],
      "Loan Information - Amortization Schedule": ["loan_id", "remaining_payments", "monthly_installment"],
      "Loan Information - Interest Statement": ["loan_id", "interest_rate", "accrued_interest", "statement_period"],
      "Loan Information - Tax Certificate": ["borrower_name", "loan_id", "tax_year", "interest_paid"],

      "Loan Closure - Closure Statement": ["loan_id", "closure_date", "final_payment_amount"],
      "Loan Closure - No Dues Certificate": ["loan_id", "certificate_issue_date"],
      "Loan Closure - Security Release": ["loan_id", "release_date", "security_details"],

      "Loan Service - Statement Requests": ["account_number", "statement_period", "delivery_preference"],
      "Loan Service - Document Retrieval": ["document_type", "request_date", "delivery_preference"],
      "Loan Service - Account Linking": ["primary_account_number", "linked_account_number", "linking_type"],

      "Loan Grievance - Dispute Resolution": ["dispute_id", "dispute_description", "resolution_status", "resolution_date"],
      "Loan Grievance - Complaint Registration": ["complaint_id", "complaint_category", "complaint_details"],
      "Loan Grievance - Feedback Submission": ["feedback_id", "feedback_category", "feedback_text"]
    }

  # Lookup metadata fields using request_type + sub_request_type
  metadata_fields = metadata_fields_mapping.get(
      f"{request_type} - {sub_request_type}",
      metadata_fields_mapping.get(request_type, [])
  )

  metadata_prompt = f"""
  You are an AI assistant specializing in extracting **structured metadata** from emails related to Commercial Bank Lending Services.

  **Your task:**
  - Extract only the metadata fields relevant to **Request Type: {request_type}** and **Sub-Request Type: {sub_request_type}** and provide the output content in a json format.
  - Ignore classification details like request type, confidence score, or reasoning.
  - Extract metadata **only as key-value pairs**, ensuring values are **explicitly mentioned in the email text**.
  - **Expected Metadata Fields:** {', '.join(metadata_fields)}
  - If metadata_fields is empty, identify the possible metadata_fields based on the request_type {request_type}, sub_request_type {sub_request_type} and your knowledge in commercial bank lending service and prepare a json by extracting corresponding values in the format given in few shot examples.
  - **Output must be a flat JSON object** with key-value pairs.
  - If no valid metadata is found, output  an empty json response.

  ---
  **Email Content for Metadata Extraction:**
  "{email_text}"

  ---
  **Example 1: Commitment Change - Increase**
  **Email:**
  "Dear Loan Team,
  We request an increase in our commitment under the ABC Infrastructure Fund.
  Our current commitment is $2,000,000, and we would like to increase it to $2,500,000.
  Please process this request and confirm.

  Regards,
  John Doe, CFO, XYZ Corp"

  **Expected JSON Output:**
  {{
      "deal_name": "ABC Infrastructure Fund",
      "lender_name": "XYZ Corp",
      "commitment_increase": "USD 500,000",
      "new_commitment_amount": "USD 2,500,000"
  }}

  ---
  **🚨 IMPORTANT:**
  - Ensure **every field** has a value.
  - If no metadata is found, return an empty json response.
  - **Final Output Format:**
  {{
      "field_1": "Extracted value",
      "field_2": "Extracted value"
  }}
  """

  return [
      {"role": "system", "content": "You are an expert AI model trained to extract metadata fields from commercial banking emails. You return only key-value pairs without classification details."},
      {"role": "user", "content": metadata_prompt}
  ]

In [10]:
def get_email_classification(email_text, duplicate_flag, functions):
    user_input = create_email_classification_prompt(email_text, duplicate_flag)
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=user_input,
        functions=functions,
        function_call="auto"
    )
    return response.choices[0].message

In [27]:
def get_metadata_fields(email_text, request_type, sub_request_type, functions):
    user_input = create_metadata_fields_extraction_prompt(email_text, request_type, sub_request_type)
    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=user_input,
        functions=functions,
        function_call="auto"
    )
    return response.choices[0].message

In [12]:
def compute_email_embedding(embedding_model, email_text):
    """Converts email text into an embedding vector compatible with ChromaDB."""
    return embedding_model.encode(email_text, convert_to_numpy=True).tolist()

In [13]:
def check_duplicate_email(email_collection, embedding_model, email_text, threshold=0.90):
    """Checks if the email is a duplicate by performing similarity search in ChromaDB."""
    email_embedding = compute_email_embedding(embedding_model, email_text)

    # Perform similarity search in ChromaDB
    results = email_collection.query(
        query_embeddings=[email_embedding],  # Search for similar emails
        n_results=2
    )

    if len(results['distances'][0]) > 1 and results['distances'][0][1] <= (1 - threshold):
            print(f"Duplicate found: Email is similar to {results['ids'][0][1]} with similarity {1 - results['distances'][0][1]:.2f}")
            return True, results["ids"][0][0]

    return False, None  # Not a duplicate

In [14]:
def store_email_vector(embedding_model, email_collection, unique_eml_file_id, email_text):
    """Stores email embedding in ChromaDB with correct formatting."""
    email_embedding = compute_email_embedding(embedding_model, email_text)

    # Ensure embedding is a list of lists
    if isinstance(email_embedding[0], float):
        email_embedding = [email_embedding]  # Wrap it correctly

    email_collection.add(
        ids=[unique_eml_file_id],  # Unique eml ID
        embeddings=email_embedding,  # Correctly formatted embedding
        metadatas=[{"email_text": email_text}]
    )

In [15]:
def classify_emails_from_dir(email_dir):

  # Initialize ChromaDB
  chroma_client = chromadb.PersistentClient(path="email_vectors_db")
  email_collection = chroma_client.get_or_create_collection(name="emails")

  # Load Sentence Transformer Model for Embeddings
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

  # Process all emails in the directory
  final_outputs = []  # Store final JSON outputs for all emails

  for eml_file in os.listdir(EMAIL_DIR):
      if eml_file.endswith(".eml"):
          # Step 1: Process email and extract content
          processed_email = process_eml_file(os.path.join(EMAIL_DIR, eml_file))
          #print("📩 Processed Email:", processed_email)

          # Step 2: Store the email embedding first before duplicate check
          store_email_vector(embedding_model, email_collection, eml_file, str(processed_email))

          # Step 3: Check for duplicates after storing the embedding
          duplicate_flag, duplicate_id = check_duplicate_email(email_collection, embedding_model, str(processed_email))

          # If it's a duplicate, return JSON with a reason
          if duplicate_flag:
              duplicate_json_output = {
                  "eml_file_name": eml_file,  # First element
                  "duplicate_flag": True,
                  "reason": f"⚠️ Duplicate Email Detected! (Matches ID: {duplicate_id})"
              }
              final_outputs.append(duplicate_json_output)
              #print("📜 Duplicate Email JSON Output:")
              #print(json.dumps(duplicate_json_output, indent=4))
              continue  # Skip further processing for duplicates

          # Step 4: Classify the email
          classification_raw = get_email_classification(processed_email, duplicate_flag, functions)

          # Convert classification response to JSON
          classification_response = json.loads(classification_raw.function_call.arguments)
          #print("📌 Classification Response:", classification_response)

          # Extract request type and sub-request type
          request_type = classification_response.get("request_type")
          sub_request_type = classification_response.get("sub_request_type")

          if not request_type or not sub_request_type:
              print("❌ Error: Missing request_type or sub_request_type.")
              continue  # Skip to next email

          # Step 5: Extract metadata fields
          metadata_raw = get_metadata_fields(request_type, sub_request_type, processed_email, functions)
          # print(metadata_raw)

          try:
            if metadata_raw.content is None or str(metadata_raw.content).strip().lower() == "none":
                metadata_response = {}
            else:
              metadata_response = json.loads(metadata_raw.content)
          except Exception as e:
              metadata_response = {"error": f"Failed to parse metadata response: {str(e)}"}

          # print("🔑 Metadata Response:", metadata_response)

          # Step 6: Construct final JSON output
          final_json_output = {
              "eml_file_name": eml_file,  # First element
              **classification_response,  # Classification elements (request_type, sub_request_type, etc.)
              "metadata_fields": metadata_response  # Metadata as key-value pair
          }

          final_outputs.append(final_json_output)

          # Print final formatted output
          # print("📜 Final JSON Output:")
          print(json.dumps(final_json_output, indent=4))
  return final_outputs

In [18]:
EMAIL_DIR = "/content/drive/MyDrive/EmailClassification/emails/"

extracted_detail_json_list = classify_emails_from_dir(EMAIL_DIR)
print(extracted_detail_json_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



{
    "eml_file_name": "sample_email_duplicate.eml",
    "request_type": "Adjustment",
    "sub_request_type": "Principal Adjustment",
    "duplicate_flag": false,
    "confidence_score": 0.85,
    "reason": "The email mentions a 'Facility Lender Share Adjustment' where the lender shares have been adjusted from USD 5,518,249.19 to USD 5,542,963.55. This aligns with the 'Adjustment' request type and specifically falls under 'Principal Adjustment' subtype. The content indicates a clear request for adjustment without any conflicting or additional requests.",
    "metadata_fields": {
        "borrower": "ABTB MID-ATLANTIC LLC",
        "deal_name": "ABB MID-ATLANTIC LLC $171.3MM 11-4-2022",
        "effective_date": "04-Feb-2025",
        "previous_commitment_amount": "USD 5,518,249.19",
        "new_commitment_amount": "USD 5,542,963.55",
        "commitment_change_amount": "USD 24,714.36"
    }
}
Duplicate found: Email is similar to sample_email.eml with similarity 1.00
{
    "eml_file_n

## UI code

In [19]:
import time
def format_metadata_as_html(metadata_dict):
    """Formats metadata fields as an HTML table."""
    if not metadata_dict or not isinstance(metadata_dict, dict):
        return "-"

    metadata_table = "<table border='1' style='border-collapse: collapse; width: 100%; font-size: 12px;'>"
    for key, value in metadata_dict.items():
        metadata_table += f"<tr><td><b>{key}</b></td><td>{value}</td></tr>"
    metadata_table += "</table>"

    return metadata_table

def process_email_directory(email_dir):
    """Processes email directory and returns classification details as an HTML table with a loading indicator."""
    if not os.path.exists(email_dir):
        return "<p style='color:red;'>❌ Directory does not exist!</p>"

    # Display loading message
    loading_message = "<p style='color:blue; font-size: 16px;'>⏳ Processing emails... Please wait.</p>"
    yield loading_message  # Show loading text in UI

    time.sleep(1)  # Simulating a short delay for better UX

    # Call the actual function to classify emails
    result_json = classify_emails_from_dir(email_dir)

    # Convert JSON list into an HTML table
    table_html = "<table border='1' style='border-collapse: collapse; width: 100%;'>"
    table_html += "<tr><th>S.No</th><th>Email File</th><th>Request Type</th><th>Sub Request Type</th><th>Duplicate?</th><th>Confidence</th><th>Reason</th><th>Metadata Fields</th></tr>"

    for index, entry in enumerate(result_json, start=1):
        metadata_html = format_metadata_as_html(entry.get("metadata_fields", {}))  # Subtable for metadata
        row_html = f"""
            <tr>
                <td>{index}</td>
                <td>{entry.get("eml_file_name", "")}</td>
                <td>{entry.get("request_type", "-")}</td>
                <td>{entry.get("sub_request_type", "-")}</td>
                <td>{"✅" if entry.get("duplicate_flag") else "❌"}</td>
                <td>{f"{entry.get('confidence_score'):.2f}" if entry.get("confidence_score") else "-"}</td>
                <td>{entry.get("reason", "-")}</td>
                <td>{metadata_html}</td>
            </tr>
        """
        table_html += row_html

    table_html += "</table>"

    yield table_html  # Update with final results

# Gradio UI Setup
with gr.Blocks() as demo:
    gr.Markdown("## 📩 Email Classification & Metadata Extraction")
    gr.Markdown("### Enter Email Directory Path and Classify Emails")

    email_dir_input = gr.Textbox(label="Email Directory Path", placeholder="Enter the path to email directory")
    classify_button = gr.Button("Classify Emails")

    output_html = gr.HTML()

    classify_button.click(process_email_directory, inputs=email_dir_input, outputs=output_html)

# Launch the Gradio Interface
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bd03da3ea5be24cb78.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Only for Testing - use the below code for cleaning up the collection from vector db

In [28]:
chroma_client = chromadb.PersistentClient(path="email_vectors_db")
chroma_client.delete_collection(name='emails')

## Only for Testing - use the below cod for generating eml file with attachment

In [32]:
import smtplib
import email
import pdfplumber
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from fpdf import FPDF

# Step 1: Create the PDF Attachment
pdf_content = """\
BANK OF AMERICA
Bank of America, N.A.

To: WELLS FARGO BANK NATIONAL ASSOCIATION
Date: 8-Nov-2023
ATTN:
Phone: 999-999-9999
Fax: 877-606-9426
Email: DENLCFX@wellsfargo.com

Re: CANTOR FITZGERALD LP USD 425MM MAR22 / REVOLVER / CANTOR FIT00037

Deal CUSIP: 13861EAE0
Deal ISIN: US13861EAE05
Facility CUSIP: 13861EAF7
Facility ISIN: US13861EAF79
Lender MEI: US1L058422

Effective 10-Nov-2023, CANTOR FITZGERALD LP has elected to repay under the SOFR (US) Term option, a total of USD 20,000,000.00.
Previous Global principal balance: USD 45,000,000.00
New Global principal balance: USD 25,000,000.00

Your share of the USD 20,000,000.00 SOFR (US) Term option payment is USD 1,411,764.71.
Previous Lender Share Principal Balance: USD 3,176,470.59
New Lender Share Principal Balance: USD 1,764,705.88

We will remit USD 1,411,764.71 on the effective date. Please note that:
(i) if the Borrower has not in fact made such payment; or
(ii) any payment you receive is in excess of what was paid by the Borrower or
(iii) we notify you that the payment was erroneously made, then pursuant to the provisions of the credit facility, you agree to return such payment.

For: WELLS FARGO BANK NATIONAL ASSOCIATION
To: WELLS FARGO BANK, NA
ABA Number: 121000248
Account No: XXXXXXXXXX0720
Reference: CANTOR FITZGERALD LP USD 425MM MAR22, SOFR (US) Term Principal Payment (CANTOR FIT00037)

Thanks & Regards,
JONNY HERNANDEZ
Telephone #: +19803883225
Email id: jonny.hernandez@bofa.com
"""

# Create PDF
pdf_filename = "attachment.pdf"
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", size=12)

for line in pdf_content.split("\n"):
    pdf.cell(200, 10, txt=line, ln=True)

pdf.output(pdf_filename)

# Step 2: Create the Email Message in .eml Format
msg = MIMEMultipart()
msg["From"] = "scott.wallace@citizensbank.com"
msg["To"] = "ramakrishna.kunchala@wellsfargo.com"
msg["Subject"] = "Facility Lender Share Adjustment"

email_body = """\
Citizens Bank, N.A.
Loan Agency Services

Date: 05-Feb-2025

TO: WELLS FARGO BANK, NATIONAL ASSOCIATION
ATTN: RAMAKRISHNA KUNCHALA
Fax: 877-606-9426

Re: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2

Description: Facility Lender Share Adjustment
BORROWER: ABTB MID-ATLANTIC LLC
DEAL NAME: ABB MID-ATLANTIC LLC $171.3MM 11-4-2022

Effective 04-Feb-2025, the Lender Shares of facility TERM LOAN A-2 have been adjusted.
Your share of the commitment was USD 5,518,249.19. It has been Increased to USD 5,542,963.55.

For: WELLS FARGO BANK, NA
Reference: ABIB MID-ATLANTIC LIC $171.3MM 11-4-2022

If you have any questions, please call the undersigned.

** COMMENT *
PLEASE FUND YOUR SHARE OF $24,714.36

Bank Name: Citizens Bank NA
ABA # 011500120
Account #: 0026693011
Account Name: LIQ CLO Operating Account
Ref: ABTB Mid-Atlantic LLC

Regards,
SCOTT WALLACE
"""

msg.attach(MIMEText(email_body, "plain"))

# Attach PDF
with open(pdf_filename, "rb") as attachment:
    part = MIMEBase("application", "octet-stream")
    part.set_payload(attachment.read())

# Encode attachment
encoders.encode_base64(part)
part.add_header(
    "Content-Disposition",
    f"attachment; filename={pdf_filename}",
)
msg.attach(part)

# Save as .eml file
eml_filename = "sample_email.eml"
with open(eml_filename, "w") as eml_file:
    eml_file.write(msg.as_string())

print(f"EML file '{eml_filename}' has been generated successfully.")


EML file 'sample_email.eml' has been generated successfully.
