# Email & document classification for triage

## Email Classification
- Uses GPT to classify emails into Request Type and Sub Request Type.
- Returns a confidence score for classification.

In [56]:
!pip install --no-cache-dir openai langchain langchain_openai pdfplumber pdf2image easyocr python-docx fpdf pymupdf chromadb sentence-transformers gradio pandas



## Import the Required Libraries

In [57]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

import chromadb
from sentence_transformers import SentenceTransformer

import numpy as np
import os
import openai
import json
import requests
import pandas as pd
from docx import Document
import email
import fitz  # PyMuPDF for PDFs
import docx
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError
import mimetypes
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import uuid
import base64

import gradio as gr


## Mount Google Drive

In [58]:
# Mount Google Drive at /content/drive; later cells read the API key file and
# the sample .eml files from paths under /content/drive/MyDrive/.
# NOTE(review): force_remount=True presumably re-mounts an already mounted
# drive when this cell is re-run — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Set up the API key for the OpenAI API

In [59]:
# Folder on the mounted drive that holds the project files, including the key file.
folder_path = "/content/drive/MyDrive/EmailClassification/"

# Read the key from the text file. Surrounding whitespace is stripped because a
# text file normally ends with a newline, and a key carrying that newline is
# not the key that was issued.
with open(folder_path + 'OpenAI_API_Key.txt', 'r') as f:
  openai.api_key = f.read().strip()

# Expose the same key through the environment variable so libraries that read
# OPENAI_API_KEY (rather than openai.api_key) pick it up as well.
os.environ["OPENAI_API_KEY"] = openai.api_key

### Helpers for extracting text and attachments from an email (.eml files with PDF, Word, image, or nested .eml attachments)

In [60]:
import os
import email
import easyocr
import pdfplumber
import traceback
import numpy as np
from email import policy
from email.parser import BytesParser
from io import BytesIO
from PIL import Image
from docx import Document
from bs4 import BeautifulSoup  # For HTML parsing

# Create the EasyOCR reader once at module level; extract_text_from_image()
# reuses this single instance for every image attachment.
ocr_reader = easyocr.Reader(["en"])  # Specify language (English)

def extract_text_from_pdf_bytes(pdf_bytes):
    """Return the text of a PDF supplied as raw bytes.

    Pages are read in order with pdfplumber; pages that yield no text are
    skipped. Any failure is reported on stdout and whatever text was collected
    before the failure is still returned (an empty string if nothing was read).
    """
    page_chunks = []
    try:
        with pdfplumber.open(BytesIO(pdf_bytes)) as document:
            for pdf_page in document.pages:
                page_text = pdf_page.extract_text()
                if not page_text:
                    continue
                page_chunks.append(page_text + "\n")
    except Exception as e:
        print(f"❌ PDF Extraction Error: {str(e)}")
    return "".join(page_chunks).strip()

def extract_text_from_docx_bytes(doc_bytes):
    """Return the paragraph text of a Word document supplied as raw bytes.

    The bytes are opened with python-docx's ``Document`` and the paragraphs are
    joined with newlines. On any failure the error is printed and an empty
    string is returned.
    """
    try:
        document = Document(BytesIO(doc_bytes))
        joined_paragraphs = "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        print(f"❌ DOC/DOCX Extraction Error: {str(e)}")
        return ""
    return joined_paragraphs.strip()

def extract_text_from_image(image_bytes):
    """Run OCR on an image supplied as raw bytes and return the detected text.

    The image is opened with PIL, converted to a NumPy array and passed to the
    shared ``ocr_reader``; the second element of every OCR result is joined
    with newlines. On any failure the error is printed and an empty string is
    returned.
    """
    try:
        pixel_array = np.array(Image.open(BytesIO(image_bytes)))
        detections = ocr_reader.readtext(pixel_array)
        recognized = "\n".join(detection[1] for detection in detections)
    except Exception as e:
        print(f"❌ Image Extraction Error: {str(e)}")
        return ""
    return recognized.strip()

def _decode_text_part(part):
    """Decode a text MIME part to ``str``, honouring the charset it declares."""
    raw_bytes = part.get_payload(decode=True)
    if raw_bytes is None:
        # No decodable payload: treat as empty instead of failing the whole email.
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        text = raw_bytes.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown charset label or bytes that do not match it: fall back to a
        # lenient UTF-8 decode so the email is still processed.
        text = raw_bytes.decode(errors="ignore")
    return text.strip()


def process_eml_bytes(eml_bytes, is_nested=False):
    """Parse an email given as raw bytes into a dictionary.

    Args:
        eml_bytes: The raw RFC 822 message bytes.
        is_nested: True when the message is itself an attachment of another
            email. Body extraction is skipped for nested messages; only headers
            and attachments are collected.

    Returns:
        A dict with the keys ``subject``, ``from``, ``to``, ``date``, ``body``
        and ``attachments``, or ``None`` if parsing failed (the traceback is
        printed). Each attachment entry holds ``file_name`` and
        ``content_type`` plus either ``extracted_text`` (PDF, Word, image and
        other named parts) or ``nested_email`` (the dict produced by a
        recursive call for an attached email).
    """
    try:
        msg = BytesParser(policy=policy.default).parse(BytesIO(eml_bytes))

        email_data = {
            "subject": msg["subject"],
            "from": msg["from"],
            "to": msg["to"],
            "date": msg["date"],
            "body": "",
            "attachments": []
        }

        # Extract email body (only for non-nested emails).
        # walk() also visits the parts of attached message/rfc822 emails, so
        # their inline text ends up in this body as well. That is kept on
        # purpose: nested messages do not extract a body of their own.
        if not is_nested:
            body_text = []
            for part in msg.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition", ""))

                # Plain-text parts that are not marked as attachments
                if content_type == "text/plain" and "attachment" not in content_disposition:
                    decoded_text = _decode_text_part(part)
                    if decoded_text:
                        body_text.append(decoded_text)

                # Fall back to HTML only while no plain text has been collected
                elif content_type == "text/html" and not body_text:
                    soup = BeautifulSoup(part.get_payload(decode=True), "html.parser")
                    decoded_text = soup.get_text().strip()
                    if decoded_text:
                        body_text.append(decoded_text)

            email_data["body"] = "\n".join(body_text).strip()

        # Parts that belong to an already-processed nested email. They are
        # reported inside that email's own "attachments" list, so the outer
        # loop must not list them a second time.
        nested_part_ids = set()

        # Process attachments
        for part in msg.walk():
            if id(part) in nested_part_ids:
                continue

            content_type = part.get_content_type()
            file_name = part.get_filename()
            payload = part.get_payload(decode=True)  # Decoded bytes; None for multipart containers
            extracted_text = ""
            is_embedded_message = content_type == "message/rfc822"

            if not file_name and is_embedded_message:
                # Gmail often stores nested EMLs without a filename
                file_name = "nested_email.eml"

            if not file_name:
                continue  # Ignore if no filename is found

            lowered_name = file_name.lower()

            # Handle PDFs
            if content_type in ["application/pdf", "application/octet-stream"] and lowered_name.endswith(".pdf"):
                extracted_text = extract_text_from_pdf_bytes(payload)

            # Handle Word Documents
            elif content_type in ["application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] or lowered_name.endswith((".doc", ".docx")):
                extracted_text = extract_text_from_docx_bytes(payload)

            # Handle Images (suffixes include the dot so only real extensions match)
            elif content_type.startswith("image/") or (content_type == "application/octet-stream" and lowered_name.endswith((".png", ".jpg", ".jpeg"))):
                extracted_text = extract_text_from_image(payload)

            # Handle nested EML files (Gmail nested emails included)
            elif is_embedded_message or (content_type == "application/octet-stream" and lowered_name.endswith(".eml")):
                if is_embedded_message and part.is_multipart():
                    # The parser has already turned a message/rfc822 part into a
                    # Message object, so get_payload(decode=True) is None here.
                    # Serialize the embedded message back to bytes instead.
                    nested_message = part.get_payload(0)
                    nested_part_ids.update(id(sub_part) for sub_part in nested_message.walk())
                    try:
                        nested_bytes = nested_message.as_bytes()
                    except Exception:
                        print(f"❌ Nested Email Serialization Error: {traceback.format_exc()}")
                        nested_bytes = b""
                else:
                    # .eml attached as opaque binary: the decoded payload is the message
                    nested_bytes = payload

                nested_email_data = process_eml_bytes(nested_bytes, is_nested=True)  # Process as nested email

                email_data["attachments"].append({
                    "file_name": file_name,
                    "content_type": content_type,
                    "nested_email": nested_email_data  # Store full nested email structure instead of text
                })
                continue  # Skip normal processing for nested emails

            email_data["attachments"].append({
                "file_name": file_name,
                "content_type": content_type,
                "extracted_text": extracted_text
            })

        return email_data

    except Exception as e:
        print(f"❌ Email Parsing Error: {traceback.format_exc()}")
        return None

def process_eml_file(eml_path):
    """Read the .eml file at ``eml_path`` and return its parsed content.

    The file is read in binary mode and handed to ``process_eml_bytes``. If
    reading or processing raises, the error is printed together with the path
    and ``None`` is returned.
    """
    try:
        with open(eml_path, "rb") as eml_handle:
            raw_message = eml_handle.read()
    except Exception as e:
        print(f"❌ Error processing EML file ({eml_path}): {str(e)}")
        return None

    try:
        return process_eml_bytes(raw_message)
    except Exception as e:
        print(f"❌ Error processing EML file ({eml_path}): {str(e)}")
        return None





Only for testing — this cell should be removed.

In [61]:
# Smoke test: parse every .eml file in the sample folder and print the result.
EMAIL_DIR = "/content/drive/MyDrive/EmailClassification/emails/"
eml_paths = [
    os.path.join(EMAIL_DIR, entry_name)
    for entry_name in os.listdir(EMAIL_DIR)
    if entry_name.endswith(".eml")
]
for eml_path in eml_paths:
    # Extract headers, body and attachments from the email
    processed_email = process_eml_file(eml_path)
    print("📩 Processed Email:", processed_email)

📩 Processed Email: {'subject': 'Facility Lender Share Adjustment', 'from': 'scott.wallace@citizensbank.com', 'to': 'ramakrishna.kunchala@wellsfargo.com', 'date': None, 'body': 'Citizens Bank, N.A.\nLoan Agency Services\n\nDate: 05-Feb-2025\n\nTO: WELLS FARGO BANK, NATIONAL ASSOCIATION\nATTN: RAMAKRISHNA KUNCHALA\nFax: 877-606-9426\n\nRe: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2\n\nDescription: Facility Lender Share Adjustment\nBORROWER: ABTB MID-ATLANTIC LLC\nDEAL NAME: ABB MID-ATLANTIC LLC $171.3MM 11-4-2022\n\nEffective 04-Feb-2025, the Lender Shares of facility TERM LOAN A-2 have been adjusted.\nYour share of the commitment was USD 5,518,249.19. It has been Increased to USD 5,542,963.55.\n\nFor: WELLS FARGO BANK, NA\nReference: ABIB MID-ATLANTIC LIC $171.3MM 11-4-2022\n\nIf you have any questions, please call the undersigned.\n\n** COMMENT *\nPLEASE FUND YOUR SHARE OF $24,714.36\n\nBank Name: Citizens Bank NA\nABA # 011500120\nAccount #: 0026693011\nAccount Name: LIQ 

### Function-calling schema for extracting the request type and sub-request type as JSON
Uses the OpenAI function-calling API: the schema below lets the model return the classification directly in the defined JSON format.

In [62]:
# Function schema handed to the chat-completion calls through their `functions`
# argument (see get_email_classification and get_metadata_fields).
# It declares one function, `classify_email`, whose arguments form the
# classification record: request_type, sub_request_type, duplicate_flag,
# confidence_score and reason — all five are listed as required.
functions = [
    {
        "name": "classify_email",
        "description": "Classifies the email into a request type and sub-request type.",
        "parameters": {
            "type": "object",
            "properties": {
                "request_type": {
                    "type": "string",
                    "description": "The high-level category of the request based on the primary intent of the email."
                },
                "sub_request_type": {
                    "type": "string",
                    "description": "The specific sub-category under the request type based on the primary intent of the email."
                },
                "duplicate_flag": {
                    "type": "boolean",
                    "description": "Flag to indicate if the email is a duplicate."
                },
                "confidence_score": {
                    "type": "number",
                    "description": "Confidence score between 0 and 1."
                },
                "reason": {
                    "type": "string",
                    "description": "Explanation for classification and confidence score."
                }
            },
            "required": [
                "request_type",
                "sub_request_type",
                "duplicate_flag",
                "confidence_score",
                "reason"
            ]
        }
    }
]


### Create the prompt for email classification (request type, sub-request type, confidence score, reason for classification, and duplicate-check flag)

In [63]:
def create_email_classification_prompt(email_text, duplicate_flag, additional_rules=None, additional_request_types=None):
    """Build the chat messages that ask the model to classify one email.

    Args:
        email_text: The email content to classify; it is interpolated into the
            prompt as-is.
        duplicate_flag: Result of the earlier duplicate check; it is quoted in
            the priority rules and in the expected output format.
        additional_rules: Optional extra classification rules appended after
            the default priority considerations.
        additional_request_types: Optional extra request types/subtypes
            appended after the predefined list.

    Returns:
        A two-element list of chat messages (system, then user).
    """
    # Default Predefined Request Types
    predefined_request_types = """
    Request Types:

    - Adjustment: (Subtypes: Fee Adjustment, Principal Adjustment, Interest Adjustment)
    - AU Transfer: (Subtypes: Intra-Bank Transfer, Inter-Bank Transfer, Scheduled Transfer)
    - Closing Notice: (Subtypes: Reallocation Fees, Amendment Fees, Reallocation Principal)
    - Commitment Change: (Subtypes: Cashless Roll, Decrease, Increase)
    - Fee Payment: (Subtypes: Ongoing Fee, Letter of Credit Fee)
    - Money Movement-Inbound: (Subtypes: Principal, Interest, Principal + Interest, Principal+Interest+Fee)
    - Money Movement-Outbound: (Subtypes: Timebound, Foreign Currency)
    - Loan Origination: (Subtypes: Application Submission, Documentation Provision, Credit Evaluation, Approval Notification)
    - Loan Disbursement: (Subtypes: Fund Transfer, Disbursement Schedule, Disbursement Confirmation)
    - Loan Repayment: (Subtypes: Repayment Schedule Setup, Early Repayment, Payment Rescheduling, Payment Confirmation)
    - Loan Information: (Subtypes: Balance Inquiry, Amortization Schedule, Interest Statement, Tax Certificate)
    - Loan Closure: (Subtypes: Closure Statement, No Dues Certificate, Security Release)
    - Loan Service: (Subtypes: Statement Requests, Document Retrieval, Account Linking)
    - Loan Grievance: (Subtypes: Dispute Resolution, Complaint Registration, Feedback Submission)
    """

    # Append Additional Request Types if Provided
    if additional_request_types:
        predefined_request_types += f"\n\n### Additional Request Types & Subtypes:\n{additional_request_types}"

    # Default Priority Considerations.
    # This must be an f-string: it is inserted into the final prompt as an
    # already-built value, so a plain string would hand the model the literal
    # text "{duplicate_flag}" instead of the actual flag.
    priority_considerations = f"""
    **Priority Considerations:**
    1. **Email body takes priority** over attachments for classification.
    2. **Primary intent of the customer is the key focus**, even when multiple requests are mentioned.
    3. If an email contains **both a discussion and an explicit ask**, prioritize the **ask** as the primary intent.
    4. **Money movement-related requests take priority** in case of conflicts.
    5. Mark **duplicate emails** based on the flag extracted from similarity search of earlier emails, which is **{duplicate_flag}**.
    6. If multiple request types are identified, explain the reson for prioritization in the reason field.
    """

    # Append Additional Rules if Provided (after formatting, so braces in
    # user-supplied rules are never treated as placeholders)
    if additional_rules:
        priority_considerations += f"\n\n### Additional Classification Rules:\n{additional_rules}"

    # Construct the Classification Prompt
    classification_prompt = f"""
    You are a subject matter expert in Commercial Bank Lending Services, responsible for classifying emails into predefined **Request Types** and **Sub Request Types**.

    {priority_considerations}

    **Predefined Request Categories & Subtypes:**
    {predefined_request_types}

    **Task:**
    - Analyze the email and classify it into the most relevant **Request Type** and **Sub Request Type**.
    - Provide a **confidence score** (0.0 to 1.0) indicating the likelihood of correct classification.
    - Explain the **reasoning** for classification.

    **Email Content for Classification:**
    "{email_text}"

    **Output Format (JSON):**
    {{
        "request_type": "Determined request type based on primary intent",
        "sub_request_type": "Determined sub-request type",
        "duplicate_flag": {duplicate_flag},
        "confidence_score": confidence_value (between 0-1),
        "reason": "Explanation for classification and confidence score"
    }}
    """

    return [
        {"role": "system", "content": "You are a subject matter expert in Commercial Bank Lending Services. Your task is to classify emails and determine the request type and sub-request type with high accuracy."},
        {"role": "user", "content": classification_prompt}
    ]


### Create the prompt for extracting metadata fields from the email, based on the extracted request type and sub-request type

In [64]:
def create_metadata_fields_extraction_prompt(email_text, request_type, sub_request_type):
  """Build the chat messages that ask the model to extract metadata fields.

  The expected field names are looked up from a table keyed by
  "<request type> - <sub request type>". The lookup ignores letter case and
  whitespace, because the classification prompt spells some types differently
  from this table (for example "Money Movement-Inbound" versus
  "Money Movement - Inbound"). When nothing matches, the field list is empty
  and the prompt tells the model to infer suitable fields itself.

  Args:
      email_text: The email content; interpolated into the prompt as-is.
      request_type: Request type produced by the classification step.
      sub_request_type: Sub-request type produced by the classification step.

  Returns:
      A two-element list of chat messages (system, then user).
  """
  metadata_fields_mapping = {
      "Adjustment - Fee Adjustment": ["deal_name", "adjustment_amount", "effective_date", "lender_name", "reason"],
      "Adjustment - Principal Adjustment": ["deal_name", "principal_amount", "effective_date", "lender_name"],
      "Adjustment - Interest Adjustment": ["deal_name", "interest_amount", "effective_date", "lender_name"],

      "AU Transfer - Intra-Bank Transfer": ["transfer_id", "source_account_number", "destination_account_number", "transfer_amount", "transfer_date"],
      "AU Transfer - Inter-Bank Transfer": ["transfer_id", "source_bank", "destination_bank", "transfer_amount", "transfer_date"],
      "AU Transfer - Scheduled Transfer": ["transfer_id", "source_account_number", "destination_account_number", "transfer_amount", "scheduled_date"],

      "Closing Notice - Reallocation Fees": ["deal_name", "fee_type", "reallocation_amount", "effective_date", "lender_name"],
      "Closing Notice - Amendment Fees": ["deal_name", "fee_type", "amendment_amount", "effective_date", "lender_name"],
      "Closing Notice - Reallocation Principal": ["deal_name", "principal_amount", "effective_date", "lender_name"],

      "Commitment Change - Cashless Roll": ["deal_name", "lender_name", "commitment_type", "amount_rolled", "effective_date"],
      "Commitment Change - Decrease": ["deal_name", "lender_name", "commitment_decrease", "new_commitment_amount", "effective_date"],
      "Commitment Change - Increase": ["deal_name", "lender_name", "commitment_increase", "new_commitment_amount", "effective_date"],

      "Fee Payment - Ongoing Fee": ["fee_type", "due_date", "amount_paid", "outstanding_amount", "payment_date", "reference_number"],
      "Fee Payment - Letter of Credit Fee": ["fee_type", "amount_paid", "payment_date", "credit_reference_number"],

      "Money Movement - Inbound - Principal": ["amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Inbound - Interest": ["amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Inbound - Principal + Interest": ["total_amount", "currency", "transaction_id", "sender_bank", "receiver_bank", "transfer_date"],
      "Money Movement - Outbound - Timebound": ["amount", "currency", "transaction_id", "receiver_bank", "transfer_date", "time_constraint"],
      "Money Movement - Outbound - Foreign Currency": ["amount", "currency", "exchange_rate", "transaction_id", "receiver_bank", "transfer_date"],

      "Loan Origination - Application Submission": ["borrower_name", "loan_application_id", "requested_amount", "submission_date"],
      "Loan Origination - Documentation Provision": ["borrower_name", "document_type", "submission_date"],
      "Loan Origination - Credit Evaluation": ["borrower_name", "credit_score", "evaluation_date"],
      "Loan Origination - Approval Notification": ["borrower_name", "loan_application_id", "approval_status", "approval_date"],

      "Loan Disbursement - Fund Transfer": ["fund_transfer_id", "disbursement_amount", "disbursement_date", "beneficiary_account"],
      "Loan Disbursement - Disbursement Schedule": ["loan_id", "disbursement_plan", "schedule_dates"],
      "Loan Disbursement - Disbursement Confirmation": ["loan_id", "confirmation_date", "amount_disbursed"],

      "Loan Repayment - Repayment Schedule Setup": ["loan_id", "installment_amount", "payment_due_date", "repayment_term"],
      "Loan Repayment - Early Repayment": ["loan_id", "remaining_balance", "early_repayment_date"],
      "Loan Repayment - Payment Rescheduling": ["loan_id", "new_payment_schedule", "reschedule_reason"],
      "Loan Repayment - Payment Confirmation": ["loan_id", "payment_date", "amount_paid"],

      "Loan Information - Balance Inquiry": ["account_number", "current_balance", "last_transaction_date"],
      "Loan Information - Amortization Schedule": ["loan_id", "remaining_payments", "monthly_installment"],
      "Loan Information - Interest Statement": ["loan_id", "interest_rate", "accrued_interest", "statement_period"],
      "Loan Information - Tax Certificate": ["borrower_name", "loan_id", "tax_year", "interest_paid"],

      "Loan Closure - Closure Statement": ["loan_id", "closure_date", "final_payment_amount"],
      "Loan Closure - No Dues Certificate": ["loan_id", "certificate_issue_date"],
      "Loan Closure - Security Release": ["loan_id", "release_date", "security_details"],

      "Loan Service - Statement Requests": ["account_number", "statement_period", "delivery_preference"],
      "Loan Service - Document Retrieval": ["document_type", "request_date", "delivery_preference"],
      "Loan Service - Account Linking": ["primary_account_number", "linked_account_number", "linking_type"],

      "Loan Grievance - Dispute Resolution": ["dispute_id", "dispute_description", "resolution_status", "resolution_date"],
      "Loan Grievance - Complaint Registration": ["complaint_id", "complaint_category", "complaint_details"],
      "Loan Grievance - Feedback Submission": ["feedback_id", "feedback_category", "feedback_text"]
    }

  def _normalize_label(label):
    # Lower-case and drop every whitespace character so that
    # "Money Movement-Inbound - Principal" and
    # "Money Movement - Inbound - Principal" compare equal.
    return "".join(str(label).lower().split())

  normalized_mapping = {_normalize_label(key): fields for key, fields in metadata_fields_mapping.items()}

  # Lookup metadata fields using request_type + sub_request_type, falling back
  # to the request type alone and finally to an empty list.
  metadata_fields = normalized_mapping.get(
      _normalize_label(f"{request_type} - {sub_request_type}"),
      normalized_mapping.get(_normalize_label(request_type), [])
  )

  metadata_prompt = f"""
  You are an AI assistant specializing in extracting **structured metadata** from emails related to Commercial Bank Lending Services.

  **Your task:**
  - Extract only the metadata fields relevant to **Request Type: {request_type}** and **Sub-Request Type: {sub_request_type}** and provide the output content in a json format.
  - Ignore classification details like request type, confidence score, or reasoning.
  - Extract metadata **only as key-value pairs**, ensuring values are **explicitly mentioned in the email text**.
  - **Expected Metadata Fields:** {', '.join(metadata_fields)}
  - If metadata_fields is empty, identify the possible metadata_fields based on the request_type {request_type}, sub_request_type {sub_request_type} and your knowledge in commercial bank lending service and prepare a json by extracting corresponding values in the format given in few shot examples.
  - **Output must be a flat JSON object** with key-value pairs.
  - If no valid metadata is found, output  an empty json response.

  ---
  **Email Content for Metadata Extraction:**
  "{email_text}"

  ---
  **Example 1: Commitment Change - Increase**
  **Email:**
  "Dear Loan Team,
  We request an increase in our commitment under the ABC Infrastructure Fund.
  Our current commitment is $2,000,000, and we would like to increase it to $2,500,000.
  Please process this request and confirm.

  Regards,
  John Doe, CFO, XYZ Corp"

  **Expected JSON Output:**
  {{
      "deal_name": "ABC Infrastructure Fund",
      "lender_name": "XYZ Corp",
      "commitment_increase": "USD 500,000",
      "new_commitment_amount": "USD 2,500,000"
  }}

  ---
  **🚨 IMPORTANT:**
  - Ensure **every field** has a value.
  - If no metadata is found, return an empty json response.
  - **Final Output Format:**
  {{
      "field_1": "Extracted value",
      "field_2": "Extracted value"
  }}
  """

  return [
      {"role": "system", "content": "You are an expert AI model trained to extract metadata fields from commercial banking emails. You return only key-value pairs without classification details."},
      {"role": "user", "content": metadata_prompt}
  ]

In [65]:
def get_email_classification(email_text, duplicate_flag, functions, additional_rules=None, additional_request_types=None):
    """Ask the chat model to classify one email and return the reply message.

    Args:
        email_text: Email content to classify.
        duplicate_flag: Result of the duplicate check, quoted in the prompt.
        functions: Function schemas offered to the model.
        additional_rules: Optional extra classification rules for the prompt.
        additional_request_types: Optional extra request types for the prompt.

    Returns:
        The message of the first completion choice. Callers read the
        classification from ``message.function_call.arguments``.
    """
    user_input = create_email_classification_prompt(email_text, duplicate_flag, additional_rules, additional_request_types)

    # With "auto" the model may answer in free text, which leaves
    # message.function_call as None and breaks callers that parse its
    # arguments. When exactly one function is offered, require that function.
    function_call = "auto"
    if functions and len(functions) == 1 and functions[0].get("name"):
        function_call = {"name": functions[0]["name"]}

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=user_input,
        functions=functions,
        function_call=function_call
    )
    return response.choices[0].message

In [76]:
def get_metadata_fields(email_text, request_type, sub_request_type, functions):
    """Ask the chat model to extract metadata fields and return the reply message.

    Args:
        email_text: Email content to extract metadata from.
        request_type: Request type from the classification step.
        sub_request_type: Sub-request type from the classification step.
        functions: Function schemas forwarded to the API call.

    Returns:
        The message of the first completion choice; the extracted metadata is
        expected as JSON text in ``message.content``.
    """
    user_input = create_metadata_fields_extraction_prompt(email_text, request_type, sub_request_type)

    request_kwargs = {
        "model": "gpt-4-turbo",
        "messages": user_input,
    }
    if functions:
        # The result is read from message.content. With function_call="auto"
        # the model may reply with a function call instead, leaving content as
        # None and the metadata silently empty; "none" forces a text reply.
        request_kwargs["functions"] = functions
        request_kwargs["function_call"] = "none"

    response = openai.chat.completions.create(**request_kwargs)
    return response.choices[0].message

In [67]:
def compute_email_embedding(embedding_model, email_text):
    """Encode ``email_text`` with ``embedding_model`` and return the result as a list.

    The model is asked for a NumPy result (``convert_to_numpy=True``), which is
    then converted with ``tolist()`` so it can be handed to ChromaDB.
    """
    encoded = embedding_model.encode(email_text, convert_to_numpy=True)
    return encoded.tolist()

In [88]:
def check_duplicate_email(email_collection, embedding_model, email_text, threshold=0.90):
    """Look for a previously stored email that is near-identical to this one.

    The email is embedded and the two nearest entries of ``email_collection``
    are fetched. The first hit is always skipped, on the assumption that the
    email was stored before this check and therefore matches itself. The
    remaining hit counts as a duplicate when its distance is at most
    ``1 - threshold``.

    NOTE(review): ``1 - distance`` is reported as a similarity, which holds
    only if the collection returns cosine distances — confirm how the
    collection is configured.

    Returns:
        ``(True, "<id> with similarity <s>")`` for a duplicate, otherwise
        ``(False, None)``.
    """
    query_vector = compute_email_embedding(embedding_model, email_text)

    # Nearest-neighbour lookup in ChromaDB
    matches = email_collection.query(
        query_embeddings=[query_vector],
        n_results=2
    )

    ranked_distances = list(enumerate(matches['distances'][0]))
    # Position 0 is treated as the email's own entry and ignored.
    for position, distance in ranked_distances[1:]:
        if distance > (1 - threshold):
            continue  # Not similar enough

        duplicate_file_id = matches["ids"][0][position]
        print(f"✅ Duplicate found: Matches {duplicate_file_id} with similarity {1 - distance:.2f}")
        return True, f"{duplicate_file_id} with similarity {1 - distance:.2f}"

    return False, None  # Not a duplicate

In [69]:
def store_email_vector(embedding_model, email_collection, unique_eml_file_id, email_text):
    """Embed ``email_text`` and add it to ``email_collection`` under the given id.

    The collection receives a list of vectors, so a single flat vector (first
    element is a float) is wrapped in an outer list first. The original text is
    stored alongside it as the ``email_text`` metadata entry.
    """
    vector = compute_email_embedding(embedding_model, email_text)

    is_flat_vector = isinstance(vector[0], float)
    embedding_batch = [vector] if is_flat_vector else vector

    email_collection.add(
        ids=[unique_eml_file_id],
        embeddings=embedding_batch,
        metadatas=[{"email_text": email_text}]
    )

In [70]:
def classify_emails_from_dir(email_dir, additional_rules=None, additional_request_types=None):
    """Classify every ``.eml`` file in ``email_dir`` and extract its metadata.

    For each email the function parses the file, stores its embedding, checks
    for a near-duplicate, classifies it and then extracts the metadata fields
    for the resulting request type.

    Args:
        email_dir: Directory that contains the ``.eml`` files to process.
        additional_rules: Optional extra classification rules for the prompt.
        additional_request_types: Optional extra request types for the prompt.

    Returns:
        A list with one dict per processed email. Duplicates yield
        ``eml_file_name``, ``duplicate_flag`` and ``reason``; other emails
        yield ``eml_file_name``, the classification fields and
        ``metadata_fields``. Emails that cannot be parsed or classified are
        reported on stdout and left out of the list.
    """
    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path="email_vectors_db")
    email_collection = chroma_client.get_or_create_collection(name="emails")

    # Load Sentence Transformer Model for Embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    final_outputs = []  # Final JSON outputs for all emails

    # Iterate the directory passed by the caller (not a notebook-level global),
    # so the function works for any folder and on a fresh kernel.
    for eml_file in os.listdir(email_dir):
        if not eml_file.endswith(".eml"):
            continue

        # Step 1: Process email and extract content
        processed_email = process_eml_file(os.path.join(email_dir, eml_file))
        if processed_email is None:
            # Parsing failed; do not embed or classify the string "None".
            print(f"❌ Error: Could not parse {eml_file}; skipping.")
            continue
        email_text = str(processed_email)

        # Step 2: Store the email embedding first before duplicate check
        # (check_duplicate_email skips the first hit as the email's own entry)
        store_email_vector(embedding_model, email_collection, eml_file, email_text)

        # Step 3: Check for duplicates after storing the embedding
        duplicate_flag, duplicate_id = check_duplicate_email(email_collection, embedding_model, email_text)

        # If it's a duplicate, record it with a reason and move on
        if duplicate_flag:
            final_outputs.append({
                "eml_file_name": eml_file,
                "duplicate_flag": True,
                "reason": f"⚠️ Duplicate Email Detected! (Matches ID: {duplicate_id})"
            })
            continue  # Skip further processing for duplicates

        # Step 4: Classify the email
        classification_raw = get_email_classification(email_text, duplicate_flag, functions, additional_rules, additional_request_types)

        # The model is not guaranteed to answer with a function call, and the
        # arguments are not guaranteed to be valid JSON.
        function_call = getattr(classification_raw, "function_call", None)
        if function_call is None:
            print(f"❌ Error: No structured classification returned for {eml_file}.")
            continue
        try:
            classification_response = json.loads(function_call.arguments)
        except (TypeError, ValueError) as e:
            print(f"❌ Error: Could not parse classification for {eml_file}: {str(e)}")
            continue

        # Extract request type and sub-request type
        request_type = classification_response.get("request_type")
        sub_request_type = classification_response.get("sub_request_type")

        if not request_type or not sub_request_type:
            print("❌ Error: Missing request_type or sub_request_type.")
            continue  # Skip to next email

        # Step 5: Extract metadata fields.
        # Argument order follows get_metadata_fields(email_text, request_type,
        # sub_request_type, functions).
        metadata_raw = get_metadata_fields(email_text, request_type, sub_request_type, functions)

        try:
            if metadata_raw.content is None or str(metadata_raw.content).strip().lower() == "none":
                metadata_response = {}
            else:
                metadata_response = json.loads(metadata_raw.content)
        except Exception as e:
            metadata_response = {"error": f"Failed to parse metadata response: {str(e)}"}

        # Step 6: Construct final JSON output
        final_json_output = {
            "eml_file_name": eml_file,  # First element
            **classification_response,  # Classification elements (request_type, sub_request_type, etc.)
            "metadata_fields": metadata_response  # Metadata as key-value pair
        }

        final_outputs.append(final_json_output)

        # Print final formatted output
        print(json.dumps(final_json_output, indent=4))
    return final_outputs

### For debugging the code without UI

In [81]:
# Debug run without the UI: classify the sample emails and show the results.
EMAIL_DIR = "/content/drive/MyDrive/EmailClassification/emails/"
extracted_detail_json_list = classify_emails_from_dir(EMAIL_DIR)
# NOTE(review): this prints the complete result list, so email-derived content
# is written into the saved notebook output — review before sharing.
print(extracted_detail_json_list)



{
    "eml_file_name": "balance_inquiry_loan_info_request.eml",
    "request_type": "Loan Information",
    "sub_request_type": "Balance Inquiry",
    "duplicate_flag": false,
    "confidence_score": 0.95,
    "reason": "The email primarily focuses on a request for loan information specifically regarding the outstanding balance. The subject and content of the email clearly indicate the intent for a balance inquiry. The attachment further supports the request for specific loan information, reinforcing the classification.",
    "metadata_fields": {
        "account_number": "4455667788",
        "requested_information": "Current outstanding loan balance"
    }
}
{
    "eml_file_name": "amortization_schedule_loan_info_request.eml",
    "request_type": "Loan Information",
    "sub_request_type": "Amortization Schedule",
    "duplicate_flag": false,
    "confidence_score": 0.9,
    "reason": "The email specifically requests loan information related to an amortization schedule, indicating a 



{
    "eml_file_name": "feedback_submission_loan_grievance.eml",
    "request_type": "Loan Grievance",
    "sub_request_type": "Feedback Submission",
    "duplicate_flag": false,
    "confidence_score": 0.95,
    "reason": "The email is primarily focused on submitting a grievance regarding the loan account, specifically related to feedback submission. The content clearly indicates the intention of the customer to provide feedback on customer service satisfaction, making 'Feedback Submission' the most relevant sub-request type under 'Loan Grievance'. The email body explicitly mentions the issue and the requested action, highlighting the primary intent.",
    "metadata_fields": {
        "account_number": "3344556677",
        "issue": "Satisfaction with customer service",
        "requested_action": "No action required"
    }
}




{
    "eml_file_name": "letter_of_credit_fee_payment_updated.eml",
    "request_type": "Fee Payment",
    "sub_request_type": "Letter of Credit Fee",
    "duplicate_flag": false,
    "confidence_score": 0.9,
    "reason": "The primary intent of the email is to request a payment for a specific fee which is the Letter of Credit Fee. The email explicitly states the amount, due date, and account number for processing the payment. The content in the email body clearly indicates the Fee Payment request with high confidence.",
    "metadata_fields": {
        "account_number": "987654321",
        "fee_type": "Letter of Credit Fee",
        "amount": "$12,500.00",
        "due_date": "April 15, 2025"
    }
}
Duplicate found: Email is similar to feedback_submission_loan_grievance.eml with similarity 0.91
{
    "eml_file_name": "dispute_resolution_loan_grievance.eml",
    "request_type": "Loan Grievance",
    "sub_request_type": "Dispute Resolution",
    "duplicate_flag": false,
    "confidence



Duplicate found: Email is similar to ReallocationOfFeesForLoan.eml with similarity 1.00




Duplicate found: Email is similar to sample_email_duplicate.eml with similarity 1.00




Duplicate found: Email is similar to sample_email_duplicate.eml with similarity 1.00
{
    "eml_file_name": "ongoing_fee_payment_updated.eml",
    "request_type": "Fee Payment",
    "sub_request_type": "Ongoing Fee",
    "duplicate_flag": false,
    "confidence_score": 0.9,
    "reason": "The email is a payment request for an ongoing fee payment, clearly indicating the primary intent. The email body specifically mentions processing the payment for an ongoing service fee with details like Account Number, Amount, and Due Date.",
    "metadata_fields": {
        "account_number": "123456789",
        "fee_type": "Ongoing Service Fee",
        "amount": "$5,000.00",
        "due_date": "April 10, 2025"
    }
}
[{'eml_file_name': 'balance_inquiry_loan_info_request.eml', 'request_type': 'Loan Information', 'sub_request_type': 'Balance Inquiry', 'duplicate_flag': False, 'confidence_score': 0.95, 'reason': 'The email primarily focuses on a request for loan information specifically regarding th

## UI code

In [91]:
import time
import os
import re
import gradio as gr

def format_metadata_as_html(metadata_dict):
    """Render a metadata mapping as a two-column HTML table.

    Keys and values are HTML-escaped before being embedded, so markup that
    appears in the extracted metadata is shown as text instead of being
    interpreted by the browser.

    Args:
        metadata_dict: Mapping of field name to field value. Keys and values
            are converted with ``str()`` before rendering.

    Returns:
        ``"-"`` when ``metadata_dict`` is empty, ``None`` or not a ``dict``;
        otherwise a ``<table>`` string with one ``<tr>`` per item, the key in
        bold in the first cell and the value in the second.
    """
    import html  # function-scope import keeps this cell self-contained

    if not metadata_dict or not isinstance(metadata_dict, dict):
        return "-"

    rows = "".join(
        f"<tr><td><b>{html.escape(str(key))}</b></td><td>{html.escape(str(value))}</td></tr>"
        for key, value in metadata_dict.items()
    )

    return (
        "<table border='1' style='border-collapse: collapse; width: 100%; font-size: 12px;'>"
        + rows
        + "</table>"
    )

def parse_request_types(input_text):
    """Parse user-entered request types and their subtypes.

    Entries are separated by newlines or semicolons. Each entry is either
    ``RequestType (Sub1, Sub2, ...)`` or a bare ``RequestType``, for example::

        Adjustment (Fee Adjustment, Principal Adjustment, Interest Adjustment)

    Args:
        input_text: Raw text from the UI. ``None`` or an empty/blank string
            yields an empty list.

    Returns:
        A list of ``{"request_type": str, "sub_request_types": list[str]}``
        dicts in input order. Blank entries and blank subtypes (from ``()``
        or a stray comma) are dropped.
    """
    if not input_text:
        return []

    request_types = []
    # Split input by new lines or semicolons for multiple entries
    for entry in re.split(r'\n|;', input_text.strip()):
        entry = entry.strip()
        if not entry:
            continue

        match = re.match(r"^(.+?)\s*\((.*?)\)$", entry)
        if match:
            request_type = match.group(1).strip()
            # Skip empty pieces so "Type ()" or "Type (A, )" never produce a '' subtype.
            subtypes = [sub.strip() for sub in match.group(2).split(",") if sub.strip()]
        else:
            # If no subtypes provided, assume it's a standalone request type
            request_type, subtypes = entry, []

        request_types.append({"request_type": request_type, "sub_request_types": subtypes})

    return request_types

def process_email_directory(email_dir, additional_rules, request_types_text):
    """Classify the emails in ``email_dir`` and stream the result to the UI as HTML.

    This is a generator: each yielded string replaces the previous content of
    the bound output component.

    Args:
        email_dir: Path of the directory to classify.
        additional_rules: Extra classification rules, forwarded unchanged to
            ``classify_emails_from_dir``.
        request_types_text: Request types in the text format accepted by
            ``parse_request_types``.

    Yields:
        * A single red error paragraph when ``email_dir`` is empty or is not an
          existing directory (nothing else is yielded in that case).
        * Otherwise a blue "processing" paragraph, followed by the results
          table with one row per classified email.
    """
    import html  # function-scope import keeps this cell self-contained

    # Inside a generator a ``return <value>`` is discarded by the consumer, so the
    # error has to be yielded for the UI to display it.
    if not email_dir or not os.path.isdir(email_dir):
        yield "<p style='color:red;'>❌ Directory does not exist!</p>"
        return

    # Parse request types
    parsed_request_types = parse_request_types(request_types_text)

    # Display loading message
    yield "<p style='color:blue; font-size: 16px;'>⏳ Processing emails... Please wait.</p>"

    time.sleep(1)  # Simulating a short delay for better UX

    # Call the actual function to classify emails
    result_json = classify_emails_from_dir(email_dir, additional_rules, parsed_request_types)

    def as_text(value):
        """Escape a classifier-provided value for safe embedding in a table cell."""
        return html.escape(str(value))

    # Convert JSON list into an HTML table
    table_html = "<table border='1' style='border-collapse: collapse; width: 100%;'>"
    table_html += "<tr><th>S.No</th><th>Email File</th><th>Request Type</th><th>Sub Request Type</th><th>Duplicate?</th><th>Confidence</th><th>Reason</th><th>Metadata Fields</th></tr>"

    for index, entry in enumerate(result_json, start=1):
        # Already-built HTML sub-table; must not be escaped again.
        metadata_html = format_metadata_as_html(entry.get("metadata_fields", {}))

        # A score of 0 is a valid value and is shown as 0.00; only a missing or
        # non-numeric score falls back to "-".
        score = entry.get("confidence_score")
        try:
            confidence_text = "-" if score is None else f"{float(score):.2f}"
        except (TypeError, ValueError):
            confidence_text = "-"

        duplicate_mark = "✅" if entry.get("duplicate_flag") else "❌"

        table_html += f"""
            <tr>
                <td>{index}</td>
                <td>{as_text(entry.get("eml_file_name", ""))}</td>
                <td>{as_text(entry.get("request_type", "-"))}</td>
                <td>{as_text(entry.get("sub_request_type", "-"))}</td>
                <td>{duplicate_mark}</td>
                <td>{confidence_text}</td>
                <td>{as_text(entry.get("reason", "-"))}</td>
                <td>{metadata_html}</td>
            </tr>
        """

    table_html += "</table>"

    yield table_html  # Update with final results

# Gradio UI Setup
# Builds a single-page Blocks app: three text inputs, one button and one HTML output.
with gr.Blocks() as demo:
    # Page title and short instructions.
    gr.Markdown("## 📩 AI-Powered Email Classification & Metadata Extraction")
    gr.Markdown("### Enter Email Directory Path and Provide Additional Classification Rules")

    # Both values are passed as-is to process_email_directory (first and second argument).
    email_dir_input = gr.Textbox(label="Email Directory Path", placeholder="Enter the path to email directory")
    additional_rules_input = gr.Textbox(label="Additional Classification Rules", placeholder="Enter any additional rules for classification (optional)")

    # Multi-line box; the text is parsed by parse_request_types inside process_email_directory.
    request_types_input = gr.Textbox(
        label="Request Types & Subtypes(Format: RequestType (SubRequestType1, SubRequestType2))",
        placeholder="Enter request types and subtypes in the specified format, one per line",
        lines=4
    )

    classify_button = gr.Button("Classify Emails")
    # Receives the HTML strings yielded by process_email_directory
    # (loading message first, then the results table).
    output_html = gr.HTML()

    # The order of `inputs` must match the parameter order of process_email_directory.
    classify_button.click(
        process_email_directory,
        inputs=[email_dir_input, additional_rules_input, request_types_input],
        outputs=output_html
    )

# Launch the Gradio Interface
# NOTE(review): launched with default arguments; per the cell output, Colab turns on
# share=True automatically and suggests debug=True to surface errors.
demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://734f08727918424625.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## For testing only: use the code below to clean up the collection in the vector DB

In [89]:
# Open the persistent Chroma store at the relative path "email_vectors_db" and drop
# its 'emails' collection so the next run starts without stored vectors.
# NOTE(review): presumably 'emails' is the collection used for duplicate detection — confirm.
# NOTE(review): delete_collection likely raises when the collection is absent (e.g. when
# this cell is run twice in a row) — confirm against the installed chromadb version.
chroma_client = chromadb.PersistentClient(path="email_vectors_db")
chroma_client.delete_collection(name='emails')

In [90]:
import os
import shutil

def delete_all_files_in_folder(folder_path):
    """Empty a folder, keeping the folder itself.

    Every entry directly inside ``folder_path`` is removed: regular files,
    symbolic links (the link only, never its target) and subdirectories with
    all of their contents.

    Args:
        folder_path: Path of the folder to empty.

    Returns:
        None. If ``folder_path`` is not an existing directory a message is
        printed and nothing is deleted. A failure to delete one entry is
        printed and the remaining entries are still processed.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' does not exist.")
        return

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            # Symlinks are checked first: shutil.rmtree refuses to operate on a
            # symlink, and a broken symlink is neither a file nor a directory,
            # so both would otherwise be left behind.
            if os.path.islink(file_path) or not os.path.isdir(file_path):
                os.remove(file_path)  # Delete file or link
            else:
                shutil.rmtree(file_path)  # Delete subdirectories
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")

# Example usage:
# Pass the folder declared right here so it is obvious which path gets emptied;
# the call previously used a different name that this cell never defines.
folder_to_clean = "/content/drive/MyDrive/EmailClassification/emails/"
delete_all_files_in_folder(folder_to_clean)