# Email & document classification for triage

## Email Classification
- Uses GPT to classify emails into Request Type and Sub Request Type.
- Returns a confidence score for classification.

In [None]:
!pip install openai langchain_openai pdfplumber pytesseract pdf2image python-docx fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=5acb63e4c43d8465d9dcad147dbaafe3e6fafa3f5d3756c249c45ee1a278f11b
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


## Import the Required Libraries

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

import os
import openai
import json

import email
import pdfplumber
import pytesseract
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
from io import BytesIO


## Mount Google Drive

In [None]:
# Mount Google Drive
# Later cells read the API key file and the sample email from paths under
# /content/drive/MyDrive, so the mount must succeed before they run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Set Up the API Key for the OpenAI API

In [None]:
# Set the API key
folder_path = "/content/drive/MyDrive/EmailClassification/"

# Read the text file containing the API key.
# The surrounding whitespace is stripped because text files normally end with a
# newline, and a key containing "\n" is not a valid credential.
with open(os.path.join(folder_path, 'OpenAI_API_Key.txt'), 'r') as f:
    openai.api_key = f.read().strip()

# Update the OpenAI API key by updating the environment variable
# (clients that read OPENAI_API_KEY from the environment pick it up from here).
os.environ["OPENAI_API_KEY"] = openai.api_key

### Extract text from email (EML format) with attachments

In [None]:
def extract_email_text(eml_file_path):
    """Parse an .eml file and return its subject and body text.

    For multipart messages the first non-attachment ``text/plain`` part is used
    as the body. Only when no such part exists is the first non-attachment
    ``text/html`` part used, reduced to its visible text with BeautifulSoup.
    Parts marked as attachments are ignored here so that a ``.txt`` or ``.html``
    attachment cannot replace the real body.

    Args:
        eml_file_path: Path to the .eml file to read.

    Returns:
        A ``(subject, body)`` tuple. ``subject`` is the value of the Subject
        header (``None`` when absent); ``body`` is a string, or ``None`` when
        no text body could be found.
    """
    with open(eml_file_path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    subject = msg["subject"]

    # Single-part message: the payload itself is the body.
    if not msg.is_multipart():
        payload = msg.get_payload(decode=True)
        body = payload.decode(errors="ignore") if payload is not None else None
        return subject, body

    plain_body = None
    html_payload = None
    for part in msg.walk():
        disposition = part.get("Content-Disposition")
        if disposition and "attachment" in disposition:
            continue

        content_type = part.get_content_type()
        if content_type not in ("text/plain", "text/html"):
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        if content_type == "text/plain":
            if plain_body is None:
                plain_body = payload.decode(errors="ignore")
        elif html_payload is None:
            # Keep the raw HTML; it is only parsed if no plain-text part turns up.
            html_payload = payload

    if plain_body is not None:
        return subject, plain_body
    if html_payload is not None:
        return subject, BeautifulSoup(html_payload, "html.parser").get_text()
    return subject, None

# Function to extract text from PDF attachments
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The page texts joined with newlines, with leading and trailing
        whitespace removed. Pages that yield no text contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page without a text layer
        # (e.g. a scanned image) in some pdfplumber versions; treat that as "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts).strip()

# Function to extract text from images using OCR
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path to the image file.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with leading and
        trailing whitespace removed.
    """
    # The context manager closes the underlying file once OCR is done, so
    # processing many attachments does not accumulate open file handles.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()

# Function to extract text from DOCX files
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file's paragraphs, joined with newlines.

    Only the entries of ``Document(...).paragraphs`` are read.
    """
    document = Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# Function to process email and attachments
def process_email_with_attachments(eml_file_path, attachment_dir="attachments"):
    """Extract the subject, body and attachment text from an .eml file.

    Every part whose Content-Disposition header contains "attachment" is saved
    into ``attachment_dir``. PDF, DOCX and PNG/JPG/JPEG attachments are then
    converted to text; other file types are saved but contribute no text.

    Args:
        eml_file_path: Path to the .eml file.
        attachment_dir: Directory the attachments are written to; created if
            it does not exist.

    Returns:
        A dict with keys ``"subject"``, ``"body"`` and ``"attachments_text"``
        (a list with one string per supported attachment, in message order).
    """
    subject, body = extract_email_text(eml_file_path)

    # Create a directory to store attachments if it doesn't exist
    os.makedirs(attachment_dir, exist_ok=True)

    # Extract attachments
    with open(eml_file_path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    attachment_texts = []
    for part in msg.walk():
        content_disposition = part.get("Content-Disposition")
        if not (content_disposition and "attachment" in content_disposition):
            continue

        raw_name = part.get_filename()
        payload = part.get_payload(decode=True)
        # An attachment without a filename or without decodable content cannot
        # be saved or dispatched by extension, so it is skipped.
        if not raw_name or payload is None:
            continue

        # The filename comes from the sender: keep only its final component so
        # values like "../../x" or "C:\\x" cannot write outside attachment_dir.
        filename = os.path.basename(raw_name.replace("\\", "/"))
        if filename in ("", ".", ".."):
            continue
        attachment_path = os.path.join(attachment_dir, filename)

        with open(attachment_path, "wb") as file:
            file.write(payload)

        # Process attachments based on file type (case-insensitive, so
        # "REPORT.PDF" is handled like "report.pdf").
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            attachment_texts.append(extract_text_from_pdf(attachment_path))
        elif lowered.endswith(".docx"):
            attachment_texts.append(extract_text_from_docx(attachment_path))
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            attachment_texts.append(extract_text_from_image(attachment_path))

    return {
        "subject": subject,
        "body": body,
        "attachments_text": attachment_texts
    }

### Function Calling API to Extract Request and Sub-Request Types in JSON Format
Use the Function Calling API with a function schema so that the model returns the classification directly in the defined JSON format.

In [None]:
# Function-calling schema: both classification fields are plain strings and
# both are required in the model's answer.
_classification_fields = {
    "request_type": {
        "type": "string",
        "description": "The high-level category of the request",
    },
    "sub_request_type": {
        "type": "string",
        "description": "The specific sub-category under the request type",
    },
}

functions = [
    {
        "name": "classify_email",
        "description": "Classifies the email into a request type and sub-request type",
        "parameters": {
            "type": "object",
            "properties": _classification_fields,
            # Every declared field is required, in declaration order.
            "required": list(_classification_fields),
        },
    }
]


In [None]:
def moderate_content(content):
    """Return True when the moderation endpoint does not flag ``content``.

    Sends ``content`` to ``openai.moderations.create`` and inspects the first
    result; a flagged result yields False, anything else yields True.
    """
    first_result = openai.moderations.create(input=content).results[0]
    return not first_result.flagged

In [None]:
def get_chat_completions(user_input, functions):
    """Send the conversation to gpt-4-turbo and return the first reply message.

    Args:
        user_input: List of chat messages passed as ``messages``.
        functions: Function schemas offered to the model; the model decides
            whether to call one (``function_call="auto"``).

    Returns:
        The ``message`` object of the first choice in the response.
    """
    request = {
        "model": "gpt-4-turbo",
        "messages": user_input,
        "functions": functions,
        "function_call": "auto",
    }
    completion = openai.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message

In [None]:
def initialize_conversation(email_text):
    """Build the two-message chat prompt used to classify an email.

    Args:
        email_text: The email content; it is interpolated into the prompt with
            an f-string, so any object is rendered through ``str()``.

    Returns:
        A list of two message dicts: a system message setting the expert role
        and a user message holding the category list and the email.
    """
    request_catalog = """
    Request Types:
    - Adjustment: (Subtypes: Closing Notice, Commitment Change, Fee Payment, Reallocation Fees, Amendment Fees, Reallocation Principal)
    - AU Transfer: (Subtypes: Cashless Roll, Decrease, Increase, Ongoing Fee, Letter of Credit Fee)
    - Money Movement-Inbound: (Subtypes: Principal, Interest, Principal + Interest, Principal+Interest+Fee)
    - Money Movement-Outbound: (Subtypes: Timebound, Foreign Currency)
    """

    user_prompt = f"""
    Below are the predefined request categories:

    {request_catalog}

    Please analyze the following email and classify it into the most relevant **Request Type** and **Sub Request Type**:

    Email:
    "{email_text}"

    Ensure the classification follows the provided categories. If the email does not match any, return "Unknown".
    """

    system_message = {
        "role": "system",
        "content": "You are a subject matter expert in Commercial Bank Lending Service teams who classifies emails based on predefined request types and sub-request types.",
    }
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

In [None]:
# Parse the sample email stored on Drive and pull the text out of its body and
# attachments.
eml_file_path = "/content/drive/MyDrive/EmailClassification/sample_email.eml"
processed_email = process_email_with_attachments(eml_file_path)
# processed_email is a dict with "subject", "body" and "attachments_text" keys.
print(processed_email)

{'subject': 'Facility Lender Share Adjustment', 'body': 'Citizens Bank, N.A.\nLoan Agency Services\n\nDate: 05-Feb-2025\n\nTO: WELLS FARGO BANK, NATIONAL ASSOCIATION\nATTN: RAMAKRISHNA KUNCHALA\nFax: 877-606-9426\n\nRe: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2\n\nDescription: Facility Lender Share Adjustment\nBORROWER: ABTB MID-ATLANTIC LLC\nDEAL NAME: ABB MID-ATLANTIC LLC $171.3MM 11-4-2022\n\nEffective 04-Feb-2025, the Lender Shares of facility TERM LOAN A-2 have been adjusted.\nYour share of the commitment was USD 5,518,249.19. It has been Increased to USD 5,542,963.55.\n\nFor: WELLS FARGO BANK, NA\nReference: ABIB MID-ATLANTIC LIC $171.3MM 11-4-2022\n\nIf you have any questions, please call the undersigned.\n\n** COMMENT *\nPLEASE FUND YOUR SHARE OF $24,714.36\n\nBank Name: Citizens Bank NA\nABA # 011500120\nAccount #: 0026693011\nAccount Name: LIQ CLO Operating Account\nRef: ABTB Mid-Atlantic LLC\n\nRegards,\nSCOTT WALLACE\n', 'attachments_text': ['BANK OF AMERICA\n

In [None]:
# Build the prompt from the extracted email and ask the model to classify it.
conversation = initialize_conversation(processed_email)
email_classification = get_chat_completions(conversation, functions)

# The request uses function_call="auto", so the model may answer in plain text
# instead of calling classify_email; in that case function_call is None.
if email_classification.function_call is None:
    raise RuntimeError(
        "Model replied without calling classify_email: "
        f"{email_classification.content!r}"
    )

# JSON string holding the request_type / sub_request_type arguments.
email_classification_json = email_classification.function_call.arguments
print(email_classification_json)

{"request_type":"AU Transfer","sub_request_type":"Increase"}


## Only for Testing - use the code below to generate an .eml file with an attachment

In [None]:
import smtplib
import email
import pdfplumber
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from fpdf import FPDF

# Step 1: Create the PDF Attachment
# Sample notice text used as the attachment content; it is written into the
# PDF line by line further down.
pdf_content = """\
BANK OF AMERICA
Bank of America, N.A.

To: WELLS FARGO BANK NATIONAL ASSOCIATION
Date: 8-Nov-2023
ATTN:
Phone: 999-999-9999
Fax: 877-606-9426
Email: DENLCFX@wellsfargo.com

Re: CANTOR FITZGERALD LP USD 425MM MAR22 / REVOLVER / CANTOR FIT00037

Deal CUSIP: 13861EAE0
Deal ISIN: US13861EAE05
Facility CUSIP: 13861EAF7
Facility ISIN: US13861EAF79
Lender MEI: US1L058422

Effective 10-Nov-2023, CANTOR FITZGERALD LP has elected to repay under the SOFR (US) Term option, a total of USD 20,000,000.00.
Previous Global principal balance: USD 45,000,000.00
New Global principal balance: USD 25,000,000.00

Your share of the USD 20,000,000.00 SOFR (US) Term option payment is USD 1,411,764.71.
Previous Lender Share Principal Balance: USD 3,176,470.59
New Lender Share Principal Balance: USD 1,764,705.88

We will remit USD 1,411,764.71 on the effective date. Please note that:
(i) if the Borrower has not in fact made such payment; or
(ii) any payment you receive is in excess of what was paid by the Borrower or
(iii) we notify you that the payment was erroneously made, then pursuant to the provisions of the credit facility, you agree to return such payment.

For: WELLS FARGO BANK NATIONAL ASSOCIATION
To: WELLS FARGO BANK, NA
ABA Number: 121000248
Account No: XXXXXXXXXX0720
Reference: CANTOR FITZGERALD LP USD 425MM MAR22, SOFR (US) Term Principal Payment (CANTOR FIT00037)

Thanks & Regards,
JONNY HERNANDEZ
Telephone #: +19803883225
Email id: jonny.hernandez@bofa.com
"""

# Create PDF
# The file name is relative, so the PDF lands in the current working directory.
pdf_filename = "attachment.pdf"
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", size=12)

# One cell per line of pdf_content. Long lines are not wrapped here; each line
# is passed to a single cell() call as-is.
for line in pdf_content.split("\n"):
    pdf.cell(200, 10, txt=line, ln=True)

# Write the PDF to disk; it is reopened below to be attached to the email.
pdf.output(pdf_filename)

# Step 2: Create the Email Message in .eml Format
# A multipart container so the message can carry both the text body and the PDF.
msg = MIMEMultipart()
msg["From"] = "scott.wallace@citizensbank.com"
msg["To"] = "ramakrishna.kunchala@wellsfargo.com"
msg["Subject"] = "Facility Lender Share Adjustment"

# Sample body text for the test email; attached as the plain-text part below.
email_body = """\
Citizens Bank, N.A.
Loan Agency Services

Date: 05-Feb-2025

TO: WELLS FARGO BANK, NATIONAL ASSOCIATION
ATTN: RAMAKRISHNA KUNCHALA
Fax: 877-606-9426

Re: ABTB MID-ATLANTIC LLC $171.3MM 11-4-2022, TERM LOAN A-2

Description: Facility Lender Share Adjustment
BORROWER: ABTB MID-ATLANTIC LLC
DEAL NAME: ABB MID-ATLANTIC LLC $171.3MM 11-4-2022

Effective 04-Feb-2025, the Lender Shares of facility TERM LOAN A-2 have been adjusted.
Your share of the commitment was USD 5,518,249.19. It has been Increased to USD 5,542,963.55.

For: WELLS FARGO BANK, NA
Reference: ABIB MID-ATLANTIC LIC $171.3MM 11-4-2022

If you have any questions, please call the undersigned.

** COMMENT *
PLEASE FUND YOUR SHARE OF $24,714.36

Bank Name: Citizens Bank NA
ABA # 011500120
Account #: 0026693011
Account Name: LIQ CLO Operating Account
Ref: ABTB Mid-Atlantic LLC

Regards,
SCOTT WALLACE
"""

# Attach the plain-text body as the first part of the message
msg.attach(MIMEText(email_body, "plain"))

# Attach PDF
with open(pdf_filename, "rb") as attachment:
    part = MIMEBase("application", "octet-stream")
    part.set_payload(attachment.read())

# Encode attachment
encoders.encode_base64(part)
# Passing the filename as a keyword lets the email package quote/encode it, so
# names containing spaces or non-ASCII characters still yield a valid header.
part.add_header("Content-Disposition", "attachment", filename=pdf_filename)
msg.attach(part)

# Save as .eml file
# Written as bytes because the extraction code reads .eml files in binary mode;
# this also avoids platform-dependent newline and encoding translation.
# NOTE(review): the file is written to the current working directory, while the
# classification cell reads sample_email.eml from the Drive folder - confirm
# the file is copied there.
eml_filename = "sample_email.eml"
with open(eml_filename, "wb") as eml_file:
    eml_file.write(msg.as_bytes())

print(f"EML file '{eml_filename}' has been generated successfully.")


ModuleNotFoundError: No module named 'pdfplumber'