In [15]:
!pip install fastapi pydantic pypdf pymupdf python-docx unstructured google-generativeai elasticsearch opencv-python numpy pytesseract pdf2image uvicorn nest-asyncio

import os
import pytesseract
import fitz  # PyMuPDF for PDF processing
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
from unstructured.partition.email import partition_email
from elasticsearch import Elasticsearch
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import google.generativeai as genai
import uvicorn
import nest_asyncio


genai.configure(api_key="")

app = FastAPI()

class EmailRequest(BaseModel):
    file_path: str
    sender: str
    subject: str
    predefined_categories: dict = None
    priority_rules: dict = None

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_eml(eml_path):
    elements = partition_email(filename=eml_path)
    return "\n".join(str(element) for element in elements)

def extract_text_with_ocr(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image) + "\n"
    return text.strip()

def extract_email_content(file_path):
    ext = file_path.split(".")[-1].lower()
    if ext == "pdf":
        return extract_text_from_pdf(file_path) + "\n" + extract_text_with_ocr(file_path)
    elif ext == "docx":
        return extract_text_from_docx(file_path)
    elif ext == "eml":
        return extract_text_from_eml(file_path)
    return ""

# Classification using Google Gemini API
def classify_with_gemini(email_text, sender, subject, priority_rules, predefined_categories):
    model = genai.GenerativeModel("gemini-pro")

    prompt = f"""
    Classify the following email into predefined categories:

    Email:
    Subject: {subject}
    Sender: {sender}
    Content: {email_text}

    Categories:
    {predefined_categories}

    Priority Rules:
    {priority_rules}

    Return a JSON with `primary_category`, `categories`, and `confidence_score`.
    """

    response = model.generate_content(prompt)
    return response.text  # Ensure correct JSON parsing


@app.post("/classify_email")
async def classify_email(request: EmailRequest):
    try:
        file_path = request.file_path
        sender = request.sender
        subject = request.subject

        if not file_path or not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="File not found")

        # Extract email text
        email_text = extract_email_content(file_path)

        # Default categories if not provided
        predefined_categories = request.predefined_categories or {
        }

        # Default priority rules if not provided
        priority_rules = request.priority_rules or {
            "content_weightage": 0.7,
            "attachment_weightage": 0.3,
            "keywords_priority": {}
        }

        # Classify email
        classification = classify_with_gemini(email_text, sender, subject, priority_rules, predefined_categories)

        response = {
            "classification": classification,
            "primary_category": classification.get("primary_category")
        }

        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))



ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-10' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=SystemExit(1)>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 163, in startup
    server = await loop.create_server(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/base_events.py", line 1536, in create_server
    raise OSError(err.errno, msg) from None
OSError: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8001): address already in use

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-14-9918d5e3f5a5>", line 162, in ru



INFO:     Started server process [377]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)


https://muyr3qi6pk7-496ff2e9c6d22116-8002-colab.googleusercontent.com/


In [16]:
import requests

url = "https://muyr3qi6pk7-496ff2e9c6d22116-8002-colab.googleusercontent.com/classify_email"
data = {
    "file_path": "sample_email.pdf",
    "sender": "test@example.com",
    "subject": "Billing Issue"
}

response = requests.post(url, json=data)
print(response.json())


JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [12]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(8001)"))


https://tw3vmdam0t-496ff2e9c6d22116-8001-colab.googleusercontent.com/
