## Import and setup

In [1]:
import os, json
import PyPDF2
from langchain_xai import ChatXAI
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Set up the chat LLM (xAI Grok) used by the classification and agent cells below.
# NOTE(review): no API key is passed here — presumably ChatXAI picks it up from the
# environment populated by load_dotenv() above; confirm.
llm = ChatXAI(model="grok-4-1-fast-reasoning")

## Extracting text from PDF and classifying 

In [3]:
# Helpers to read the raw text of a PDF and parse it into email components
def load_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*.

    The text extracted from each page is followed by a newline, so the
    result is the empty string for a document without pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Pages are consumed while the file is still open.
        return ''.join(page.extract_text() + '\n' for page in pages)


def extract_email_data(
    email_text: str,
    signature_markers: tuple = ('Best regards,', 'Thanks,'),
) -> dict:
    """
    Intake: parse raw email text (e.g. extracted from a PDF) into components.
    Assumes simple Gmail-style format as in samples.

    Args:
        email_text: Full text of the email, one header or body line per line.
        signature_markers: Substrings that mark the start of the signature;
            the body stops before the first line containing any of them.

    Returns:
        dict with keys 'sender', 'subject' and 'body'. A missing "From:" line
        yields sender 'Unknown'; a missing "Subject:" line yields subject
        'No Subject' and the body then starts at the second non-blank line.
    """
    # Clean and parse (heuristic based on sample format): blank lines are dropped.
    lines = [line.strip() for line in email_text.split('\n') if line.strip()]

    def _header_index(prefix: str):
        # Position of the first line starting with *prefix*, or None.
        return next(
            (i for i, line in enumerate(lines) if line.startswith(prefix)),
            None,
        )

    def _header_value(index, prefix: str, default: str) -> str:
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence inside the value (e.g. "Re: Subject: ...").
        if index is None:
            return default
        return lines[index][len(prefix):].strip()

    sender = _header_value(_header_index('From:'), 'From:', 'Unknown')

    subject_index = _header_index('Subject:')
    subject = _header_value(subject_index, 'Subject:', 'No Subject')

    # Body: everything after the Subject line up to the signature.
    body_start = subject_index + 1 if subject_index is not None else 1
    body_lines = lines[body_start:]
    body_end = next(
        (
            i
            for i, line in enumerate(body_lines)
            if any(marker in line for marker in signature_markers)
        ),
        len(body_lines),
    )
    body = '\n'.join(body_lines[:body_end]).strip()

    return {
        'sender': sender,
        'subject': subject,
        'body': body,
    }


# Email classification function
def _strip_code_fence(content: str) -> str:
    """Return *content* without a surrounding Markdown code fence, if present."""
    content = content.strip()
    if content.startswith('```'):
        content = content[3:]
        # Drop an optional "json" language tag right after the opening fence.
        if content[:4].lower() == 'json':
            content = content[4:]
    if content.endswith('```'):
        content = content[:-3]
    return content.strip()


def classify_email(email_data: dict) -> dict:
    """
    Processing: Classify urgency (P1/P2/P3) and topic/domain using the chat LLM (`llm`).
    Domains examples: billing, support, outage, product, etc.

    Args:
        email_data: dict providing at least the 'subject' and 'body' keys.

    Returns:
        The JSON object parsed from the model reply (the prompt asks for the
        keys "urgency", "topic" and "keywords"), or, when the reply is not a
        JSON object, {"error": ..., "raw_response": ...}.
    """
    prompt_template = """
        Analyze this email:
        Subject: {subject}
        Body: {body}
        
        Classify:
        - Urgency: P1 (critical, immediate impact), P2 (high, affects operations), P3 (low, informational)
        - Topic: One main domain (e.g., billing, platform outage, support, product issue)
        - Keywords: 3-5 key entities/phrases
        
        Output as JSON: {{"urgency": "P1", "topic": "billing", "keywords": ["invoice", "delay", "cash flow"]}}
    """
    prompt = PromptTemplate.from_template(prompt_template)
    chain = prompt | llm

    response = chain.invoke({
        "subject": email_data['subject'],
        "body": email_data['body']
    })

    failure = {
        "error": "Failed to parse JSON response",
        "raw_response": response.content
    }

    # The model may wrap its answer in a Markdown fence, with or without a
    # "json" tag; strip either form before parsing.
    content = _strip_code_fence(response.content)

    try:
        classification = json.loads(content)
    except json.JSONDecodeError:
        return failure

    # Valid JSON that is not an object (a list, a bare string, ...) is still
    # unusable for callers that index the result by key.
    if not isinstance(classification, dict):
        return failure
    return classification

In [4]:
# Smoke test: run one sample email through load -> parse -> classify.
email_test = load_pdf('../backend/test_data/gmail_style_email_2.pdf')
email_data = extract_email_data(email_test)
email_clf = classify_email(email_data)

# Bare expression so the notebook displays the classification result.
email_clf

{'urgency': 'P1',
 'topic': 'platform outage',
 'keywords': ['9:30 AM EST',
  'two hours',
  'critical dashboards',
  'outage cause']}

## Storing email data in Milvus and a relational DB (SQLite for prototyping, in place of Postgres)

In [5]:
import uuid
import json
from sqlalchemy import create_engine, text
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

In [6]:
# Relational store: a file-backed SQLite database keeps prototyping fast.
SQLITE_URI = "sqlite:///emails.db"  # File-based DB, creates 'emails.db' if not exists
engine = create_engine(SQLITE_URI)

# DDL for the single `emails` table; keywords are kept as a JSON string.
create_emails_table = text("""
        CREATE TABLE IF NOT EXISTS emails (
            id TEXT PRIMARY KEY,
            sender TEXT,
            subject TEXT,
            body TEXT,
            urgency TEXT,
            topic TEXT,
            keywords TEXT  -- Stored as JSON string
        )
    """)

# Safe to re-run: the statement only creates the table when it is missing.
with engine.connect() as conn:
    conn.execute(create_emails_table)
    conn.commit()


def store_relational_data(email_id, keywords_json, email_data, classification):
    """Insert one classified email as a row of the `emails` table.

    Args:
        email_id: Value for the `id` primary-key column.
        keywords_json: Keywords already serialized to a JSON string.
        email_data: dict providing 'sender', 'subject' and 'body'.
        classification: dict providing 'urgency' and 'topic'.
    """
    with engine.connect() as connection:
        insert_email = text("""
            INSERT INTO emails (id, sender, subject, body, urgency, topic, keywords)
            VALUES (:id, :sender, :subject, :body, :urgency, :topic, :keywords)
        """)
        # Bound parameters, one per named placeholder in the statement.
        row = dict(
            id=email_id,
            sender=email_data['sender'],
            subject=email_data['subject'],
            body=email_data['body'],
            urgency=classification['urgency'],
            topic=classification['topic'],
            keywords=keywords_json,
        )
        connection.execute(insert_email, row)
        connection.commit()

In [7]:
# VectorDB connection (file-backed Milvus database).
# The location can be overridden with the MILVUS_URI environment variable so the
# notebook is not tied to one machine's directory layout; the default is the
# path used so far.
MILVUS_URI = os.getenv(
    "MILVUS_URI",
    "/Users/egorfolley/TechDev/customer_support_agent/backend/vectorDB/test_vecDB.db",
)

# Embedding model and the vector size declared for the collection.
# NOTE(review): EMBEDDING_DIM has to equal the model's output size (384 is the
# documented size for all-MiniLM-L6-v2) — confirm whenever the model changes.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIM = 384

vector_store = MilvusVectorStore(
    uri=MILVUS_URI,
    collection_name="email_vectors",
    dim=EMBEDDING_DIM,
    embedding_field="embedding",
    overwrite=False,  # use True only if recreating
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

embed_model = HuggingFaceEmbedding(
    model_name=EMBEDDING_MODEL_NAME
)

# Index handle over whatever the collection already contains; the retrieval
# tool further down queries through it.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
    storage_context=storage_context,
)

  from pkg_resources import DistributionNotFound, get_distribution


In [8]:
def store_email(email_data, classification):
    """
    Store metadata + keywords + urgency + topic in VectorDB (Milvus) and Relational DB (SQLite).
    - SQLite: Raw email, structured metadata (urgency, topic, keywords as JSON string).
    - Milvus: Embeddings of summary/metadata for RAG retrieval.

    Args:
        email_data: dict providing 'sender', 'subject' and 'body'.
        classification: dict providing 'urgency', 'topic' and 'keywords', or an
            error dict (containing an 'error' key) from a failed classification.

    Returns:
        On success: {"email_id": ..., "stored": True, "storage_context": ...}.
        On failure: a dict with "error" and "details" keys (plus "missing" when
        required classification fields are absent); nothing is written.
    """
    # Check if classification failed
    if 'error' in classification:
        return {"error": "Classification failed", "details": classification}

    # The classification comes from an LLM, so required fields may be absent;
    # report that in the same error shape instead of raising KeyError midway.
    missing = [
        key for key in ('urgency', 'topic', 'keywords')
        if key not in classification
    ]
    if missing:
        return {
            "error": "Classification missing required fields",
            "details": classification,
            "missing": missing,
        }

    # Generate unique ID shared by the relational row and the vector document
    email_id = str(uuid.uuid4())

    # Store relational data through the shared helper (keywords as JSON string)
    keywords_json = json.dumps(classification['keywords'])
    store_relational_data(email_id, keywords_json, email_data, classification)

    # Create a summary for embedding (title + topic + urgency + key parts).
    # Keywords are coerced to str so a non-string entry cannot break the join.
    keywords_text = ', '.join(str(keyword) for keyword in classification['keywords'])
    summary = (
        f"Subject: {email_data['subject']}\n"
        f"Urgency: {classification['urgency']}\n"
        f"Topic: {classification['topic']}\n"
        f"Keywords: {keywords_text}\n"
        f"Body Snippet: {email_data['body'][:200]}..."
    )

    # Store in Milvus (vector) using LlamaIndex for embedding and indexing
    doc = Document(
        text=summary,
        metadata={
            "email_id": email_id,
            "urgency": classification['urgency'],
            "topic": classification['topic']
        },
        id_=email_id
    )

    # Embed and write the document into the shared vector store. The index
    # object returned here is not needed afterwards, so it is not kept.
    VectorStoreIndex.from_documents(
        [doc],
        storage_context=storage_context,
        embed_model=embed_model  # the module-level embedding model
    )

    return {"email_id": email_id, "stored": True, "storage_context": storage_context}

In [9]:
# Email -> data -> classification -> NOW store
# Persists the sample email parsed and classified above; the returned dict is
# kept in `email_store` for inspection.
email_store = store_email(email_data=email_data, classification=email_clf)

I0000 00:00:1768798915.325581  477414 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


## Summarization and RAG Extraction

In [10]:
from langchain.tools import tool
from langgraph.graph import StateGraph, START, END
from typing import TypedDict

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langgraph.prebuilt import create_react_agent

In [11]:
# Tools
@tool
def rag_retrieval(query: str) -> str:
    """
    Retrieve similar historical emails from the vector database using semantic search.
    
    Args:
        query (str): The search query based on topic, urgency, keywords.
    
    Returns:
        str: Concatenated text of top 5 similar cases.
    """
    # NOTE(review): the docstring is kept verbatim because `@tool` is expected to
    # surface it to the model as the tool description — confirm before rewording.

    # Request five hits explicitly: the retriever's default top-k is smaller, so
    # slicing with [:5] alone could never deliver the five cases promised above.
    retriever = index.as_retriever(similarity_top_k=5)
    retrieved_docs = retriever.retrieve(query)
    similar_cases = "\n".join(doc.text for doc in retrieved_docs[:5])
    return similar_cases if similar_cases else "No similar cases found."

@tool
def draft_support_response(email_data: dict, classification: dict, diagnostic: dict) -> str:
    """
    Draft a professional customer support response based on email data, classification, and diagnostic.
    
    Args:
        email_data (dict): Parsed email data.
        classification (dict): Classification result.
        diagnostic (dict): Diagnostic output.
    
    Returns:
        str: Drafted response text.
    """
    # NOTE(review): the docstring is kept verbatim because `@tool` is expected to
    # surface it to the model as the tool description — confirm before rewording.

    # Pull the required fields out first; a missing key raises KeyError here,
    # before the LLM is called.
    subject = email_data['subject']
    body = email_data['body']
    urgency = classification['urgency']
    topic = classification['topic']
    problem_statement = diagnostic['problem_statement']

    drafting_prompt = f"""
    Based on the following, draft a professional customer support response.
    
    Email Subject: {subject}
    Email Body: {body}
    Urgency: {urgency}
    Topic: {topic}
    Problem Statement: {problem_statement}
    
    Acknowledge the issue, provide next steps, and include a title.
    """
    return llm.invoke(drafting_prompt).content.strip()

@tool
def search_past_similar_tickets(topic: str, urgency: str) -> str:
    """
    Search the relational database for past similar tickets based on topic and urgency.
    
    Args:
        topic (str): The topic of the ticket.
        urgency (str): The urgency level.
    
    Returns:
        str: List of similar tickets as JSON strings.
    """
    # NOTE(review): the docstring is kept verbatim because `@tool` is expected to
    # surface it to the model as the tool description — confirm before rewording.
    with engine.connect() as conn:
        result = conn.execute(text("""
            SELECT id, sender, subject, body, keywords FROM emails
            WHERE topic = :topic AND urgency = :urgency
            LIMIT 5
        """), {'topic': topic, 'urgency': urgency})
        # Rows are tuple-like in current SQLAlchemy, so dict(row) fails as soon
        # as a ticket matches; .mappings() yields column-name -> value mappings.
        tickets = [dict(row) for row in result.mappings()]
    # Exact match on both columns; at most five tickets are returned.
    return json.dumps(tickets) if tickets else "No similar tickets found."

In [12]:
# Agent setup
# Tools the agent may call while working on an email.
tools = [rag_retrieval, draft_support_response, search_past_similar_tickets]

# System instructions; split into fragments only to keep the lines short.
system_instructions = (
    "You are an AI assistant for customer support. "
    "Use the tools to analyze the email, retrieve similar historical cases, and search past tickets. "
    "Based on the analysis, create suggestions on how to fix the issue described in the email. "
    "Output structured JSON with diagnostics and fix suggestions."
)

# The conversation so far is injected after the system message.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_instructions),
    MessagesPlaceholder("messages"),
])

# NOTE: the recorded run of this cell emits LangGraphDeprecatedSinceV10 —
# create_react_agent is deprecated in favour of `langchain.agents.create_agent`.
agent = create_react_agent(llm, tools, prompt=prompt)

/var/folders/ww/d2xxjh3s0y3dhb878qz0n5z80000gn/T/ipykernel_44219/3540264450.py:12: LangGraphDeprecatedSinceV10: create_react_agent has been moved to `langchain.agents`. Please update your import to `from langchain.agents import create_agent`. Deprecated in LangGraph V1.0 to be removed in V2.0.
  agent = create_react_agent(llm, tools, prompt=prompt)


In [13]:
# Execution example: run another sample email through parse -> classify -> agent.
email_test = load_pdf('../backend/test_data/gmail_style_email_4.pdf')
email_data = extract_email_data(email_test)
email_clf = classify_email(email_data)

# The parsed fields and the classification go to the agent as one user message.
request_text = (
    f"Process this email: subject '{email_data['subject']}', "
    f"body '{email_data['body']}', "
    f"urgency '{email_clf['urgency']}', "
    f"topic '{email_clf['topic']}'."
)
initial_state = {"messages": [HumanMessage(content=request_text)]}

result = agent.invoke(initial_state)

# The last message in the returned state is the agent's final answer.
final_message = result['messages'][-1]
print(final_message.content)

{
  "email_data": {
    "subject": "Support Ticket #4832 – No Response Yet",
    "body": "Hi Support Team,\nI’m following up on ticket #4832, which we opened last week. We haven’t received any response so far,\nand the issue is still blocking our workflow.\nCould someone please take a look and let us know when we can expect help?\nThank you,\nDavid Morales\nProduct Lead, ClearPath AI",
    "urgency": "P2",
    "topic": "support"
  },
  "diagnostics": {
    "issue_type": "follow-up on unanswered ticket",
    "ticket_id": "4832",
    "customer_sentiment": "frustrated",
    "impact": "blocking workflow",
    "time_since_open": "last week",
    "similar_cases": "No highly similar historical cases found. Retrieved case on platform outage (P1) not matching.",
    "past_tickets": "No similar tickets found for topic 'support' and urgency 'P2'."
  },
  "fix_suggestions": [
    "Immediately retrieve and review the details of ticket #4832 to understand the original issue.",
    "Assign the ticket