<a href="https://colab.research.google.com/github/dkisselev-zz/llm_engineering/blob/wk5-excersise/week5/community-contributions/dkisselev-zz/Week5_Excerise_EmailTerminator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gmail Terminator

## An Intelligent Email Management System

This application uses RAG (Retrieval Augmented Generation) and LLMs to analyze your Gmail inbox, identify important topics and interests, and help you safely delete unimportant emails with archiving.

### Features:
- **IMAP Authentication**: Secure app-specific password authentication
- **Vector Embeddings**: OpenAI or BERT/HuggingFace models
- **Topic Analysis**: LLM-powered identification of your interests
- **Category Counts**: See breakdown of email categories
- **Chat-Based Topic Updates**: Use chat to find specific topics of interest
- **Selective Deletion**: Choose specific emails to delete with checkboxes
- **Safe Deletion**: Automatic archiving before deletion
- **Testing Mode**: Process limited emails with debug output

### Architecture:
1. Connect to Gmail via IMAP
2. Fetch and parse emails
3. Chunk text and create embeddings
4. Store vectors in ChromaDB
5. Use LLM to identify important topics
6. Classify emails as keep/delete
7. Select specific emails to delete
8. Archive and safely delete selected emails

## Setup Instructions

### IMAP with App-Specific Password

1. **Enable 2-Factor Authentication** on your Google account (required for app passwords)
2. **Create App-Specific Password**
   - Go to [Google Account Security](https://myaccount.google.com/security)
   - Under "2-Step Verification", find "App passwords"
   - Generate a new app password for "Mail"
3. **Store Credentials**
   - **Google Colab**: Store as secrets named `EMAIL` and `IMAP_PASSWORD`
   - **Local**: Add to `.env` file:
     ```
     EMAIL=your.email@gmail.com
     IMAP_PASSWORD=your_16_char_app_password
     ```
4. **Connect**: If credentials are stored, they will auto-populate in the UI

## Install and Setup

In [None]:
%pip install -U -q imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers


In [None]:
# Standard library imports
import os
import json
import base64
import zipfile
import shutil
from datetime import datetime
from collections import Counter
from typing import List, Dict, Optional, Tuple
from abc import ABC, abstractmethod

# Third-party imports
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup

# IMAP imports
import imaplib
import email
from email.header import decode_header

# LangChain v1.0+ imports
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.callbacks import StdOutCallbackHandler

# LLM APIs
from openai import OpenAI

# HuggingFace
from huggingface_hub import login

# Gradio
import gradio as gr


In [None]:
def setup_api_keys():
    """Load API keys and Gmail credentials, then build the LLM clients.

    Secrets are read from Google Colab's ``userdata`` store when available;
    if that fails for any reason (not running in Colab, a secret is missing,
    access not granted) the function falls back to a local ``.env`` file /
    process environment.

    Returns:
        A tuple ``(api_keys, clients, email, password)`` where ``api_keys``
        maps provider name to key (or ``None`` when absent), ``clients`` maps
        provider name to an ``OpenAI``-compatible client for each key that is
        present, and ``email`` / ``password`` are the stored Gmail IMAP
        credentials.
    """
    env_names = {
        'openai': 'OPENAI_API_KEY',
        'anthropic': 'ANTHROPIC_API_KEY',
        'google': 'GOOGLE_API_KEY',
        'hf_token': 'HF_TOKEN',
    }

    try:
        # Try Colab environment first
        from google.colab import userdata
        api_keys = {name: userdata.get(var) for name, var in env_names.items()}
        email = userdata.get('EMAIL')
        password = userdata.get('IMAP_PASSWORD')
        print("✅ Using Colab secrets")
    except Exception:
        # Fallback to local environment. `Exception` (not a bare except) so
        # Ctrl-C / SystemExit are not mistaken for "not running in Colab".
        from dotenv import load_dotenv
        load_dotenv()
        api_keys = {name: os.getenv(var) for name, var in env_names.items()}

        email = os.getenv('EMAIL', '')
        password = os.getenv('IMAP_PASSWORD', '')
        print("✅ Using local .env file")

    # Initialize API clients (Anthropic and Gemini are reached through their
    # OpenAI-compatible endpoints)
    anthropic_url = "https://api.anthropic.com/v1/"
    gemini_url = "https://generativelanguage.googleapis.com/v1beta/openai/"

    clients = {}
    if api_keys['openai']:
        clients['openai'] = OpenAI(api_key=api_keys['openai'])
    if api_keys['anthropic']:
        clients['anthropic'] = OpenAI(api_key=api_keys['anthropic'], base_url=anthropic_url)
    if api_keys['google']:
        clients['google'] = OpenAI(api_key=api_keys['google'], base_url=gemini_url)
    if api_keys['hf_token']:
        login(api_keys['hf_token'])

    # Export only the keys that exist: assigning None to os.environ raises
    # TypeError, which previously crashed setup when any key was missing.
    for name in ('openai', 'anthropic', 'google'):
        if api_keys[name]:
            os.environ[env_names[name]] = api_keys[name]

    return api_keys, clients, email, password

# Initialize API keys and clients
# Runs when the cell executes: loads secrets, builds the provider clients and
# captures the stored Gmail credentials returned by setup_api_keys().
api_keys, clients, default_email, default_password = setup_api_keys()

# Constants
# MODEL_OPENAI: chat model name passed to ChatOpenAI in EmailProcessor.create_llm.
# MODEL_GEMINI: NOTE(review): not referenced in the visible part of this file.
# DB_NAME: default Chroma persist directory used by VectorDatabaseManager.
MODEL_OPENAI = "gpt-4o-mini"
MODEL_GEMINI = "gemini-2.5-pro"
DB_NAME = "email_vector_db"


## Helper Functions

In [None]:
def get_header_value(headers, name):
    """Return the value of the first header whose name matches *name*.

    The comparison is case-insensitive. Each entry of *headers* is indexed
    with the keys ``'name'`` and ``'value'``. An empty string is returned
    when no header matches.
    """
    matching_values = (
        entry['value']
        for entry in headers
        if entry['name'].lower() == name.lower()
    )
    return next(matching_values, "")

## Gmail Connection Classes

In [None]:
class GmailConnection(ABC):
    """Abstract base class for Gmail connections.

    Subclasses implement connect(), fetch_emails() and delete_emails().
    ``connection`` holds the live transport object (``None`` while
    disconnected) and ``auth_info`` holds details about the authenticated
    session (``None`` until a subclass populates it).
    """

    def __init__(self):
        self.connection = None
        self.auth_info = None

    @abstractmethod
    def connect(self) -> bool:
        """Open the connection. Returns True on success, False otherwise."""
        pass

    # Marked abstract like connect()/delete_emails(): without it a subclass
    # that forgot to implement fetching would silently return None instead of
    # the documented (documents, diagnostic_message) tuple.
    @abstractmethod
    def fetch_emails(self, max_emails: Optional[int] = None) -> Tuple[List[Document], str]:
        """Fetch emails. Returns (documents, diagnostic_message)."""
        pass

    @abstractmethod
    def delete_emails(self, documents: List[Document]) -> Tuple[int, int]:
        """Delete the given emails. Returns (successful_count, failed_count)."""
        pass

    def get_auth_info(self) -> Optional[Dict]:
        """Return the session info dict, or None if not yet authenticated."""
        return self.auth_info

    def is_connected(self) -> bool:
        """Return True when a connection object is currently held."""
        return self.connection is not None


class IMAPConnection(GmailConnection):
    """IMAP Gmail connection.

    IMPORTANT: For proper email deletion with Gmail IMAP, configure these settings:
    1. Go to Gmail Settings → Forwarding and POP/IMAP tab
    2. Under "When I mark a message in IMAP as deleted":
       - Set to "Auto-Expunge off - Wait for the client to update the server"
    3. Under "When a message is marked as deleted and expunged from the last visible IMAP folder":
       - Select "Move the message to the Trash"
    4. Make sure "Trash" label is set to "Show in IMAP" under Labels settings

    This ensures deleted emails are properly moved to Trash when expunged.
    """

    def __init__(self, email_address: str, app_password: str):
        super().__init__()
        self.email_address = email_address
        self.app_password = app_password

    def connect(self) -> bool:
        """Authenticate with Gmail using IMAP and select INBOX.

        Returns True on success. On any failure the half-open connection is
        discarded so that is_connected() does not report an unusable session.
        """
        try:
            # Raise imaplib's per-line response cap so large messages can be fetched.
            imaplib._MAXLINE = 10000000  # 10MB

            self.connection = imaplib.IMAP4_SSL("imap.gmail.com", 993)
            self.connection.login(self.email_address, self.app_password)

            status, messages = self.connection.select("INBOX")
            if status == "OK":
                self.auth_info = {
                    'email': self.email_address,
                    'total_messages': int(messages[0]),
                    'auth_method': 'IMAP'
                }

                print(f"✓ IMAP connected as: {self.email_address}")
                print(f"✓ Total messages in INBOX: {self.auth_info['total_messages']:,}")
                return True

            print(f"❌ Failed to select INBOX: {status}")
            self._discard_connection()
            return False

        except Exception as e:
            print(f"❌ IMAP authentication failed: {e}")
            print("Make sure you're using an app-specific password.")
            self._discard_connection()
            return False

    def _discard_connection(self) -> None:
        """Drop the current connection object, logging out on a best-effort basis."""
        conn, self.connection = self.connection, None
        if conn is None:
            return
        try:
            conn.logout()
        except (imaplib.IMAP4.error, OSError):
            # The session is already unusable; nothing more to clean up.
            pass

    @staticmethod
    def _decode_bytes(raw: bytes, charset: Optional[str]) -> str:
        """Decode *raw* with *charset*, falling back to UTF-8 for unknown charsets."""
        try:
            return raw.decode(charset or 'utf-8', errors='ignore')
        except LookupError:
            return raw.decode('utf-8', errors='ignore')

    @classmethod
    def _decode_subject(cls, raw_subject):
        """Decode an RFC 2047 encoded header into text, joining every fragment."""
        if not raw_subject:
            return raw_subject
        pieces = []
        for fragment, charset in decode_header(raw_subject):
            if isinstance(fragment, bytes):
                pieces.append(cls._decode_bytes(fragment, charset))
            else:
                pieces.append(fragment)
        return ''.join(pieces)

    @classmethod
    def _part_text(cls, part) -> Optional[str]:
        """Return the decoded text payload of *part*, or None if it has none."""
        try:
            payload = part.get_payload(decode=True)
            if not payload:
                return None
            return cls._decode_bytes(payload, part.get_content_charset())
        except Exception:
            # Best effort: a malformed part is skipped rather than failing the email.
            return None

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip markup from *html*; returns "" if parsing fails."""
        try:
            return BeautifulSoup(html, 'html.parser').get_text()
        except Exception:
            return ""

    @classmethod
    def _extract_body(cls, email_message) -> str:
        """Extract a text body, preferring text/plain over text/html."""
        if email_message.is_multipart():
            body = ""
            for part in email_message.walk():
                content_type = part.get_content_type()
                if content_type == "text/plain":
                    text = cls._part_text(part)
                    if text is not None:
                        # First readable plain-text part wins.
                        body = text
                        break
                elif content_type == "text/html" and not body:
                    html = cls._part_text(part)
                    if html is not None:
                        body = cls._html_to_text(html)
            return body

        try:
            payload = email_message.get_payload(decode=True)
            if not payload:
                # Try without decoding for plain text
                return str(email_message.get_payload())
            text = cls._decode_bytes(payload, email_message.get_content_charset())
            if email_message.get_content_type() == "text/html":
                text = BeautifulSoup(text, 'html.parser').get_text()
            return text
        except Exception:
            # Caller falls back to the subject when the body is empty.
            return ""

    def fetch_emails(self, max_emails: Optional[int] = None) -> Tuple[List[Document], str]:
        """Fetch emails using IMAP with UIDs. Returns (documents, diagnostic_message).

        Args:
            max_emails: When truthy, only the last ``max_emails`` UIDs returned
                by the search are fetched; otherwise every message is fetched.

        Raises:
            RuntimeError: If connect() has not succeeded.
        """
        if not self.connection:
            raise RuntimeError("Not connected. Call connect() first.")

        diagnostics = []  # Capture diagnostic messages

        try:
            self.connection.select("INBOX")

            status, messages = self.connection.uid('search', None, "ALL")

            if status != "OK":
                diagnostics.append(f"❌ Search failed with status: {status}")
                return [], "\n".join(diagnostics)

            msg_uids = messages[0].split()
            diagnostics.append(f"✓ Found {len(msg_uids)} message UIDs")

            if not msg_uids:
                diagnostics.append("❌ No message UIDs returned from search")
                return [], "\n".join(diagnostics)

            if max_emails:
                msg_uids = msg_uids[-max_emails:]  # Get most recent
                diagnostics.append(f"  → Limited to {len(msg_uids)} most recent emails")

            diagnostics.append(f"Fetching {len(msg_uids)} emails...")
            documents = []
            errors = []

            for uid in tqdm(msg_uids, desc="Processing emails"):
                # UIDs arrive as bytes; use the text form in metadata and messages.
                uid_text = uid.decode(errors='replace')
                try:
                    # Fetch using UID to get both UID and the email content
                    status, msg_data = self.connection.uid('fetch', uid, "(RFC822)")
                    if status != "OK":
                        errors.append(f"Fetch failed for UID {uid_text}: {status}")
                        continue

                    # Check if msg_data is valid
                    if not msg_data or not msg_data[0] or len(msg_data[0]) < 2:
                        errors.append(f"Invalid msg_data for UID {uid_text}")
                        continue

                    email_message = email.message_from_bytes(msg_data[0][1])

                    # Extract headers
                    subject = self._decode_subject(email_message.get("Subject", ""))
                    sender = email_message.get("From", "")
                    recipient = email_message.get("To", "")
                    date_str = email_message.get("Date", "")

                    # Extract body and collapse whitespace
                    body = self._extract_body(email_message)
                    if body:
                        body = ' '.join(body.split())

                    # Use subject if body is empty or too short
                    if not body or len(body) < 10:
                        body = subject or "No content"

                    content = f"Subject: {subject}\nFrom: {sender}\nTo: {recipient}\nDate: {date_str}\n\n{body}"

                    documents.append(Document(
                        page_content=content,
                        metadata={
                            'uid': uid_text,
                            'message_id': uid_text,
                            'subject': subject,
                            'sender': sender,
                            'recipient': recipient,
                            'date': date_str,
                            'source': 'gmail_imap'
                        }
                    ))

                except Exception as e:
                    errors.append(f"Error processing UID {uid_text}: {str(e)}")
                    continue

            diagnostics.append(f"✓ Successfully fetched {len(documents)} emails out of {len(msg_uids)} attempted")

            if errors:
                diagnostics.append(f"\n⚠️ Encountered {len(errors)} errors:")
                # Show first 5 errors
                for err in errors[:5]:
                    diagnostics.append(f"  • {err}")
                if len(errors) > 5:
                    diagnostics.append(f"  ... and {len(errors) - 5} more errors")

            if len(documents) == 0 and len(msg_uids) > 0:
                diagnostics.append("\n⚠️ WARNING: No documents created despite having UIDs")

            return documents, "\n".join(diagnostics)

        except Exception as error:
            diagnostics.append(f"❌ Fetch error: {error}")
            import traceback
            diagnostics.append(f"\nTraceback:\n{traceback.format_exc()}")
            return [], "\n".join(diagnostics)

    def delete_emails(self, documents: List[Document]) -> Tuple[int, int]:
        """Delete emails using IMAP with proper UID handling for Gmail.

        This method works with Gmail's "Auto-Expunge off" setting by:
        1. Using UIDs instead of sequence numbers for reliable identification
        2. Marking emails with \\Deleted flag
        3. Explicitly calling EXPUNGE to permanently remove them

        Whether expunged messages end up in [Gmail]/Trash is decided by the
        Gmail account settings described on the class, not by this code.
        Note that EXPUNGE applies to every message flagged \\Deleted in INBOX.

        Returns:
            (successful, failed): counts of messages flagged / not flagged.

        Raises:
            RuntimeError: If connect() has not succeeded.
        """
        if not self.connection:
            raise RuntimeError("Not connected. Call connect() first.")

        if not documents:
            return 0, 0

        successful, failed = 0, 0
        print(f"Deleting {len(documents)} emails via IMAP...")

        try:
            # Select INBOX in read-write mode (default)
            status, response = self.connection.select("INBOX")
            if status != "OK":
                print(f"❌ Failed to select INBOX: {response}")
                return 0, len(documents)

            for doc in tqdm(documents, desc="Marking emails for deletion"):
                # Try to get UID first, fall back to message_id
                uid = doc.metadata.get('uid') or doc.metadata.get('message_id')
                if not uid:
                    print(f"⚠️ No UID found for email: {doc.metadata.get('subject', 'Unknown')}")
                    failed += 1
                    continue

                # Text form for messages, bytes form for the IMAP command.
                uid_text = uid.decode(errors='replace') if isinstance(uid, bytes) else str(uid)
                try:
                    if isinstance(uid, str):
                        uid = uid.encode()

                    # Use UID STORE to mark the email as deleted
                    # This is more reliable than using sequence numbers
                    status, response = self.connection.uid('STORE', uid, '+FLAGS', '(\\Deleted)')

                    if status == "OK":
                        successful += 1
                    else:
                        print(f"⚠️ Failed to mark UID {uid_text}: {response}")
                        failed += 1

                except Exception as e:
                    print(f"❌ Error deleting UID {uid_text}: {e}")
                    failed += 1

            # Expunge to permanently delete all messages marked as \\Deleted
            # With Gmail's "Auto-Expunge off", this command is required
            print(f"\n📤 Expunging {successful} deleted emails...")
            try:
                status, response = self.connection.expunge()
                if status == "OK":
                    print(f"✓ Expunge successful: {response}")
                else:
                    print(f"⚠️ Expunge response: {status} - {response}")
            except Exception as e:
                print(f"❌ Expunge error: {e}")

            # Close and reselect to ensure changes are committed
            try:
                self.connection.close()
                self.connection.select("INBOX")
            except Exception as e:
                # Not critical for the deletion itself, but worth surfacing.
                print(f"⚠️ Could not reselect INBOX after deletion: {e}")

            print(f"\n✓ Deletion complete: {successful} successful, {failed} failed")
            if successful > 0:
                print(f"ℹ️  With Gmail's settings, deleted emails should appear in [Gmail]/Trash")

            return successful, failed

        except Exception as error:
            print(f"❌ Delete operation error: {error}")
            return successful, failed


def create_gmail_connection(email: str, password: str) -> GmailConnection:
    """Build an IMAP-backed Gmail connection from the given credentials.

    Raises:
        ValueError: If either the email or the password is empty.
    """
    have_credentials = bool(email) and bool(password)
    if have_credentials:
        return IMAPConnection(email, password)
    raise ValueError("Email and password required for IMAP")

## Vector Database Manager

In [None]:
class VectorDatabaseManager:
    """Manages vector database operations for email embeddings.

    Wraps a Chroma store persisted under ``db_name`` together with the
    embedding function used to build and query it.
    """

    def __init__(self, db_name: str = DB_NAME):
        self.db_name = db_name
        self.vectorstore = None
        self.embeddings = None

    def create_embeddings(self, model_type: str = "openai"):
        """Create embedding function based on model type.

        Args:
            model_type: ``"openai"`` or ``"bert"`` (case-insensitive).

        Raises:
            ValueError: For any other model type.
        """
        kind = model_type.lower()
        if kind == "openai":
            print("Using OpenAI embeddings...")
            self.embeddings = OpenAIEmbeddings()
        elif kind == "bert":
            print("Using BERT (HuggingFace) embeddings...")
            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
        else:
            raise ValueError(f"Unknown model type: {model_type}. Use 'openai' or 'bert'.")

        return self.embeddings

    def create_vector_store(self, chunks: List[Document], recreate: bool = True):
        """Chroma vector store from document chunks.

        Args:
            chunks: Documents to embed and store.
            recreate: When True and the persist directory exists, the existing
                collection is deleted first.

        Raises:
            RuntimeError: If create_embeddings() has not been called.
        """
        if not self.embeddings:
            raise RuntimeError("Call create_embeddings() first")

        if recreate and os.path.exists(self.db_name):
            print(f"Deleting existing database: {self.db_name}")
            try:
                Chroma(persist_directory=self.db_name, embedding_function=self.embeddings).delete_collection()
            except Exception as e:
                # Keep going (best effort), but say so: if the old collection
                # survives, the new store will also contain the stale vectors.
                print(f"⚠️ Could not delete existing collection: {e}")

        print(f"Creating vector store with {len(chunks)} chunks")
        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.db_name
        )

        count = self.vectorstore._collection.count()
        print(f"Vector store created with {count:,} documents")

        return self.vectorstore

    def load_vector_store(self):
        """Load existing Chroma vector store.

        Raises:
            RuntimeError: If create_embeddings() has not been called.
            FileNotFoundError: If the persist directory does not exist.
        """
        if not self.embeddings:
            raise RuntimeError("Call create_embeddings() first")

        if not os.path.exists(self.db_name):
            raise FileNotFoundError(f"Vector store not found: {self.db_name}")

        self.vectorstore = Chroma(
            persist_directory=self.db_name,
            embedding_function=self.embeddings
        )

        count = self.vectorstore._collection.count()
        print(f"Loaded vector store with {count:,} documents")

        return self.vectorstore

    def get_vectorstore(self):
        """Get the vectorstore instance."""
        return self.vectorstore

## Email Processor

In [None]:
class EmailProcessor:
    """Chunks, analyzes, classifies and archives fetched email documents.

    State kept between calls: the fetched ``documents`` and their ``chunks``,
    the ``llm`` instance, the raw ``topics`` text returned by the LLM, the
    keep/delete split in ``classified_emails``, and the topic mappings
    ``topic_to_emails`` / ``email_to_topic`` (keyed by ``message_id``).
    """

    def __init__(self):
        self.documents = []
        self.chunks = []
        self.llm = None
        self.topics = ""
        self.classified_emails = {'keep': [], 'delete': []}
        self.topic_to_emails = {}
        self.email_to_topic = {}

    def chunk_documents(self, documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200):
        """Chunk email documents and remember both the documents and the chunks."""
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        self.documents = documents
        self.chunks = text_splitter.split_documents(documents)
        print(f"Created {len(self.chunks)} chunks from {len(documents)} documents")
        return self.chunks

    def get_statistics(self, documents: List[Document]) -> Dict:
        """Calculate statistics. Returns an empty dict for an empty input."""
        if not documents:
            return {}

        senders = [doc.metadata.get('sender', '') for doc in documents]
        total_chars = sum(len(doc.page_content) for doc in documents)

        return {
            'total_emails': len(documents),
            'total_chars': total_chars,
            'avg_email_length': total_chars // len(documents),
            'unique_senders': len(set(senders)),
            'top_senders': Counter(senders).most_common(10)
        }

    def create_llm(self, model_type: str = "openai", temperature: float = 0.7, debug: bool = False):
        """Create LLM instance.

        Every ``model_type`` currently resolves to the OpenAI chat model
        (``MODEL_OPENAI``). The debug callbacks are attached regardless of
        ``model_type``; previously they were dropped for non-"openai" values.
        """
        callbacks = [StdOutCallbackHandler()] if debug else []

        self.llm = ChatOpenAI(
            temperature=temperature,
            model_name=MODEL_OPENAI,
            callbacks=callbacks
        )

        return self.llm

    def analyze_personal_interests(self, documents: List[Document]) -> str:
        """Analyze personal interests using LLM.

        Raises:
            RuntimeError: If create_llm() has not been called.
        """
        if not self.llm:
            raise RuntimeError("Call create_llm() first")

        prompt = self._generate_topics_prompt(documents)
        response = self.llm.invoke([HumanMessage(content=prompt)])
        self.topics = response.content
        return self.topics

    def _generate_topics_prompt(self, documents: List[Document], user_context: Optional[str] = None) -> str:
        """Generate LLM prompt for topic identification."""
        senders = [doc.metadata.get('sender', '') for doc in documents]
        subjects = [doc.metadata.get('subject', '') for doc in documents]
        sender_counts = Counter(senders).most_common(20)

        context_line = f'Based on the user\'s query: "{user_context}"\n\n' if user_context else ""

        prompt = f"""
{context_line}I have {len(documents)} emails. Analyze and identify 5-10 important topics/categories.

Top senders:
{chr(10).join([f"- {sender}: {count}" for sender, count in sender_counts])}

Sample subjects (first 30):
{chr(10).join([f"- {subj}" for subj in subjects[:30]])}

IMPORTANT: Format your response as a simple numbered list with ONLY the topic names, one per line.
Do NOT use markdown formatting (**, *, etc.).
Do NOT add descriptions or explanations after the topic name.
Do NOT add blank lines between topics.

Example format:
1. Work Projects
2. Family Communications
3. Professional Development
"""

        if user_context:
            prompt += f"\n\nYour response should list topics that align with the user's query about: {user_context}"

        return prompt

    def extract_topics_from_text(self, topics_text: str) -> List[str]:
        """Extract topic list from LLM-generated topics text.

        Returns at most 10 topic names, with numbering, bullets, markdown
        markers and trailing descriptions removed.
        """
        topics = []

        for raw_line in topics_text.strip().split('\n'):
            line = raw_line.strip()

            # Skip empty lines
            if not line or len(line) < 3:
                continue

            # Skip lines that are clearly descriptions (start with lowercase, or too long)
            if line[0].islower() or line.startswith(('Emails', 'Topics', 'Information', 'Communications', 'Offers')):
                continue

            # Remove markdown formatting (**, *, _)
            line = line.replace('**', '').replace('*', '').replace('_', '')

            # Remove numbering and bullet points
            if line and line[0].isdigit():
                # Strip a leading "1." or "1)" only. Splitting on the first '.'
                # anywhere in the line would mangle "1) Dr. Smith" into "Smith".
                remainder = line.lstrip('0123456789').lstrip()
                if remainder[:1] in ('.', ')'):
                    line = remainder[1:].strip()
            elif line.startswith(('-', '•')):
                line = line[1:].strip()

            # Take only the topic name (before any dash or colon describing it)
            if ' - ' in line:
                topic = line.split(' - ')[0].strip()
            elif ':' in line:
                topic = line.split(':')[0].strip()
            else:
                topic = line.strip()

            # Validate: reasonable length for a topic name (not a full sentence/description)
            # Accepts names of 6 to 59 characters
            if topic and 5 < len(topic) < 60 and not topic.lower().startswith('based on'):
                topics.append(topic)

        return topics[:10]  # Limit to top 10 topics

    def categorize_emails_by_topics(self, documents: List[Document], vectorstore) -> Dict[str, List[Document]]:
        """Categorize emails by matching them to identified topics using RAG.

        Each email is assigned to at most one topic (the first topic whose
        similarity search returns it); the rest go under ``'Uncategorized'``.
        Also stores the ``message_id -> topic`` mapping on ``self.email_to_topic``.
        Returns an empty dict when there are no topics, no vectorstore or no
        documents.
        """
        if not self.topics or not vectorstore:
            return {}

        # Nothing to categorize; also avoids asking the retriever for k=0 results.
        if not documents:
            return {}

        # Extract topic list from the topics text
        topic_list = self.extract_topics_from_text(self.topics)

        if not topic_list:
            return {}

        # For each topic, find matching emails using vector similarity
        topic_to_emails = {topic: [] for topic in topic_list}
        topic_to_emails['Uncategorized'] = []

        # Track which emails have been matched to which topic
        matched_email_ids = set()
        email_to_topic = {}  # Map message_id to topic name

        # message_id -> first document carrying it, so each retrieved chunk
        # resolves to its source email in O(1) instead of a linear scan.
        docs_by_id = {}
        for doc in documents:
            docs_by_id.setdefault(doc.metadata.get('message_id'), doc)

        retriever = vectorstore.as_retriever(search_kwargs={"k": len(documents)})

        # Take top matches (based on proportion of total emails - ~15% per topic)
        num_matches = max(1, int(len(documents) * 0.15))

        for topic in topic_list:
            # Query vectorstore for emails matching this topic
            query = f"Emails about: {topic}"
            relevant_docs = retriever.invoke(query)

            for hit in relevant_docs[:num_matches]:
                msg_id = hit.metadata.get('message_id')
                if msg_id and msg_id not in matched_email_ids:
                    # Find the original document
                    original_doc = docs_by_id.get(msg_id)
                    if original_doc:
                        topic_to_emails[topic].append(original_doc)
                        matched_email_ids.add(msg_id)
                        email_to_topic[msg_id] = topic

        # Add uncategorized emails
        for doc in documents:
            msg_id = doc.metadata.get('message_id')
            if msg_id not in matched_email_ids:
                topic_to_emails['Uncategorized'].append(doc)
                email_to_topic[msg_id] = 'Uncategorized'

        # Store the mapping for use in dataframe creation
        self.email_to_topic = email_to_topic

        return topic_to_emails

    def get_topic_counts_display(self, documents: List[Document], vectorstore) -> str:
        """Get formatted topic counts for display."""
        if not self.topics or not vectorstore:
            return "No topics identified yet."

        topic_to_emails = self.categorize_emails_by_topics(documents, vectorstore)

        counts_text = "Email Counts by Identified Topic:\n\n"

        # Sort by count, descending
        sorted_topics = sorted(topic_to_emails.items(), key=lambda x: len(x[1]), reverse=True)

        for topic, emails in sorted_topics:
            count = len(emails)
            if count > 0:
                counts_text += f"  {topic}: {count} emails\n"

        total = sum(len(emails) for emails in topic_to_emails.values())
        counts_text += f"\n  Total: {total} emails"

        return counts_text

    def classify_emails(self, documents: List[Document], vectorstore, threshold: float = 0.5):
        """Classify emails based on identified topics.

        Emails matching identified topics → KEEP
        Emails not matching any topic → DELETE candidates

        ``threshold`` is accepted for interface compatibility but is not used
        by the current implementation.

        Raises:
            RuntimeError: If analyze_personal_interests() has not produced topics.
        """
        if not self.topics:
            raise RuntimeError("Call analyze_personal_interests() first")

        # Categorize emails by topics
        topic_to_emails = self.categorize_emails_by_topics(documents, vectorstore)

        # Emails matching topics are KEPT
        keep_emails = []
        for topic, emails in topic_to_emails.items():
            if topic != 'Uncategorized':
                keep_emails.extend(emails)

        # Uncategorized emails are DELETE candidates
        delete_candidates = topic_to_emails.get('Uncategorized', [])

        # Store topic categorization for counts display
        self.topic_to_emails = topic_to_emails

        self.classified_emails = {'keep': keep_emails, 'delete': delete_candidates}

        print(f"Classification: {len(keep_emails)} keep, {len(delete_candidates)} delete")
        print(f"Matched to {len([t for t in topic_to_emails.keys() if t != 'Uncategorized'])} topics")
        return self.classified_emails

    def create_archive(self, documents: List[Document], archive_name: Optional[str] = None) -> str:
        """Create ZIP archive of emails (one JSON file per email).

        Args:
            documents: Emails to archive.
            archive_name: Output path; defaults to a timestamped
                ``email_archive_YYYYMMDD_HHMMSS.zip``.

        Returns:
            The archive path.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("No documents to archive")

        if not archive_name:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            archive_name = f"email_archive_{timestamp}.zip"

        # Serialize everything first so a serialization error leaves no
        # partial archive behind.
        entries = []
        for i, doc in enumerate(documents):
            email_data = {'metadata': doc.metadata, 'content': doc.page_content}
            subject = (doc.metadata.get('subject') or 'no_subject')[:50]
            safe_subject = "".join(c for c in subject if c.isalnum() or c in (' ', '-', '_')).strip()
            filename = f"{i+1:04d}_{safe_subject}.json"
            entries.append((filename, json.dumps(email_data, indent=2, ensure_ascii=False)))

        # Write straight into the ZIP. The previous fixed temp directory could
        # carry leftovers from a failed run into the archive and was never
        # cleaned up when an error occurred.
        with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for filename, payload in entries:
                zipf.writestr(filename, payload)

        print(f"Archive created: {archive_name}")
        return archive_name

    def emails_to_dataframe(self, documents: List[Document], add_select_column: bool = False) -> pd.DataFrame:
        """Convert to DataFrame with Topics column.

        Topics come from ``self.email_to_topic`` ('Unknown' when an email has
        no entry). With ``add_select_column`` a boolean ``Select`` column,
        initialised to False, is inserted as the first column.
        """
        data = [
            {
                'Topics': self.email_to_topic.get(doc.metadata.get('message_id', ''), 'Unknown'),
                'Message ID': doc.metadata.get('message_id', ''),
                'Subject': doc.metadata.get('subject', '')[:100],
                'Sender': doc.metadata.get('sender', ''),
                'Length': len(doc.page_content)
            }
            for doc in documents
        ]
        df = pd.DataFrame(data)

        if add_select_column:
            # Add Select column as first column
            df.insert(0, 'Select', False)

        return df

## Application State

In [None]:
class AppState:
    """Holds the objects and flags shared by the callback functions."""

    def __init__(self):
        # No mailbox connection until one is created explicitly.
        self.gmail_conn: Optional[GmailConnection] = None

        # Helper objects, constructed once per AppState instance.
        self.vector_db_manager = VectorDatabaseManager()
        self.email_processor = EmailProcessor()

        # Both mode flags start disabled.
        self.testing_mode = self.debug_mode = False


# Module-level instance of the application state.
state = AppState()

## Gradio Callback Functions

In [None]:
def connect_imap(email, password):
    """Create a Gmail connection for the given credentials and try to connect.

    The connection object is stored on ``state.gmail_conn`` before the connect
    attempt, so it is replaced even when authentication fails.

    Args:
        email: Address passed to ``create_gmail_connection``.
        password: Password passed to ``create_gmail_connection``.

    Returns:
        A status string: the connected address and total message count on
        success, otherwise an "❌"-prefixed failure or error message.
    """
    # NOTE: this function used to be defined twice with identical bodies; the
    # redundant second definition has been removed.
    try:
        state.gmail_conn = create_gmail_connection(email, password)
        if state.gmail_conn.connect():
            info = state.gmail_conn.get_auth_info()
            return f"Connected as {info['email']}\nTotal messages: {info['total_messages']:,}"
        return "❌ Authentication failed"
    except Exception as e:
        # Callback boundary: report the problem in the status box instead of raising.
        return f"❌ Error: {str(e)}"


def fetch_and_process(testing_mode, embedding_model):
    """Fetch emails, chunk them and build the vector store.

    Args:
        testing_mode: When truthy, at most 50 emails are requested; otherwise
            no limit is passed to the connection.
        embedding_model: Name handed to ``create_embeddings``.

    Returns:
        A multi-line status string: a summary on success, or an "❌" message
        (with traceback for unexpected errors) on failure.
    """
    try:
        conn = state.gmail_conn
        if not conn or not conn.is_connected():
            return "❌ Not authenticated"

        state.testing_mode = testing_mode
        max_emails = None
        if testing_mode:
            max_emails = 50

        documents, fetch_diagnostics = conn.fetch_emails(max_emails)
        if not documents:
            return f"❌ No emails fetched\n\n{fetch_diagnostics}"

        processor = state.email_processor
        stats = processor.get_statistics(documents)
        chunks = processor.chunk_documents(documents)

        db_manager = state.vector_db_manager
        db_manager.create_embeddings(embedding_model)
        db_manager.create_vector_store(chunks)

        # One "  - sender: count" line per sender, first five entries only.
        sender_lines = "\n".join(
            f"  - {sender}: {count}" for sender, count in stats['top_senders'][:5]
        )
        return (
            "✓ Processing completed!\n"
            "\n"
            f"{fetch_diagnostics}\n"
            "\n"
            f"Total emails: {stats['total_emails']}\n"
            f"Chunks created: {len(chunks)}\n"
            "Top 5 senders:\n"
            f"{sender_lines}\n"
        )
    except Exception as e:
        import traceback
        return f"❌ Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"


def analyze_topics(llm_model, threshold):
    """Identify topics with the LLM, then classify the loaded emails.

    Args:
        llm_model: Name handed to ``create_llm``.
        threshold: Value forwarded to ``classify_emails``.

    Returns:
        A 4-tuple ``(topics_text, counts_text, keep_df, delete_df)``. On
        failure the first element is an "❌" message, the second is empty and
        both frames are None.
    """
    try:
        processor = state.email_processor
        db_manager = state.vector_db_manager

        if not processor.documents:
            return "❌ No documents loaded", "", None, None

        processor.create_llm(llm_model)
        topics = processor.analyze_personal_interests(processor.documents)

        # Classification runs right after the analysis.
        classified = processor.classify_emails(
            processor.documents, db_manager.vectorstore, threshold
        )

        # Per-topic counts are computed after classification.
        counts_text = processor.get_topic_counts_display(
            processor.documents, db_manager.vectorstore
        )

        # Numbered list of the topics extracted from the analysis text.
        topic_list = processor.extract_topics_from_text(topics)
        numbered = [f"{number}. {topic}" for number, topic in enumerate(topic_list, start=1)]
        formatted_topics = "Identified Topics:\n\n" + "\n".join(numbered)

        keep_df = processor.emails_to_dataframe(classified['keep'], add_select_column=False)
        delete_df = processor.emails_to_dataframe(classified['delete'], add_select_column=True)
    except Exception as e:
        return f"❌ Error: {str(e)}", "", None, None
    else:
        return formatted_topics, counts_text, keep_df, delete_df


def refine_topics_with_chat(chat_query, llm_model, threshold):
    """Regenerate the topic list from a user query and re-classify the emails.

    Args:
        chat_query: Free-text query passed to the prompt builder as user context.
        llm_model: Name handed to ``create_llm`` when no LLM exists yet.
        threshold: Value forwarded to ``classify_emails``.

    Returns:
        A 4-tuple ``(topics_text, counts_text, keep_df, delete_df)``. On
        failure the first element is an "❌" message, the second is empty and
        both frames are None.
    """
    try:
        processor = state.email_processor
        db_manager = state.vector_db_manager

        if not processor.documents or not db_manager.vectorstore:
            return "❌ Please process emails first", "", None, None

        if not chat_query or not chat_query.strip():
            return "❌ Please enter a query", "", None, None

        # An existing LLM is reused; one is created only when none is set.
        if not processor.llm:
            processor.create_llm(llm_model)

        prompt = processor._generate_topics_prompt(
            processor.documents,
            user_context=chat_query
        )
        response = processor.llm.invoke([HumanMessage(content=prompt)])
        processor.topics = response.content

        # Re-classify against the freshly stored topics.
        classified = processor.classify_emails(
            processor.documents, db_manager.vectorstore, threshold
        )

        # Per-topic counts are computed after classification.
        counts_text = processor.get_topic_counts_display(
            processor.documents, db_manager.vectorstore
        )

        # Numbered list of the topics extracted from the stored topics text.
        topic_list = processor.extract_topics_from_text(processor.topics)
        numbered = [f"{number}. {topic}" for number, topic in enumerate(topic_list, start=1)]
        formatted_topics = "Identified Topics:\n\n" + "\n".join(numbered)

        keep_df = processor.emails_to_dataframe(classified['keep'], add_select_column=False)
        delete_df = processor.emails_to_dataframe(classified['delete'], add_select_column=True)
    except Exception as e:
        return f"❌ Error: {str(e)}", "", None, None
    else:
        return formatted_topics, counts_text, keep_df, delete_df


def select_all_emails(delete_df):
    """Return a copy of the delete-candidate table with every row ticked.

    A None or zero-row input is handed back unchanged; the input frame itself
    is never modified.
    """
    if delete_df is not None and len(delete_df) > 0:
        ticked = delete_df.copy()
        ticked['Select'] = True
        return ticked
    return delete_df


def deselect_all_emails(delete_df):
    """Return a copy of the delete-candidate table with every row unticked.

    A None or zero-row input is handed back unchanged; the input frame itself
    is never modified.
    """
    if delete_df is not None and len(delete_df) > 0:
        cleared = delete_df.copy()
        cleared['Select'] = False
        return cleared
    return delete_df


def create_archive_file():
    """Archive every email currently classified for deletion.

    Returns:
        ``(status_message, archive_path)``; the path is None when there is
        nothing to archive or when an error occurred.
    """
    try:
        processor = state.email_processor
        candidates = processor.classified_emails['delete']
        if not candidates:
            return "❌ No emails to archive", None

        archive_path = processor.create_archive(candidates)
    except Exception as e:
        return f"❌ Error: {str(e)}", None
    else:
        return f"✓ Archive created: {archive_path}", archive_path


def perform_deletion(confirmation_text, delete_df):
    """Delete the emails ticked in the delete-candidate table.

    Args:
        confirmation_text: Must equal "DELETE" (case-insensitive, surrounding
            whitespace ignored) for anything to be deleted.
        delete_df: DataFrame with a boolean ``Select`` column and a
            ``Message ID`` column identifying each candidate.

    Returns:
        A status string: deleted/failed/skipped counts on success, otherwise
        an "❌"-prefixed explanation of why nothing was deleted.
    """
    try:
        if confirmation_text.strip().upper() != "DELETE":
            return "❌ Confirmation failed. Type 'DELETE' to confirm."

        if delete_df is None or len(delete_df) == 0:
            return "❌ No emails available for deletion"

        # Get selected emails
        if 'Select' not in delete_df.columns:
            return "❌ Invalid dataframe format"

        # Elementwise comparison on purpose: only cells that equal True count
        # as selected, so non-boolean cell values are never treated as ticked.
        selected_rows = delete_df[delete_df['Select'] == True]  # noqa: E712
        if len(selected_rows) == 0:
            return "❌ No emails selected for deletion"

        # Get message IDs of selected emails
        selected_ids = set(selected_rows['Message ID'].tolist())

        # Filter documents to only selected ones
        candidates = state.email_processor.classified_emails['delete']
        selected_docs = [
            doc for doc in candidates
            if doc.metadata.get('message_id') in selected_ids
        ]

        # Same connectivity check as fetch_and_process: a connection object
        # whose connect() failed must not be used for deletion.
        if not state.gmail_conn or not state.gmail_conn.is_connected():
            return "❌ Not authenticated"

        # Rows were ticked but none map to a classified email (e.g. the table
        # is stale): report that instead of a misleading "Deleted: 0".
        if not selected_docs:
            return "❌ Selected emails do not match the current classification. Re-run the analysis."

        success, failed = state.gmail_conn.delete_emails(selected_docs)
        skipped = len(candidates) - len(selected_docs)

        return f"Deletion complete:\n  - Deleted: {success}\n  - Failed: {failed}\n  - Skipped: {skipped}"
    except Exception as e:
        return f"❌ Error: {str(e)}"

## Gradio Interface

In [None]:
# Build the three-tab UI. Widgets are created inside their tabs; the handlers
# that span tabs are wired at the end, once every widget exists.
with gr.Blocks(title="Gmail Inbox Terminator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🔥 Gmail Inbox Terminator")
    gr.Markdown("### Intelligent Email Management with AI")
    gr.Markdown("Identify important topics, then delete emails OUTSIDE those topics.")

    with gr.Tabs():
        # Tab 1: Connection
        with gr.Tab("🔌 Connection"):
            gr.Markdown("## Connect to Gmail via IMAP")

            # Show setup instructions only when no stored credentials were found.
            if default_email and default_password:
                gr.Markdown("""
**✅ Credentials loaded**

Use pre-filled credentials or enter different ones.
""")
            else:
                gr.Markdown("""
**Requirements:**
1. Enable 2-Factor Authentication on your Google account
2. Create an app-specific password at [Google Account Security](https://myaccount.google.com/security)
3. Use the app password below (not your regular password)
""")

            with gr.Row():
                imap_email = gr.Textbox(
                    label="Email Address",
                    placeholder="your.email@gmail.com",
                    value=default_email
                )
                imap_password = gr.Textbox(
                    label="App Password",
                    type="password",
                    placeholder="16-character app password",
                    value=default_password
                )

            imap_btn = gr.Button("Connect", variant="primary")
            imap_status = gr.Textbox(label="Connection Status", lines=3)

            gr.Markdown("---")
            gr.Markdown("## Process Emails")

            with gr.Row():
                testing_mode_check = gr.Checkbox(label="Testing Mode (50 emails only)", value=True)
                embedding_dropdown = gr.Dropdown(
                    choices=["openai", "bert"],
                    value="openai",
                    label="Embedding Model"
                )

            process_btn = gr.Button("📥 Fetch and Process Emails", variant="primary", size="lg")
            process_status = gr.Textbox(label="Processing Status", lines=10)

            imap_btn.click(connect_imap, inputs=[imap_email, imap_password], outputs=imap_status)
            process_btn.click(
                fetch_and_process,
                inputs=[testing_mode_check, embedding_dropdown],
                outputs=process_status
            )

        # Tab 2: Topic Analysis & Configuration
        with gr.Tab("🔍 Topic Analysis & Configuration"):
            gr.Markdown("## a) Configuration")

            with gr.Row():
                llm_dropdown = gr.Dropdown(
                    choices=["openai", "gemini"],
                    value="openai",
                    label="LLM Model"
                )

                classification_threshold = gr.Slider(
                    minimum=0.1,
                    maximum=0.9,
                    value=0.5,
                    step=0.1,
                    label="Relevance Threshold (higher = more strict, fewer kept)"
                )

            gr.Markdown("---")
            gr.Markdown("## b) Interest Analysis")
            gr.Markdown("Identify topics that are IMPORTANT to you. Emails matching these topics will be KEPT, others offered for deletion.")

            analyze_btn = gr.Button("🤖 Identify My Interests", variant="primary", size="lg")
            topics_output = gr.Textbox(label="Important Topics", lines=10)
            counts_output = gr.Textbox(label="Category Counts", lines=8)

            gr.Markdown("---")
            gr.Markdown("### Refine Topics with LLM Query")
            gr.Markdown("Ask the LLM to identify specific topics based on your interests. Results replace topics above.")

            with gr.Row():
                chat_query_input = gr.Textbox(
                    label="Query about your interests",
                    placeholder="e.g., 'What are my most important professional topics?'",
                    scale=3
                )
                chat_submit_btn = gr.Button("Submit Query", variant="secondary", scale=1)

            gr.Markdown("""
**Example queries:**
- "What are my most important professional topics?"
- "Identify topics related to family and personal life"
- "What work-related topics should I keep?"
""")

        # Tab 3: Email Management & Deletion
        with gr.Tab("📧 Email Management & Deletion"):
            gr.Markdown("## Classified Emails based on topic analysis")
            gr.Markdown("Emails matching your important topics are in 'Keep'. Others are deletion candidates.")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📌 Keep (Important)")
                    keep_df = gr.Dataframe(label="Emails to Keep", interactive=False)

                with gr.Column():
                    gr.Markdown("### 🗑️ Delete Candidates")

                    with gr.Row():
                        select_all_btn = gr.Button("✅ Select All", size="sm")
                        deselect_all_btn = gr.Button("❌ Deselect All", size="sm")

                    # Six columns: the Select checkbox plus the five columns
                    # produced by emails_to_dataframe.
                    delete_df = gr.Dataframe(
                        label="Select emails to delete",
                        interactive=True,
                        datatype=["bool", "str", "str", "str", "str", "number"],
                        col_count=(6, "fixed")
                    )

            select_all_btn.click(select_all_emails, inputs=delete_df, outputs=delete_df)
            deselect_all_btn.click(deselect_all_emails, inputs=delete_df, outputs=delete_df)

            gr.Markdown("---")
            gr.Markdown("## Archive & Delete")

            with gr.Row():
                archive_btn = gr.Button("📦 Create Archive", variant="secondary")
                delete_btn = gr.Button("🔥 DELETE SELECTED", variant="stop")

            with gr.Row():
                with gr.Column():
                    archive_status = gr.Textbox(label="Archive Status", lines=2)
                with gr.Column():
                    confirmation_input = gr.Textbox(label="Type DELETE to confirm", placeholder="DELETE")

            archive_file = gr.File(label="Download Archive")
            deletion_status = gr.Textbox(label="Deletion Result", lines=3)

    # Cross-tab wiring: these handlers read widgets from tab 2 and write to
    # widgets in tab 3, so they are attached after all tabs are built.
    analyze_btn.click(
        analyze_topics,
        inputs=[llm_dropdown, classification_threshold],
        outputs=[topics_output, counts_output, keep_df, delete_df]
    )

    chat_submit_btn.click(
        refine_topics_with_chat,
        inputs=[chat_query_input, llm_dropdown, classification_threshold],
        outputs=[topics_output, counts_output, keep_df, delete_df]
    )

    archive_btn.click(create_archive_file, outputs=[archive_status, archive_file])
    delete_btn.click(perform_deletion, inputs=[confirmation_input, delete_df], outputs=deletion_status)

## Launch App

In [None]:
# Start the Gradio app built above.
# NOTE(review): per the Gradio docs, share=True requests a publicly reachable
# link. This UI accepts a Gmail app password and can delete mail — confirm a
# public link is intended outside hosted notebooks.
app.launch(share=True, inbrowser=True)

## Unit Tests for Components

In [None]:

print("=" * 60)
print("UNIT TESTS - Testing Individual Components")
print("=" * 60)

# Test 1: Helper Functions
print("\n📝 Test 1: Helper Functions")
print("-" * 40)

def test_helper_functions():
    """Check that ``get_header_value`` looks up headers by name.

    Present headers must return their value; a missing header must return
    an empty string.
    """
    # Test get_header_value
    test_headers = [
        {'name': 'Subject', 'value': 'Test Email'},
        {'name': 'From', 'value': 'sender@example.com'},
        {'name': 'Date', 'value': '2025-10-21'}
    ]

    assert get_header_value(test_headers, 'Subject') == 'Test Email'
    assert get_header_value(test_headers, 'From') == 'sender@example.com'
    assert get_header_value(test_headers, 'Missing') == ''

    print("✓ get_header_value() works correctly")
    return True

try:
    test_helper_functions()
    print("\n✅ Helper functions test PASSED")
except Exception as e:
    # Catch Exception like the other test runners in this cell: a NameError or
    # TypeError from the helper must be reported as a failure here rather
    # than abort the remaining tests.
    print(f"\n❌ Helper functions test FAILED: {e}")

# Test 2: VectorDatabaseManager
print("\n\n💾 Test 2: VectorDatabaseManager")
print("-" * 40)

def test_vector_database_manager():
    """Exercise embedding creation, vector-store creation and retrieval.

    The ``test_vector_db`` directory is removed in a ``finally`` clause, so a
    failed assertion does not leave it behind for the next run.
    """
    db_dir = "test_vector_db"
    try:
        test_docs = [
            Document(
                page_content="This is a test email about Python programming and data science.",
                metadata={'subject': 'Test 1', 'sender': 'test@example.com'}
            ),
            Document(
                page_content="Another email discussing machine learning and AI topics.",
                metadata={'subject': 'Test 2', 'sender': 'ai@example.com'}
            ),
            Document(
                page_content="Meeting invitation for tomorrow's project review.",
                metadata={'subject': 'Test 3', 'sender': 'manager@example.com'}
            )
        ]

        test_mgr = VectorDatabaseManager(db_name=db_dir)
        test_mgr.create_embeddings("bert")
        assert test_mgr.embeddings is not None
        print("✓ Embeddings created successfully")

        vectorstore = test_mgr.create_vector_store(test_docs, recreate=True)
        assert vectorstore is not None
        assert test_mgr.vectorstore._collection.count() == len(test_docs)
        print(f"✓ Vector store created with {len(test_docs)} documents")

        retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
        results = retriever.invoke("Python programming")
        assert len(results) > 0
        print(f"✓ Retrieval works: found {len(results)} relevant documents")
    finally:
        # Runs on success and on failure alike.
        if os.path.exists(db_dir):
            shutil.rmtree(db_dir)

    return True

try:
    test_vector_database_manager()
    print("\n✅ VectorDatabaseManager test PASSED")
except Exception as e:
    print(f"\n❌ VectorDatabaseManager test FAILED: {e}")

# Test 3: EmailProcessor
print("\n\n📧 Test 3: EmailProcessor")
print("-" * 40)

def test_email_processor():
    """Check chunking, statistics and DataFrame conversion on three sample emails."""
    # (subject, sender, message id, date, body) for each sample email
    samples = [
        ('Project Update', 'boss@company.com', '001', '2025-10-20', 'Please review the quarterly report.'),
        ('Newsletter', 'marketing@spam.com', '002', '2025-10-19', 'Check out our latest deals!'),
        ('Team Meeting', 'colleague@company.com', '003', '2025-10-21', 'Meeting tomorrow at 10am.'),
    ]
    test_docs = []
    for subject, sender, message_id, date, body in samples:
        test_docs.append(
            Document(
                page_content=f"Subject: {subject}\nFrom: {sender}\nTo: me@company.com\nDate: {date}\n\n{body}",
                metadata={'subject': subject, 'sender': sender, 'message_id': message_id, 'date': date}
            )
        )

    processor = EmailProcessor()

    # Chunking must yield at least one chunk per document.
    chunks = processor.chunk_documents(test_docs, chunk_size=100, chunk_overlap=20)
    assert len(chunks) >= len(test_docs)
    print(f"✓ Chunking works: created {len(chunks)} chunks from {len(test_docs)} documents")

    # Three emails from three distinct senders.
    stats = processor.get_statistics(test_docs)
    assert stats['total_emails'] == 3
    assert stats['unique_senders'] == 3
    print(f"✓ Statistics calculation works: {stats['total_emails']} emails, {stats['unique_senders']} unique senders")

    # One row per email, with the expected columns present.
    df = processor.emails_to_dataframe(test_docs, add_select_column=True)
    assert len(df) == 3
    for column in ('Topics', 'Subject', 'Sender', 'Select'):
        assert column in df.columns
    print(f"✓ DataFrame conversion works: {len(df)} rows, {len(df.columns)} columns")

    return True

try:
    test_email_processor()
    print("\n✅ EmailProcessor test PASSED")
except Exception as e:
    print(f"\n❌ EmailProcessor test FAILED: {e}")

# Test 4: Mock IMAP Connection
print("\n\n🔌 Test 4: Mock IMAP Connection")
print("-" * 40)

def test_mock_connection():
    """Drive the GmailConnection interface through an in-memory subclass."""

    class MockIMAPConnection(GmailConnection):
        """GmailConnection subclass that fabricates its data instead of using IMAP."""

        def connect(self) -> bool:
            self.auth_info = {
                'email': 'test@example.com',
                'total_messages': 100,
                'auth_method': 'Mock'
            }
            self.connection = "mock_connection"
            return True

        def fetch_emails(self, max_emails: Optional[int] = None) -> Tuple[List[Document], str]:
            # Falsy max_emails falls back to 10; never more than 5 are produced.
            requested = max_emails or 10
            docs = []
            for i in range(min(requested, 5)):
                docs.append(
                    Document(
                        page_content=f"Mock email {i}",
                        metadata={
                            'message_id': f'mock_{i}',
                            'subject': f'Test Subject {i}',
                            'sender': f'sender{i}@example.com',
                            'date': '2025-10-21'
                        }
                    )
                )
            return docs, f"✓ Fetched {len(docs)} mock emails"

        def delete_emails(self, documents: List[Document]) -> Tuple[int, int]:
            # Every document counts as deleted, none as failed.
            return len(documents), 0

    mock_conn = MockIMAPConnection()

    assert mock_conn.connect()
    print("✓ Mock connection established")

    assert mock_conn.is_connected()
    print("✓ Connection status check works")

    info = mock_conn.get_auth_info()
    assert info['email'] == 'test@example.com'
    print(f"✓ Auth info retrieved: {info['email']}")

    emails, diagnostics = mock_conn.fetch_emails(max_emails=3)
    assert len(emails) == 3
    print(f"✓ Fetched {len(emails)} mock emails")
    print(f"  Diagnostics: {diagnostics}")

    success, failed = mock_conn.delete_emails(emails)
    assert (success, failed) == (3, 0)
    print(f"✓ Mock deletion: {success} successful, {failed} failed")

    return True

try:
    test_mock_connection()
    print("\n✅ Mock connection test PASSED")
except Exception as e:
    print(f"\n❌ Mock connection test FAILED: {e}")

print("\n" + "=" * 60)
print("✅ ALL UNIT TESTS COMPLETED")
print("=" * 60)


## Integration Test (with Mock Data)

In [None]:
print("\n\n" + "=" * 60)
print("INTEGRATION TEST - Full Workflow with Mock Data")
print("=" * 60)

def run_integration_test():
    """Run the whole workflow end to end against mock data.

    Steps: connect, fetch, chunk, build a vector store, set mock topics,
    classify by sender domain, archive, simulate deletion and build the
    report DataFrames.

    The ``test_integration_db`` directory and the archive file are removed in
    a ``finally`` clause, so a failing step does not leave them behind.

    Returns:
        True when every step completed; any exception propagates to the caller.
    """
    db_dir = "test_integration_db"
    archive_path = None  # set in step 7 once an archive has been written

    try:
        print("\n🚀 Starting integration test...")

        # Step 1: Create mock connection
        print("\n1️⃣ Creating mock Gmail connection...")

        class TestGmailConnection(GmailConnection):
            def connect(self):
                self.connection = True
                self.auth_info = {'email': 'test@example.com', 'total_messages': 20, 'auth_method': 'Test'}
                return True

            def fetch_emails(self, max_emails=None):
                # Generate realistic mock emails
                topics = [
                    ("Work Project", "manager@company.com", "Need your input on Q4 planning and budget allocation."),
                    ("Team Meeting", "colleague@company.com", "Weekly sync tomorrow at 10am to discuss progress."),
                    ("Newsletter", "marketing@newsletter.com", "Top 10 deals this week! Don't miss out!"),
                    ("Spam Offer", "deals@promo.com", "You've won a million dollars! Click here now!"),
                    ("Client Update", "client@business.com", "Regarding the proposal you sent last week."),
                    ("Training Course", "learning@company.com", "New Python course available for employees."),
                    ("Marketing Email", "ads@shopping.com", "Summer sale - 50% off everything!"),
                    ("Boss Email", "ceo@company.com", "Great job on the presentation yesterday!"),
                    ("Junk", "random@spam.com", "Make money fast with this one weird trick!"),
                    ("Important Notice", "hr@company.com", "Annual review meeting scheduled for next month.")
                ]

                limit = min(max_emails if max_emails else 10, len(topics))

                docs = [
                    Document(
                        page_content=f"Subject: {subj}\nFrom: {sender}\nTo: test@example.com\nDate: 2025-10-{20-i}\n\n{body}",
                        metadata={
                            'message_id': f'test_{i}',
                            'subject': subj,
                            'sender': sender,
                            'recipient': 'test@example.com',
                            'date': f'2025-10-{20-i}',
                            'source': 'test'
                        }
                    )
                    for i, (subj, sender, body) in enumerate(topics[:limit])
                ]
                return docs, f"✓ Fetched {len(docs)} test emails"

            def delete_emails(self, documents):
                return len(documents), 0

        test_conn = TestGmailConnection()
        test_conn.connect()
        print(f"   ✓ Connected as: {test_conn.get_auth_info()['email']}")

        # Step 2: Fetch emails
        print("\n2️⃣ Fetching mock emails...")
        emails, diagnostics = test_conn.fetch_emails(max_emails=10)
        print(f"   ✓ Fetched {len(emails)} emails")
        print(f"   {diagnostics}")

        # Step 3: Process emails
        print("\n3️⃣ Processing emails...")
        processor = EmailProcessor()
        chunks = processor.chunk_documents(emails)
        print(f"   ✓ Created {len(chunks)} chunks")

        stats = processor.get_statistics(emails)
        print(f"   ✓ Statistics: {stats['total_emails']} emails, {stats['unique_senders']} senders")

        # Step 4: Create vector store
        print("\n4️⃣ Creating vector store...")
        vector_mgr = VectorDatabaseManager(db_name=db_dir)
        vector_mgr.create_embeddings("bert")  # Use BERT to avoid API costs
        vector_mgr.create_vector_store(chunks, recreate=True)
        print(f"   ✓ Vector store created with {vector_mgr.vectorstore._collection.count()} documents")

        # Step 5: Analyze topics (simulated - would normally use LLM)
        print("\n5️⃣ Analyzing topics...")
        processor.topics = """
Based on the email analysis:
1. Work Projects - Manager communications about planning and budgets
2. Team Collaboration - Meeting invites and team sync-ups
3. Client Relations - Important client communications
4. Professional Development - Training and learning opportunities
5. Company Announcements - HR and leadership communications
"""
        print("   Topics identified (mock analysis)")

        # Step 6: Classify emails
        print("\n6️⃣ Classifying emails...")
        # Simulate classification based on sender domains
        work_domains = ['company.com', 'business.com']
        spam_domains = ['newsletter.com', 'promo.com', 'spam.com', 'shopping.com']

        keep_emails = [email for email in emails if any(domain in email.metadata.get('sender', '') for domain in work_domains)]
        delete_emails = [email for email in emails if any(domain in email.metadata.get('sender', '') for domain in spam_domains)]

        processor.classified_emails = {'keep': keep_emails, 'delete': delete_emails}
        print(f"   ✓ Classification complete:")
        print(f"      - Keep: {len(keep_emails)} emails")
        print(f"      - Delete: {len(delete_emails)} emails")

        # Step 7: Create archive
        print("\n7️⃣ Creating archive...")
        if delete_emails:
            archive_path = processor.create_archive(delete_emails)
            print(f"   ✓ Archive created: {archive_path}")
            archive_exists = os.path.exists(archive_path)
            print(f"   ✓ Archive file exists: {archive_exists}")

        # Step 8: Simulate deletion
        print("\n8️⃣ Simulating deletion...")
        success, failed = test_conn.delete_emails(delete_emails)
        print(f"   ✓ Deletion complete: {success} successful, {failed} failed")

        # Step 9: Display results as DataFrame
        print("\n9️⃣ Generating reports...")
        keep_df = processor.emails_to_dataframe(keep_emails)
        delete_df = processor.emails_to_dataframe(delete_emails)
        print(f"   ✓ Keep DataFrame: {len(keep_df)} rows")
        print(f"   ✓ Delete DataFrame: {len(delete_df)} rows")
    finally:
        # Cleanup runs whether or not the steps above succeeded.
        print("\n🧹 Cleaning up test files...")
        if os.path.exists(db_dir):
            shutil.rmtree(db_dir)
        if archive_path and os.path.exists(archive_path):
            os.remove(archive_path)
        print("   ✓ Cleanup complete")

    print("\n" + "=" * 60)
    print("✅ INTEGRATION TEST COMPLETED SUCCESSFULLY!")
    print("=" * 60)
    print("\n📊 Summary:")
    print(f"   • Total emails processed: {len(emails)}")
    print(f"   • Emails to keep: {len(keep_emails)}")
    print(f"   • Emails to delete: {len(delete_emails)}")
    print(f"   • Archive created: ✓")
    print(f"   • Deletion simulated: ✓")
    print("\n💡 The refactored architecture makes testing easy!")

    return True

try:
    run_integration_test()
except Exception as e:
    print(f"\n❌ INTEGRATION TEST FAILED: {e}")
    import traceback
    traceback.print_exc()

## Performance Test

In [None]:

print("\n\n" + "=" * 60)
print("PERFORMANCE TEST - Component Benchmarks")
print("=" * 60)

import time

def benchmark_component(name, func, *args, **kwargs):
    """Time a single call of ``func`` and print the elapsed seconds.

    Args:
        name: Label printed next to the timing.
        func: Callable to measure.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        ``(result, elapsed)``: the return value of ``func`` and the elapsed
        time in seconds as a float.
    """
    # perf_counter() is the clock meant for measuring short durations;
    # time.time() follows the system clock, which may be adjusted mid-run.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    print(f"   {name}: {elapsed:.3f}s")
    return result, elapsed

def run_performance_tests():
    """Benchmark the EmailProcessor and VectorDatabaseManager on 100 synthetic emails.

    Each step is timed with ``benchmark_component``. The summary reports the
    total and the fastest and slowest steps as actually measured. The
    ``test_perf_db`` directory is removed in a ``finally`` clause, so a
    failing step does not leave it behind.
    """
    db_dir = "test_perf_db"
    timings = {}  # step label -> elapsed seconds, in execution order

    try:
        # Generate test data
        print("\n📊 Generating test data...")
        test_emails = [
            Document(
                page_content=f"Subject: Test {i}\nFrom: sender{i % 10}@example.com\n\n" + " ".join(["word"] * 100),
                metadata={
                    'message_id': f'perf_{i}',
                    'subject': f'Test {i}',
                    'sender': f'sender{i % 10}@example.com',
                    'date': f'2025-10-{(i % 30) + 1:02d}'
                }
            )
            for i in range(100)
        ]
        print(f"   ✓ Created {len(test_emails)} test emails")

        # Benchmark EmailProcessor
        print("\n⏱️  Benchmarking EmailProcessor...")
        processor = EmailProcessor()

        chunks, timings["Chunking"] = benchmark_component("Chunking", processor.chunk_documents, test_emails)
        _, timings["Statistics"] = benchmark_component("Statistics", processor.get_statistics, test_emails)
        _, timings["DataFrame conversion"] = benchmark_component("DataFrame conversion", processor.emails_to_dataframe, test_emails)

        # Benchmark VectorDatabaseManager
        print("\n⏱️  Benchmarking VectorDatabaseManager...")
        vector_mgr = VectorDatabaseManager(db_name=db_dir)

        _, timings["Embedding creation"] = benchmark_component("Embedding creation", vector_mgr.create_embeddings, "bert")
        _, timings["Vector store creation"] = benchmark_component("Vector store creation", vector_mgr.create_vector_store, chunks[:50])  # Limit for speed
    finally:
        # Cleanup runs whether or not the benchmarks succeeded.
        if os.path.exists(db_dir):
            shutil.rmtree(db_dir)

    # Report the measured extremes instead of assuming which step is which.
    fastest = min(timings, key=timings.get)
    slowest = max(timings, key=timings.get)

    print("\n" + "=" * 60)
    print("✅ PERFORMANCE TEST COMPLETED")
    print("=" * 60)
    print(f"\n📈 Total time: {sum(timings.values()):.3f}s")
    print(f"   Fastest operation: {fastest} ({timings[fastest]:.3f}s)")
    print(f"   Slowest operation: {slowest} ({timings[slowest]:.3f}s)")

try:
    run_performance_tests()
except Exception as e:
    print(f"\n❌ PERFORMANCE TEST FAILED: {e}")

