# Personal Knowledge Assistant

## Week 5 exercise


### Features:
1. Chat powered by uploaded knowledge

    The system prompt is designed to make the chatbot simulate a person based on the provided documents.

2. Load files from local system

    Reuse code from bluebells1 [Wk5-final-multi-doc-type-KB.ipynb](../Wk5-final-multi-doc-type-KB.ipynb). Really appreciate it!

    Choose a folder located in the same directory as this script to extract content from. You can also specify subfolders to exclude from the extraction.

3. Load emails from Gmail

    Enter an alias first, and a Google popup will guide you to grant permissions and log in, then extract emails for your specified time range

4. Load emails from Outlook

    First, enter an alias. After clicking the 'Get Verification Code' button, a URI and code will appear in the 'Verification Instructions' textbox. Visit the Outlook website using the code, and follow the guide to grant permissions and complete the login.
    Then, extract emails for your specified time range
    
5. Load files from Google Workspace

    Enter an alias first, and a Google popup will guide you to grant permissions and log in; then extract files from your specified folder in your Google Drive


### TO-DO Features:
1. Load messages from Slack
2. Use local inference/embedding models (llama) instead of relying on OpenAI-hosted models   
3. Optimize Gmail/Outlook/Google Workspace login logic
4. Label different files. For example, extract private and work emails separately and store them in different vector stores
5. Add vector visualization

### Requirements:
1. Store gmail credential json file under the 'credentials' folder

   The setup and configuration steps for Gmail API are in this guide: [Gmail_API_Credential_Guide](./Gmail_API_Credential_Guide.ipynb)

2. Set AZURE_CLIENT_ID in .env file

   The setup and configuration steps for Outlook API are in this guide: [Outlook_API_Credential_Guide](./Outlook_API_Credential_Guide.ipynb)


3. Store google workspace credential json file under the 'credentials' folder

   The setup and configuration steps for the Google Workspace API are in this guide: [Google_Workspace_API_Credential_Guide](./Google_Workspace_API_Credential_Guide.ipynb)

The directories should be structured before launch as follows:

   ```text
    The project/
    │
    ├── credentials/                 <-- Need to create and store manually before launch; download from Google Cloud Platform (GCP)
    │   ├── gmail_credentials.json
    │   └── google_workspace_credentials.json
    ├── tokens/                      <-- Automatically created and saved
    │   ├── gmail_tokens             
    │   │   └── gmail_token_{alias}.json
    │   ├── google_workspace_tokens
    │   └── outlook_tokens
    ├── vector_index/                <-- Need to create manually before launch
    │   ├── local_vector_index
    │   ├── google_workspace_vector_index
    │   ├── gmail_vector_index
    │   └── outlook_vector_index
    └── ***.ipynb                    <-- Script
   ```

Feel free to contact me via zhufqiu@gmail.com or via [LinkedIn](https://www.linkedin.com/in/zhufeng-zephyr-qiu/) if you have any questions about this project. If you have better ideas about the system prompt, chunk config, or search_kwargs, I will be happy to discuss them with you!

In [None]:
# !pip install pymupdf
# !pip install openpyxl
# !pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
import os
import base64
from datetime import datetime
from email import message_from_bytes
from email.utils import parsedate_to_datetime

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import ConversationChain
from langchain.retrievers import MergerRetriever
from collections import defaultdict
from langchain.document_loaders import (
    DirectoryLoader, TextLoader, 
    Docx2txtLoader,
    TextLoader,
    PyPDFLoader,
    UnstructuredExcelLoader,
    BSHTMLLoader
)
import glob
from dotenv import load_dotenv
import gradio as gr
import tiktoken

from msal import PublicClientApplication
import requests
from datetime import datetime, timezone
import json
import shutil

from PIL import Image
import pytesseract
import fitz
import ebooklib
from ebooklib import epub
import io

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
from langchain.prompts import PromptTemplate

In [None]:
# Model name; presumably the chat model used later in the notebook (usage is outside this section).
MODEL = "gpt-4o-mini"
# Load variables from a .env file; override=True lets .env values replace ones already set in the environment.
load_dotenv(override=True)
# Export the OpenAI key for downstream clients; a placeholder string is used when the variable is unset.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

### If this is the first time the vector index directory and its sub-folders are being created, create them, then close this notebook and re-open it

In [None]:
# Persist directories of the per-source vector stores.
LOCAL_VECTOR_DIR = 'vector_index/local_vector_index'
GMAIL_VECTOR_DIR = 'vector_index/gmail_vector_index'
OUTLOOK_VECTOR_DIR = "vector_index/outlook_vector_index"
GOOGLE_WORKSPACE_VECTOR_DIR = 'vector_index/google_workspace_vector_index'
SLACK_VECTOR_DIR = 'vector_index/slack_vector_index'

# Create the directories up front; the Slack directory is intentionally
# left out here, exactly as before.
for _vector_dir in (
    LOCAL_VECTOR_DIR,
    GMAIL_VECTOR_DIR,
    OUTLOOK_VECTOR_DIR,
    GOOGLE_WORKSPACE_VECTOR_DIR,
):
    os.makedirs(_vector_dir, exist_ok=True)

#### Utility functions

In [None]:
def get_num_tokens(text, model="text-embedding-3-large"):
    """Return how many tokens *text* encodes to under the tiktoken encoding for *model*."""
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(text)
    return len(token_ids)

def batch_chunks(chunks, max_tokens=250000, model="text-embedding-3-large"):
    """Group documents into consecutive batches bounded by a token budget.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        max_tokens: Token budget per batch.
        model: Model name passed to ``get_num_tokens`` to pick the tokenizer.

    Returns:
        A list of non-empty lists that preserves the input order. A single
        document larger than ``max_tokens`` is placed in a batch of its own,
        since it cannot be split here.
    """
    batches = []
    current_batch = []
    current_tokens = 0

    for doc in chunks:
        doc_tokens = get_num_tokens(doc.page_content, model)
        # Flush only when something has been collected; otherwise an
        # oversized first document would emit an empty batch.
        if current_batch and current_tokens + doc_tokens > max_tokens:
            batches.append(current_batch)
            current_batch = []
            current_tokens = 0
        current_batch.append(doc)
        current_tokens += doc_tokens

    if current_batch:
        batches.append(current_batch)

    return batches

### 1. Local

Reuse code from bluebells1 [Wk5-final-multi-doc-type-KB.ipynb](../Wk5-final-multi-doc-type-KB.ipynb). Really appreciate it!

Advanced features:
1. ImgLoader added to load image file (png, jpg, jpeg)
2. Add logic to use DocumentLoader, extract files and show summary in Gradio textbox

In [None]:
from ebooklib import epub
from bs4 import BeautifulSoup
from langchain.document_loaders.base import BaseLoader

class EpubLoader(BaseLoader):
    """Loader that flattens an EPUB file into one plain-text Document."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Read the EPUB and return a single Document holding its text.

        Every ITEM_DOCUMENT entry is parsed as HTML and reduced to its text;
        non-empty sections are separated by a blank line. The file path is
        stored under the ``source`` metadata key.
        """
        book = epub.read_epub(self.file_path)
        sections = []
        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            html = item.get_content().decode("utf-8")
            section_text = BeautifulSoup(html, 'html.parser').get_text().strip()
            if section_text:
                sections.append(section_text)

        return [Document(page_content='\n\n'.join(sections), metadata={"source": self.file_path})]

In [None]:
from pptx import Presentation

class PptxLoader(BaseLoader):
    """Loader that gathers the text of every shape in a PowerPoint file."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Return one Document with each non-empty shape text on its own line."""
        presentation = Presentation(self.file_path)
        lines = [
            shape.text + '\n'
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text
        ]
        return [Document(page_content=''.join(lines), metadata={"source": self.file_path})]

In [None]:
from PIL import Image
import pytesseract

class ImgLoader(BaseLoader):
    """Loader that extracts text from an image file via Tesseract OCR."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Run OCR on the image and return a single Document.

        OCR is best-effort: on any failure the error is printed and a
        Document with empty text is returned, so one unreadable image does
        not abort a whole folder load.
        """
        text = ''
        try:
            # The context manager closes the file handle once OCR is done.
            with Image.open(self.file_path) as image:
                text = pytesseract.image_to_string(image)
        except Exception as e:
            # The original referenced an undefined `path` here, which turned
            # every OCR failure into a NameError.
            print(f"OCR failed for {self.file_path}: {e}")
        return [Document(page_content=text, metadata={"source": self.file_path})]

In [None]:
# Class-based document loader that can be extended easily to other document types. (Currently handles: docx, txt and md (read as UTF-8), pdf, xlsx, html, epub, pptx, and png/jpeg/jpg images via OCR.)

class DocumentLoader:
    """Load documents of many file types from the folders matched by a glob pattern.

    ``base_path`` is a glob pattern; every directory it matches (minus those
    whose base name is listed in ``exclude_folders``) is scanned once per
    configured file type. Progress and error messages are accumulated in
    ``print_info`` instead of being printed.
    """

    def __init__(self, base_path, exclude_folders=None):
        self.base_path = base_path
        self.documents = []
        self.exclude_folders = exclude_folders or []
        self.print_info = ""

        # (extension, loader class, loader kwargs, post-processing hook)
        supported_types = [
            ('docx', Docx2txtLoader, {}, None),
            ('txt', TextLoader, {"encoding": 'utf-8'}, None),
            ('md', TextLoader, {"encoding": 'utf-8'}, None),
            ('pdf', PyPDFLoader, {}, None),
            ('xlsx', UnstructuredExcelLoader, {}, None),
            ('html', BSHTMLLoader, {}, None),
            ('epub', EpubLoader, {}, self._process_epub_metadata),
            ('pptx', PptxLoader, {}, None),
            ('png', ImgLoader, {}, None),
            ('jpeg', ImgLoader, {}, None),
            ('jpg', ImgLoader, {}, None),
        ]

        # Configuration for different file types, keyed by extension
        self.loader_config = {}
        for extension, loader_cls, loader_kwargs, post_process in supported_types:
            self.add_file_type(
                extension,
                loader_cls,
                loader_kwargs=loader_kwargs,
                post_process=post_process,
            )

    def _get_epub_metadata(self, file_path):
        """Return ``(title, author)`` read from an EPUB's Dublin Core metadata.

        Either value is None when the field is absent; on any error the
        message is appended to ``print_info`` and ``(None, None)`` is returned.
        """
        try:
            book = epub.read_epub(file_path)
            titles = book.get_metadata('DC', 'title')
            creators = book.get_metadata('DC', 'creator')
            title = titles[0][0] if titles else None
            author = creators[0][0] if creators else None
        except Exception as e:
            self.print_info += f"Error extracting EPUB metadata: {e}\n"
            return None, None
        return title, author

    def _process_epub_metadata(self, doc) -> None:
        """Attach ``author`` and ``title`` metadata to an EPUB document in place."""
        title, author = self._get_epub_metadata(doc.metadata['source'])
        doc.metadata["author"] = author
        doc.metadata["title"] = title

    def _load_file_type(self, folder, file_type, config):
        """Load every file of one type from *folder*; return [] on any error."""
        try:
            docs = DirectoryLoader(
                folder,
                glob=config['glob_pattern'],
                loader_cls=config['loader_cls'],
                loader_kwargs=config['loader_kwargs'],
            ).load()
            self.print_info += f"Found {len(docs)} .{file_type} files\n"

            # Run the optional per-document hook
            hook = config['post_process']
            if hook:
                for doc in docs:
                    hook(doc)
        except Exception as e:
            self.print_info += f"Error loading .{file_type} files: {e}\n"
            return []
        return docs

    def load_all(self):
        """Scan all non-excluded folders and return the loaded documents.

        Each document gets a ``doc_type`` metadata entry set to the base name
        of the folder it was loaded from.
        """
        candidates = [path for path in glob.glob(self.base_path) if os.path.isdir(path)]

        # Drop folders whose base name is on the exclusion list
        folders = []
        for path in candidates:
            name = os.path.basename(path)
            if name in self.exclude_folders:
                self.print_info += f"Excluded folder: {name}\n"
                continue
            folders.append(path)

        self.print_info += f"Scanning folders (directories only):{folders}\n"

        self.documents = []

        for folder in folders:
            doc_type = os.path.basename(folder)
            self.print_info += f"\nProcessing folder: {doc_type}\n"

            for file_type, config in self.loader_config.items():
                for doc in self._load_file_type(folder, file_type, config):
                    doc.metadata["doc_type"] = doc_type
                    self.documents.append(doc)

        self.print_info += f"\nTotal documents loaded: {len(self.documents)}\n"
        return self.documents

    def add_file_type(self, extension, loader_cls, glob_pattern=None,
                      loader_kwargs=None, post_process=None):
        """Register (or replace) the loader configuration for *extension*.

        ``glob_pattern`` defaults to ``**/*.<extension>`` and ``loader_kwargs``
        to an empty dict.
        """
        entry = {
            'loader_cls': loader_cls,
            'glob_pattern': glob_pattern or f"**/*.{extension}",
            'loader_kwargs': loader_kwargs or {},
            'post_process': post_process,
        }
        self.loader_config[extension] = entry

# load
# loader = DocumentLoader("local-knowledge-base/**", exclude_folders=["Music", "Online Courses", "Fitness"])
# documents = loader.load_all()

In [None]:
def local_embed_and_store(docs):
    """Split *docs* into chunks, embed them and add them to the local Chroma store.

    Args:
        docs: Documents to split with a 1000-character / 200-overlap splitter.

    Returns:
        A human-readable status string: either a warning that nothing was
        embedded, or the number of chunks added plus the store's total count.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]

    if not chunks:
        return "⚠️ No non-empty chunks to embed. Skipping vectorstore update."

    embeddings = OpenAIEmbeddings()

    total = 0
    if os.path.exists(LOCAL_VECTOR_DIR):
        vectorstore = Chroma(persist_directory=LOCAL_VECTOR_DIR, embedding_function=embeddings)
    else:
        # Seed a brand-new store with the first chunk and count it right away.
        # The original checked for the directory only after creating it, so
        # the seed chunk was never counted.
        vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=LOCAL_VECTOR_DIR)
        chunks = chunks[1:]
        total = 1

    for batch in batch_chunks(chunks):
        if not batch:
            # Nothing to embed in an empty batch.
            continue
        vectorstore.add_documents(batch)
        total += len(batch)

    info = f"Vectorstore updated with {total} new chunks.\n"
    num_docs = vectorstore._collection.count()
    info += f"Vectorstore contains {num_docs} chunks.\n"
    return info

In [None]:
def extract_local_folder(folder_path="local-knowledge-base", exclude=""):
    """Load every supported file under *folder_path* and embed it into the local store.

    Args:
        folder_path: Folder whose immediate entries are scanned (the pattern
            ``<folder_path>/**`` is handed to ``DocumentLoader``).
        exclude: Comma-separated folder names to skip.

    Returns:
        A status report combining the loader's log and the embedding summary.
        Exceptions are not caught here and propagate to the caller.
    """
    report = f"Process files under: {folder_path}\n"
    excluded_names = [name.strip() for name in exclude.split(',')]
    loader = DocumentLoader(os.path.join(folder_path, "**"), exclude_folders=excluded_names)
    docs = loader.load_all()
    report += loader.print_info

    if not docs:
        return report + "No valid files found in the given range."

    report += f"Fetched {len(docs)} files.\n"
    return report + local_embed_and_store(docs)

### 2. Gmail

#### Store gmail credential json file under the credentials folder

To avoid complicated steps and focus on LLMs stuff, I chose to utilize the Gmail API in test mode.

I have included the setup and configuration steps in this guide:
[Gmail_API_Credential_Guide](./Gmail_API_Credential_Guide.ipynb)

In [None]:
# OAuth scope requested for Gmail: read-only mailbox access.
GMAIL_SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
# OAuth client-secrets file used to start the interactive login flow.
GMAIL_CREDENTIALS_FILE = 'credentials/gmail_credentials.json'
# Directory holding one cached token file per account alias.
GMAIL_TOKEN_DIR = 'tokens/gmail_tokens'

In [None]:
def gmail_get_credentials(account_alias):
    """Return valid Gmail OAuth credentials for *account_alias*.

    A cached token file ``gmail_token_<alias>.json`` under GMAIL_TOKEN_DIR is
    reused when present. Expired credentials with a refresh token are
    refreshed; otherwise the interactive local-server OAuth flow is run.
    Fresh or refreshed credentials are written back to the token file.

    Args:
        account_alias: Label used to name the cached token file.

    Returns:
        The credentials object to pass to ``googleapiclient.discovery.build``.
    """
    token_path = os.path.join(GMAIL_TOKEN_DIR, f'gmail_token_{account_alias}.json')
    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, GMAIL_SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(GMAIL_CREDENTIALS_FILE, GMAIL_SCOPES)
            creds = flow.run_local_server(port=0)
        # The token directory may not exist on first login; without this the
        # write below fails with FileNotFoundError after the user has already
        # completed the OAuth flow.
        os.makedirs(GMAIL_TOKEN_DIR, exist_ok=True)
        with open(token_path, 'w') as token_file:
            token_file.write(creds.to_json())
    return creds

In [None]:
def parse_message(service, msg_id):
    """Fetch one Gmail message in raw form and extract its headers and plain-text body.

    Args:
        service: Gmail API service object.
        msg_id: Id of the message to fetch.

    Returns:
        A dict with keys ``id``, ``subject``, ``date`` and ``body``. ``date``
        is a datetime, or None when the Date header is missing or cannot be
        parsed. ``body`` is a text block made of the Subject/From/To/Cc
        headers followed by the first non-attachment ``text/plain`` part
        (empty if the message has none).
    """
    msg = service.users().messages().get(userId='me', id=msg_id, format='raw').execute()
    raw_msg = base64.urlsafe_b64decode(msg['raw'].encode('ASCII'))
    email_message = message_from_bytes(raw_msg)
    subject = email_message['Subject'] or "(No Subject)"
    try:
        date = parsedate_to_datetime(email_message['Date'])
    except (TypeError, ValueError):
        # A missing or malformed Date header must not abort the extraction.
        date = None
    sender = email_message['From'] or ""
    to = email_message['To'] or ""
    cc = email_message['Cc'] or ""
    body = ""

    for part in email_message.walk():
        if part.get_content_type() != 'text/plain' or part.get('Content-Disposition'):
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        # Honour the charset declared by the part; fall back to UTF-8 when it
        # is absent or unknown to Python.
        charset = part.get_content_charset() or 'utf-8'
        try:
            body = payload.decode(charset, errors='ignore')
        except LookupError:
            body = payload.decode('utf-8', errors='ignore')
        break

    content = f"""Subject: {subject}
        From: {sender}
        To: {to}
        Cc: {cc}
        {body}
        """
    return {
        "id": msg_id,
        "subject": subject,
        "date": date,
        "body": content
    }

In [None]:
def fetch_emails(service, start_date, end_date):
    """List and parse the Gmail messages matching the date window.

    The search keeps primary/important/starred/snoozed/sent/chat/scheduled
    mail and drops spam, trash, promotions and forums. All result pages are
    collected before each message is fetched through ``parse_message``.

    Args:
        service: Gmail API service object.
        start_date: Value inserted after ``after:`` in the Gmail query.
        end_date: Value inserted after ``before:`` in the Gmail query.

    Returns:
        List of parsed message dicts; falsy parse results are dropped.
    """
    query = (
        f"(category:primary OR is:important OR is:starred OR is:snoozed OR is:sent OR in:chats OR label:SCHEDULED) "
        f"after:{start_date} before:{end_date} -in:spam -in:trash -category:promotions -category:forums"
    )

    message_refs = []
    next_token = None
    while True:
        listing = service.users().messages().list(userId='me', q=query, pageToken=next_token).execute()
        page = listing.get('messages', [])
        print(f"Found {len(page)} sub-messages.")
        message_refs.extend(page)
        next_token = listing.get('nextPageToken')
        if not next_token:
            break

    print(f"Total messages fetched: {len(message_refs)}")

    parsed_candidates = (parse_message(service, ref['id']) for ref in message_refs)
    return [parsed for parsed in parsed_candidates if parsed]


In [None]:
def gmail_embed_and_store(emails, account):
    """Embed parsed Gmail messages and add them to the Gmail Chroma store.

    Args:
        emails: Iterable of dicts with ``subject``, ``body``, ``date`` and
            ``id`` keys.
        account: Account identifier stored in each chunk's metadata and
            echoed in the status text.

    Returns:
        A human-readable status string: either a warning that nothing was
        embedded, or the number of chunks added plus the store's total count.
    """
    docs = [
        Document(
            page_content=f"Subject: {email['subject']}\n\n{email['body']}".strip(),
            metadata={
                "date": str(email['date']),
                "gmail_id": email['id'],
                "account": account
            }
        )
        for email in emails
    ]

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]

    if not chunks:
        return "⚠️ No non-empty chunks to embed. Skipping vectorstore update."

    embeddings = OpenAIEmbeddings()

    total = 0
    if os.path.exists(GMAIL_VECTOR_DIR):
        vectorstore = Chroma(persist_directory=GMAIL_VECTOR_DIR, embedding_function=embeddings)
    else:
        # Seed a brand-new store with the first chunk and count it right away.
        # The original checked for the directory only after creating it, so
        # the seed chunk was never counted.
        vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=GMAIL_VECTOR_DIR)
        chunks = chunks[1:]
        total = 1

    for batch in batch_chunks(chunks):
        if not batch:
            # Nothing to embed in an empty batch.
            continue
        vectorstore.add_documents(batch)
        total += len(batch)

    info = f"Vectorstore updated with {total} new chunks from {account}.\n"
    num_docs = vectorstore._collection.count()
    info += f"Vectorstore contains {num_docs} chunks.\n"
    return info

In [None]:
def login_gmail(alias):
    """Authenticate the Gmail account behind *alias* and cache the session.

    On success the Gmail service, the account's email address and the alias
    are stored in SESSION_STATE. Any exception is converted into a failure
    message rather than raised.

    Returns:
        A status string naming the logged-in address, or the failure reason.
    """
    try:
        service = build('gmail', 'v1', credentials=gmail_get_credentials(alias))
        email = service.users().getProfile(userId='me').execute().get("emailAddress")
    except Exception as e:
        return f"❌ Login failed: {str(e)}"

    try:
        # Store in session
        SESSION_STATE["gmail_service"] = service
        SESSION_STATE["gmail_email"] = email
        SESSION_STATE["gmail_alias"] = alias
    except Exception as e:
        return f"❌ Login failed: {str(e)}"

    return f"✅ Logged in as: {email}"

In [None]:
def extract_gmail(start_date, end_date):
    """Fetch Gmail messages for the logged-in account and embed them.

    Reads the Gmail service and address that ``login_gmail`` stored in
    SESSION_STATE. Exceptions raised while fetching or embedding are not
    caught here and propagate to the caller.

    Args:
        start_date: Lower bound forwarded to ``fetch_emails``.
        end_date: Upper bound forwarded to ``fetch_emails``.

    Returns:
        A status report string.
    """
    gmail_service = SESSION_STATE.get("gmail_service")
    account = SESSION_STATE.get("gmail_email")

    if not gmail_service:
        return "❌ Please login first."

    report = f"Connected to: {account}\n"
    emails = fetch_emails(gmail_service, start_date, end_date)
    if not emails:
        return report + "No emails found in the given range."

    report += f"Fetched {len(emails)} emails.\n"
    return report + gmail_embed_and_store(emails, account=account)

### 3. Outlook

#### Set AZURE_CLIENT_ID in .env file

I have included the setup and configuration steps in this guide:
[Outlook_API_Credential_Guide](./Outlook_API_Credential_Guide.ipynb)

In [None]:
# Load .env so AZURE_CLIENT_ID is available (values already in the environment are kept).
load_dotenv()

# Directory holding one cached token file per account alias.
OUTLOOK_TOKEN_DIR = "tokens/outlook_tokens"
# Application (client) id of the Azure app registration; None when the variable is not set.
OUTLOOK_CLIENT_ID = os.getenv("AZURE_CLIENT_ID")
# Authority URL passed to MSAL's PublicClientApplication.
OUTLOOK_AUTHORITY = "https://login.microsoftonline.com/common" 
# Microsoft Graph permissions requested during the device-code login.
OUTLOOK_SCOPES = ["Mail.Read", "User.Read"]

In [None]:
def _outlook_recipient_addresses(recipients):
    """Return the non-empty addresses from a Graph recipient list (None-safe)."""
    addresses = []
    for recipient in recipients or []:
        address = (recipient.get("emailAddress") or {}).get("address")
        if address:
            addresses.append(address)
    return addresses


def fetch_outlook_emails(access_token, start_date, end_date):
    """Download Outlook messages via Microsoft Graph and keep those in the date window.

    Messages are requested page by page (100 per page) with plain-text
    bodies; the date window is applied client-side to ``receivedDateTime``.
    A non-OK HTTP response stops pagination and whatever was collected so far
    is returned.

    Args:
        access_token: Bearer token for Microsoft Graph.
        start_date: Inclusive lower bound; must be comparable with the
            timezone-aware timestamps parsed from the API.
        end_date: Inclusive upper bound, same requirement.

    Returns:
        List of dicts with keys ``id``, ``subject``, ``body``, ``sender``,
        ``to``, ``cc`` and ``date`` (ISO-8601 string).

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the 30 second timeout.
    """
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Prefer": "outlook.body-content-type='text'"
    }

    query = (
        "https://graph.microsoft.com/v1.0/me/messages"
        "?$top=100"
        "&$select=id,subject,receivedDateTime,body,sender,toRecipients,ccRecipients"
    )

    all_emails = []

    while query:
        # A timeout keeps a stalled connection from hanging the UI forever.
        response = requests.get(query, headers=headers, timeout=30)
        if not response.ok:
            print(f"❌ HTTP {response.status_code}: {response.text}")
            break

        res = response.json()
        for msg in res.get("value", []):
            received = msg.get("receivedDateTime") or ""
            try:
                received_dt = datetime.fromisoformat(received.replace("Z", "+00:00"))
            except ValueError:
                # Skip messages without a parseable timestamp.
                continue

            if not (start_date <= received_dt <= end_date):
                continue

            # Fields may be absent or explicitly null (e.g. drafts), so
            # `.get(key, default)` alone is not enough.
            sender = ((msg.get("sender") or {}).get("emailAddress") or {}).get("address") or ""
            all_emails.append({
                "id": msg.get("id"),
                "subject": msg.get("subject") or "",
                "body": (msg.get("body") or {}).get("content") or "",
                "sender": sender,
                "to": _outlook_recipient_addresses(msg.get("toRecipients")),
                "cc": _outlook_recipient_addresses(msg.get("ccRecipients")),
                "date": received_dt.isoformat()
            })

        query = res.get("@odata.nextLink")

    print(f"✅ Total emails extracted: {len(all_emails)}")
    return all_emails

In [None]:
def outlook_embed_and_store(emails):
    """Embed fetched Outlook messages and add them to the Outlook Chroma store.

    Args:
        emails: List of dicts with ``subject``, ``sender``, ``to``, ``cc``,
            ``body``, ``date`` and ``id`` keys (``to``/``cc`` are lists of
            address strings).

    Returns:
        A human-readable status string: either a notice that nothing was
        embedded, or the number of chunks added plus the store's total count.
    """
    if not emails:
        return "No emails to embed.\n"

    docs = []
    for email in emails:
        content = (
            f"Subject: {email['subject']}\n"
            f"From: {email['sender']}\n"
            f"To: {', '.join(email['to'])}\n"
            f"CC: {', '.join(email['cc'])}\n\n"
            f"{email['body']}"
        )
        docs.append(Document(
            page_content=content,
            metadata={"date": email["date"], "outlook_id": email["id"]}
        ))

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]

    if not chunks:
        return "⚠️ No non-empty chunks to embed. Skipping vectorstore update."

    embeddings = OpenAIEmbeddings()

    total = 0
    if os.path.exists(OUTLOOK_VECTOR_DIR):
        vectorstore = Chroma(persist_directory=OUTLOOK_VECTOR_DIR, embedding_function=embeddings)
    else:
        # Seed a brand-new store with the first chunk and count it right away.
        # The original checked for the directory only after creating it, so
        # the seed chunk was never counted.
        vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=OUTLOOK_VECTOR_DIR)
        chunks = chunks[1:]
        total = 1

    for batch in batch_chunks(chunks):
        if not batch:
            # Nothing to embed in an empty batch.
            continue
        vectorstore.add_documents(batch)
        total += len(batch)

    info = f"✅ Vectorstore updated with {total} chunks.\n"
    num_docs = vectorstore._collection.count()
    info += f"Vectorstore contains {num_docs} chunks.\n"
    return info

In [None]:
def login_outlook(alias):
    """Log in to Outlook for *alias* with a blocking device-code flow.

    A cached token file ``outlook_token_<alias>.json`` is reused when it
    contains an access token; otherwise the device flow is started, the
    verification URL and code are printed, and the call blocks until the user
    completes the login. The token and the account's address are stored in
    SESSION_STATE.

    Returns:
        A status string naming the logged-in address, or the failure reason.
    """
    token_path = os.path.join(OUTLOOK_TOKEN_DIR, f"outlook_token_{alias}.json")
    SESSION_STATE["outlook_alias"] = alias
    access_token = None

    # Load existing token
    if os.path.exists(token_path):
        with open(token_path, "r") as f:
            result = json.load(f)
        access_token = result.get("access_token")

    # If no token, run device flow
    if not access_token:
        app = PublicClientApplication(OUTLOOK_CLIENT_ID, authority=OUTLOOK_AUTHORITY)
        flow = app.initiate_device_flow(scopes=OUTLOOK_SCOPES)

        if "user_code" not in flow:
            return "❌ Failed to initiate device login."

        print("🔗 Visit:", flow["verification_uri"])
        print("🔐 Enter code:", flow["user_code"])

        result = app.acquire_token_by_device_flow(flow)

        if "access_token" not in result:
            return f"❌ Login failed: {result.get('error_description', 'Unknown error')}"

        access_token = result["access_token"]

        # The token directory may not exist on first login.
        os.makedirs(OUTLOOK_TOKEN_DIR, exist_ok=True)
        with open(token_path, "w") as f:
            json.dump(result, f)

    # Get user's email via Microsoft Graph
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get("https://graph.microsoft.com/v1.0/me", headers=headers)
    if not response.ok:
        # A rejected token (e.g. an expired cached one) would otherwise be
        # reported as a successful login for "None".
        return (
            f"❌ Login failed: HTTP {response.status_code} from Microsoft Graph. "
            f"The cached token may have expired; delete {token_path} and log in again."
        )
    user_info = response.json()
    email = user_info.get("mail") or user_info.get("userPrincipalName")

    # Store in session
    SESSION_STATE["outlook_token"] = access_token
    SESSION_STATE["outlook_email"] = email

    return f"✅ Logged in to Outlook as: {email}"

In [None]:
def start_outlook_login(alias):
    """Begin the two-step Outlook login for *alias*.

    Records the alias and its token path in SESSION_STATE. When a token file
    already exists nothing else is done; otherwise a device-code flow is
    initiated and the MSAL app and flow are stored in SESSION_STATE for
    ``finish_outlook_login`` to complete.

    Returns:
        A ``(already_verified, message)`` tuple. ``already_verified`` is True
        only when a cached token file exists; the message carries the
        verification URL and code when a new flow was started.
    """
    token_path = os.path.join(OUTLOOK_TOKEN_DIR, f"outlook_token_{alias}.json")
    SESSION_STATE["outlook_token_path"] = token_path
    # Record the alias on every path so the session never keeps a stale one
    # when the token was already cached.
    SESSION_STATE["outlook_alias"] = alias

    # Existing token: no verification needed
    if os.path.exists(token_path):
        return True, "This alias already verified"

    # No token yet: start the device flow
    app = PublicClientApplication(OUTLOOK_CLIENT_ID, authority=OUTLOOK_AUTHORITY)
    flow = app.initiate_device_flow(scopes=OUTLOOK_SCOPES)

    if "user_code" not in flow:
        return False, "❌ Failed to initiate device login."

    # Store the flow for the next step
    SESSION_STATE["outlook_app"] = app
    SESSION_STATE["outlook_flow"] = flow

    msg = f"🔗 Visit: {flow['verification_uri']}\n🔐 Enter code: {flow['user_code']}"
    return False, "🔄 Waiting for verification...\n" + msg

def finish_outlook_login():
    """Complete the Outlook login started by ``start_outlook_login``.

    When SESSION_STATE["outlook_login_flag"] is truthy the cached token file
    is read (the flag is set outside this section; presumably it mirrors the
    "already verified" result of the first step). Otherwise the pending
    device flow is completed, blocking until the user has entered the code,
    and the result is saved to the token file. The token and the account's
    address are then stored in SESSION_STATE.

    Returns:
        A status string naming the logged-in address, or the failure reason.
    """
    flag = SESSION_STATE.get("outlook_login_flag")
    token_path = SESSION_STATE.get("outlook_token_path")
    if not token_path:
        return "❌ Login failed: no Outlook login in progress. Please request a verification code first."

    if flag:
        with open(token_path, "r") as f:
            result = json.load(f)
        access_token = result.get("access_token")
    else:
        app = SESSION_STATE.get("outlook_app")
        flow = SESSION_STATE.get("outlook_flow")
        if app is None or flow is None:
            # Without a pending flow there is nothing to complete.
            return "❌ Login failed: no Outlook login in progress. Please request a verification code first."

        result = app.acquire_token_by_device_flow(flow)

        if "access_token" not in result:
            return f"❌ Login failed: {result.get('error_description', 'Unknown error')}"

        access_token = result["access_token"]

        # The token directory may not exist on first login.
        os.makedirs(os.path.dirname(token_path) or ".", exist_ok=True)
        with open(token_path, "w") as f:
            json.dump(result, f)

    # Get user's email via Microsoft Graph
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get("https://graph.microsoft.com/v1.0/me", headers=headers)
    if not response.ok:
        # A rejected token (e.g. an expired cached one) would otherwise be
        # reported as a successful login for "None".
        return (
            f"❌ Login failed: HTTP {response.status_code} from Microsoft Graph. "
            f"The cached token may have expired; delete {token_path} and log in again."
        )
    user_info = response.json()
    email = user_info.get("mail") or user_info.get("userPrincipalName")

    # Store in session
    SESSION_STATE["outlook_token"] = access_token
    SESSION_STATE["outlook_email"] = email

    return f"✅ Logged in to Outlook as: {email}"

In [None]:
def extract_outlook_emails(start, end, alias):
    """Fetch Outlook emails between two dates and embed them.

    Args:
        start: Start date as ``YYYY/MM/DD`` (interpreted as midnight UTC).
        end: End date as ``YYYY/MM/DD`` (interpreted as midnight UTC).
        alias: Account alias, used only in the "not logged in" message.

    Returns:
        A status string. Invalid or missing dates, a missing login and any
        error raised while fetching or embedding are reported as messages
        rather than raised.
    """
    try:
        # `or ""` turns an empty (None) field into the same "invalid format"
        # message instead of an AttributeError.
        start_date = datetime.strptime((start or "").strip(), "%Y/%m/%d").replace(tzinfo=timezone.utc)
        end_date = datetime.strptime((end or "").strip(), "%Y/%m/%d").replace(tzinfo=timezone.utc)
    except ValueError:
        return "❌ Invalid date format. Use YYYY/MM/DD."

    # .get() so that a missing login reaches the guard below instead of
    # raising KeyError.
    access_token = SESSION_STATE.get("outlook_token")

    if not access_token:
        return f"❌ No access token found for '{alias}'. Please login first."

    info = ""
    try:
        emails = fetch_outlook_emails(access_token, start_date, end_date)
        if not emails:
            return "❌ No email found."
        info += f"✅ Extracted and embedded {len(emails)} Outlook emails.\n"
        info += outlook_embed_and_store(emails)
        return info
    except Exception as e:
        return f"❌ Error: {str(e)}"


### 4. Google Workspace

#### Store google workspace credential json file under the credentials folder

To avoid complicated steps and focus on LLMs stuff, I chose to utilize the Google Drive/Workspace API in test mode.

I have included the setup and configuration steps in this guide:
[Google_Workspace_API_Credential_Guide](./Google_Workspace_API_Credential_Guide.ipynb)

In [None]:
# Read-only OAuth scopes requested for the Google Workspace login
# (Gmail, Drive, Docs, Sheets and Slides).
GOOGLE_WORKSPACE_SCOPES = [
    'https://www.googleapis.com/auth/gmail.readonly',
    'https://www.googleapis.com/auth/drive.readonly',
    'https://www.googleapis.com/auth/documents.readonly',
    'https://www.googleapis.com/auth/spreadsheets.readonly',
    'https://www.googleapis.com/auth/presentations.readonly'
]
# OAuth client-secrets file for the Workspace login.
# NOTE(review): the README directory tree names this file
# 'google_workspace_credentials.json' — confirm which name is intended.
GOOGLE_WORKSPACE_CREDENTIALS_FILE = 'credentials/google_drive_workspace_credentials.json'
# Directory for cached Workspace tokens.
GOOGLE_WORKSPACE_TOKEN_DIR = 'tokens/google_workspace_tokens'

In [None]:
def extract_google_doc(docs_service, file_id):
    """Return the text of a Google Doc's paragraphs as one stripped string.

    Only ``paragraph`` structural elements are read; within them, elements
    without a ``textRun`` contribute nothing.
    """
    document = docs_service.documents().get(documentId=file_id).execute()
    structural_elements = document.get("body", {}).get("content", [])
    pieces = [
        run.get("textRun", {}).get("content", "")
        for element in structural_elements
        if "paragraph" in element
        for run in element["paragraph"]["elements"]
    ]
    return "".join(pieces).strip()

def extract_google_sheet(service, file_id):
    """Return the cell values of every sheet in a spreadsheet as text.

    Each sheet is rendered as a ``### Sheet: <title> ###`` header followed by
    one line per row, with the row's cells joined by ``", "``. Sheets are
    separated by a blank line.

    Args:
        service: Sheets API client exposing ``spreadsheets().get()`` and
            ``spreadsheets().values().get()``.
        file_id: ID of the spreadsheet to read.

    Returns:
        The rendered text with leading/trailing whitespace removed.
    """
    # Spreadsheet metadata lists the sheets (tabs) to walk through.
    spreadsheet = service.spreadsheets().get(spreadsheetId=file_id).execute()
    sections = []

    for sheet in spreadsheet.get("sheets", []):
        title = sheet["properties"]["title"]
        # Quote the title so it is always read as a sheet name: an unquoted
        # title such as "Q1" would be parsed as a cell reference, and titles
        # with spaces or punctuation require quoting. Embedded single quotes
        # are escaped by doubling them.
        quoted_title = "'" + title.replace("'", "''") + "'"
        result = service.spreadsheets().values().get(
            spreadsheetId=file_id,
            range=quoted_title,
        ).execute()

        rows = result.get("values", [])
        # str() keeps the join safe if a cell comes back as a non-string value.
        body = "\n".join(", ".join(str(cell) for cell in row) for row in rows)
        sections.append(f"### Sheet: {title} ###\n{body}")

    return "\n\n".join(sections).strip()


def extract_google_slide(slides_service, file_id):
    """Return the text found in the shapes of a Google Slides presentation.

    Walks every slide's page elements, and for each element that carries a
    non-empty ``shape`` concatenates the ``textRun`` content of its text
    elements. Elements without a shape are ignored.

    Args:
        slides_service: Slides API client exposing
            ``presentations().get().execute()``.
        file_id: ID of the presentation to read.

    Returns:
        The concatenated text with leading/trailing whitespace removed.
    """
    presentation = slides_service.presentations().get(presentationId=file_id).execute()
    fragments = []
    for page in presentation.get("slides", []):
        for page_element in page.get("pageElements", []):
            shape = page_element.get("shape")
            if not shape:
                continue
            text_elements = shape.get("text", {}).get("textElements", [])
            fragments.extend(
                item["textRun"]["content"]
                for item in text_elements
                if "textRun" in item
            )
    return "".join(fragments).strip()

def extract_pdf_from_drive(drive_service, file_id):
    """Download a PDF from Drive into memory and return its extracted text.

    The file is streamed chunk by chunk into an in-memory buffer, parsed with
    ``PdfReader``, and the text of each page is joined with newlines. Pages
    whose extracted text is empty are left out.

    Args:
        drive_service: Drive API client exposing ``files().get_media()``.
        file_id: ID of the PDF file to download.

    Returns:
        The newline-joined text of all pages that yielded text.
    """
    request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    buffer.seek(0)
    reader = PdfReader(buffer)
    # Extract each page exactly once; the text is reused for both the
    # emptiness check and the output.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)

In [None]:
def login_google_workspace(alias):
    """Log in to Google Workspace and cache the API clients in SESSION_STATE.

    Obtains credentials for ``alias``, reads the account's e-mail address from
    the Gmail profile, builds the Drive, Docs, Sheets and Slides clients and
    stores them, together with the e-mail and alias, in ``SESSION_STATE``.

    Args:
        alias: Account alias passed to ``google_workspace_get_creds``.

    Returns:
        A status string: the logged-in address on success, or a failure
        message containing the exception text.
    """
    try:
        creds = google_workspace_get_creds(alias)

        # The Gmail client is only used here to look up the account address.
        gmail_service = build('gmail', 'v1', credentials=creds)
        email = gmail_service.users().getProfile(userId='me').execute().get("emailAddress")

        # All clients are built before anything is written to the session.
        SESSION_STATE.update({
            "google_workspace_drive_service": build("drive", "v3", credentials=creds),
            "google_workspace_docs_service": build('docs', 'v1', credentials=creds),
            "google_workspace_sheets_service": build('sheets', 'v4', credentials=creds),
            "google_workspace_slides_service": build('slides', 'v1', credentials=creds),
            "google_workspace_email": email,
            "google_workspace_alias": alias,
        })
    except Exception as e:
        return f"❌ Login failed: {str(e)}"
    return f"✅ Logged in as: {email}"

In [None]:
def google_workspace_get_creds(account_alias):
    """Return Google credentials for ``account_alias``, logging in if needed.

    If a token file for the alias exists under ``GOOGLE_WORKSPACE_TOKEN_DIR``
    it is loaded. Otherwise the installed-app OAuth flow is run on a local
    server and the resulting token is saved to that per-alias file so later
    calls can reuse it.

    Args:
        account_alias: Alias used to name the token file.

    Returns:
        The credentials object, either loaded from disk or freshly obtained.
    """
    token_path = os.path.join(GOOGLE_WORKSPACE_TOKEN_DIR, f'google_workspace_token_{account_alias}.json')

    if os.path.exists(token_path):
        return Credentials.from_authorized_user_file(token_path, GOOGLE_WORKSPACE_SCOPES)

    flow = InstalledAppFlow.from_client_secrets_file(GOOGLE_WORKSPACE_CREDENTIALS_FILE, GOOGLE_WORKSPACE_SCOPES)
    creds = flow.run_local_server(port=0)

    # Persist to the per-alias path checked above. Writing to a fixed
    # "token.json" meant the cache was never found and aliases overwrote
    # each other's tokens.
    os.makedirs(GOOGLE_WORKSPACE_TOKEN_DIR, exist_ok=True)
    with open(token_path, "w") as token:
        token.write(creds.to_json())
    return creds
    

def get_folder_id_by_name(drive_service, folder_name):
    """Return the ID of the first non-trashed Drive folder named ``folder_name``.

    Args:
        drive_service: Drive API client exposing ``files().list()``.
        folder_name: Exact folder name to search for.

    Returns:
        The ``id`` of the first matching folder.

    Raises:
        ValueError: If no folder with that name is found.
    """
    # Escape backslashes and single quotes so the name cannot terminate the
    # quoted literal in the Drive search query.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = f"mimeType='application/vnd.google-apps.folder' and name='{escaped_name}' and trashed=false"
    results = drive_service.files().list(
        q=query,
        fields="files(id, name)",
        pageSize=1
    ).execute()

    folders = results.get("files", [])
    if not folders:
        raise ValueError(f"❌ Folder named '{folder_name}' not found.")
    return folders[0]['id']


def extract_docs_from_google_workspace(folder_name):
    """Load supported files from a Google Drive folder as ``Document`` objects.

    Uses the API clients stored in ``SESSION_STATE`` by the login step. Google
    Docs, Sheets, Slides and PDFs directly inside the folder are extracted;
    files that yield no text are skipped. A failure on one file is recorded
    in the log and does not stop the others.

    Args:
        folder_name: Name of the Drive folder to read from.

    Returns:
        A ``(docs, info)`` tuple. ``docs`` is the list of loaded documents, or
        ``None`` when the user has not logged in yet. ``info`` is a
        human-readable log.

    Raises:
        ValueError: Propagated from ``get_folder_id_by_name`` when the folder
            cannot be found.
    """
    drive_service = SESSION_STATE.get("google_workspace_drive_service")
    docs_service = SESSION_STATE.get("google_workspace_docs_service")
    sheets_service = SESSION_STATE.get("google_workspace_sheets_service")
    slides_service = SESSION_STATE.get("google_workspace_slides_service")

    if not (drive_service and docs_service and sheets_service and slides_service):
        return None, "Please login first.\n"

    # MIME type -> (label used in the summary, extractor taking a file ID).
    handlers = {
        'application/vnd.google-apps.document': (
            'Google Doc', lambda fid: extract_google_doc(docs_service, fid)),
        'application/vnd.google-apps.spreadsheet': (
            'Google Sheet', lambda fid: extract_google_sheet(sheets_service, fid)),
        'application/vnd.google-apps.presentation': (
            'Google Slide', lambda fid: extract_google_slide(slides_service, fid)),
        'application/pdf': (
            'PDF', lambda fid: extract_pdf_from_drive(drive_service, fid)),
    }

    folder_id = get_folder_id_by_name(drive_service, folder_name)
    info = f"Collection files from folder: {folder_name}\n"

    mime_filter = " or ".join(f'mimeType="{mime}"' for mime in handlers)
    query = f"'{folder_id}' in parents and ({mime_filter}) and trashed=false"

    # Follow nextPageToken so folders with more files than one page are read
    # completely instead of being silently truncated.
    files = []
    page_token = None
    while True:
        response = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name, mimeType)",
            pageSize=100,
            pageToken=page_token,
        ).execute()
        files.extend(response.get("files", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            break

    docs = []
    counts = dict.fromkeys(handlers, 0)
    for file in files:
        handler = handlers.get(file['mimeType'])
        if handler is None:
            continue
        _, extractor = handler
        try:
            content = extractor(file["id"])
        except Exception as e:
            # Best effort: report the failure in the log and keep going.
            info += f"❌ Error processing {file['name']}: {e}\n"
            continue
        if content:
            docs.append(Document(page_content=content, metadata={"source": file["name"]}))
            counts[file['mimeType']] += 1

    for mime, (label, _) in handlers.items():
        info += f"Found {counts[mime]} {label} files\n"
    info += f"Total documents loaded: {len(docs)}\n"
    return docs, info

In [None]:
def google_workspace_embed_and_store(docs):
    """Split ``docs`` into chunks, embed them and add them to the vector store.

    The documents are split with a ``CharacterTextSplitter`` (chunk size 1000,
    overlap 200) and empty chunks are dropped. If the store directory
    ``GOOGLE_WORKSPACE_VECTOR_DIR`` already exists the store is opened;
    otherwise it is created from the first chunk. The remaining chunks are
    added in the batches produced by ``batch_chunks``.

    Args:
        docs: Documents to embed.

    Returns:
        A log string with the number of chunks added and the store's total
        chunk count, or a warning when there is nothing to embed.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]

    if not chunks:
        return "⚠️ No non-empty chunks to embed. Skipping vectorstore update."

    embeddings = OpenAIEmbeddings()

    # Decide this BEFORE touching the store: creating it makes the directory
    # exist, so checking afterwards would never count the seed chunk.
    store_existed = os.path.exists(GOOGLE_WORKSPACE_VECTOR_DIR)
    if store_existed:
        vectorstore = Chroma(persist_directory=GOOGLE_WORKSPACE_VECTOR_DIR, embedding_function=embeddings)
        pending = chunks
        total = 0
    else:
        # Seed the new store with a single chunk; the rest go in via batches.
        vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=GOOGLE_WORKSPACE_VECTOR_DIR)
        pending = chunks[1:]
        total = 1

    for batch in batch_chunks(pending):
        vectorstore.add_documents(batch)
        total += len(batch)

    info = f"Vectorstore updated with {total} new chunks.\n"
    num_docs = vectorstore._collection.count()
    info += f"Vectorstore contains {num_docs} chunks.\n"
    return info

In [None]:
def extract_google_workspace_folder(folder_path):
    """Extract a Google Drive folder and embed its files; return a log string.

    This is the handler behind the "Extract Google Workspace Documents"
    button, so it always returns text for the log box instead of raising.

    Args:
        folder_path: Name of the Drive folder to process.

    Returns:
        A human-readable log. On failure the log ends with an
        ``❌ Extraction failed: ...`` line containing the exception text.
    """
    info = f"Process files under: {folder_path}\n"
    try:
        docs, extract_info = extract_docs_from_google_workspace(folder_path)
        info += extract_info
        if docs is None:
            # Not logged in: the extractor's message already explains why.
            return info
        if not docs:
            return info + "No valid files found in the given folder."
        info += f"Fetched {len(docs)} files.\n"
        info += google_workspace_embed_and_store(docs)
        return info
    except Exception as e:
        # UI boundary: report errors (e.g. folder not found) in the log box.
        return info + f"❌ Extraction failed: {str(e)}"

### 5. Slack

### 6. Gradio UI

In [None]:
# Every vector store directory the assistant can retrieve from, one per source.
VECTOR_DIR = [
    LOCAL_VECTOR_DIR,
    GMAIL_VECTOR_DIR,
    OUTLOOK_VECTOR_DIR,
    GOOGLE_WORKSPACE_VECTOR_DIR,
    SLACK_VECTOR_DIR,
]

In [None]:
# System prompt for the answer-generation step of the chat chain.
# It is filled with three variables: the chat history, the retrieved context
# and the user's question.
prompt_template = PromptTemplate(
    input_variables=["question", "context", "chat_history"],
    template="""
You are a personal assistant trained on the user's private documents, emails, and notes.
Your role is to answer questions as if you are the user themself — based on their experiences, thoughts, habits, personality, and preferences reflected in the uploaded materials.
Also, you are having a conversation with the user. Use the chat history to understand the context of the conversation.
At the beginning of each conversation, ask the user what name they would like to assign to you. If the user later requests a name change, update your name accordingly without delay.

Use the retrieved documents to:
- Summarize the user's background, actions, and communication patterns
- Simulate how the user would respond to questions
- Infer personality traits, professional history, and personal interests

Always cite the type of source (e.g., email, resume, journal) when appropriate. If no relevant information is available, say so honestly.

You must never make assumptions beyond what the user's data reveals.

Chat History:
{chat_history}

Retrieved Context:
{context}

User Question:
{question}
"""
)

# Chat model, conversation memory and embedding function shared by the chain
# below and by delete_knowledge() when it rebuilds the chain.
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
embeddings = OpenAIEmbeddings()

# One retriever (k=10) per vector store directory that exists right now.
# NOTE(review): directories created after this cell runs are not picked up
# until the chain is rebuilt — confirm whether the extract functions need to
# refresh the chain.
retrievers = []
for vec_dir in VECTOR_DIR:
    if os.path.exists(vec_dir):
        vectorstore = Chroma(persist_directory=vec_dir, embedding_function=embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
        retrievers.append(retriever)

# Combine the per-source retrievers and build the conversational RAG chain
# with the custom system prompt.
merged_retriever = MergerRetriever(retrievers=retrievers)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm, 
    retriever=merged_retriever, 
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt_template}
)

In [None]:
def chat_with_rag(user_input, chat_history):
    """Answer ``user_input`` with the RAG chain and append the turn to the chat.

    Blank input is ignored: the chain is not invoked and the history is
    returned unchanged, so an empty submit does not cost a retrieval and LLM
    call.

    Args:
        user_input: Text typed by the user.
        chat_history: List of ``{"role", "content"}`` message dicts shown in
            the chatbot. ``None`` is treated as an empty history.

    Returns:
        A ``("", chat_history)`` tuple: an empty string to clear the input box
        and the (possibly extended) history.
    """
    if chat_history is None:
        chat_history = []
    if not user_input or not user_input.strip():
        return "", chat_history

    result = conversation_chain.invoke({"question": user_input})
    answer = result["answer"]
    chat_history.append({"role": "user", "content": user_input})
    chat_history.append({"role": "assistant", "content": answer})
    return "", chat_history

In [None]:
def delete_knowledge(delete_type):
    """Delete one source's vector collection and rebuild the chat chain.

    After the collection is deleted, the global ``retrievers`` list and
    ``conversation_chain`` are rebuilt from every vector store directory that
    exists, using the same custom system prompt as the initial chain.

    Args:
        delete_type: One of "Local Folder", "Gmail", "Outlook",
            "Google Workspace" or "Slack".

    Returns:
        A status message for the delete log.
    """
    global conversation_chain, retrievers

    vector_dirs = {
        "Local Folder": LOCAL_VECTOR_DIR,
        "Gmail": GMAIL_VECTOR_DIR,
        "Outlook": OUTLOOK_VECTOR_DIR,
        "Google Workspace": GOOGLE_WORKSPACE_VECTOR_DIR,
        "Slack": SLACK_VECTOR_DIR,
    }
    vector_dir = vector_dirs.get(delete_type)
    if vector_dir is None:
        # Previously an unknown type left vector_dir unbound and crashed.
        return f"Unknown knowledge source: {delete_type}"
    if not os.path.exists(vector_dir):
        return "Vector store does not exist."

    Chroma(persist_directory=vector_dir, embedding_function=embeddings).delete_collection()

    retrievers = [
        Chroma(persist_directory=vec_dir, embedding_function=embeddings).as_retriever(search_kwargs={"k": 10})
        for vec_dir in VECTOR_DIR
        if os.path.exists(vec_dir)
    ]
    # Pass the prompt again: without it the rebuilt chain falls back to the
    # library default and the assistant's persona is lost after a delete.
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=MergerRetriever(retrievers=retrievers),
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt_template},
    )
    return "Deleted successfully."

In [None]:
# Main Gradio app: chat area on top, one collapsible panel per knowledge source.
with gr.Blocks(title="Personal Knowledge Assistant", theme=gr.themes.Citrus(), css="""
.selected {
    background-color: orange !important;
    box-shadow: 0 4px 12px rgba(255, 140, 0, 0.5) !important;
    color: black;
}
.unselected {
    background-color: gray !important;
    box-shadow: 0 4px 12px rgba(128, 128, 128, 0.4);
    color: white;
}
.gr-button-stop {
    background-color: #cf142b !important;
    color: white !important;
    box-shadow: 0 4px 12px rgba(128, 128, 128, 0.4);
}
""") as ui:
    # Process-wide login state shared by the Gmail / Outlook / Google
    # Workspace handlers. Every key a handler reads is pre-seeded with None so
    # reading it before login does not raise KeyError.
    SESSION_STATE = {
        "gmail_service": None, "gmail_email": None, "gmail_alias": None,
        "outlook_email": None, "outlook_alias": None,
        "outlook_login_app": None, "outlook_login_flow": None,
        "outlook_token_path": None,
        # Read with SESSION_STATE["outlook_token"] by the Outlook extractor.
        "outlook_token": None,
        "google_workspace_email": None, "google_workspace_alias": None,
        "google_workspace_drive_service": None, "google_workspace_docs_service": None,
        "google_workspace_sheets_service": None, "google_workspace_slides_service": None
    }
    outlook_login_flag = gr.State(False)
    # Name of the section panel currently open ("" when all are collapsed).
    current_selected = gr.State("")
    section_names = ["Local Folder", "Gmail", "Outlook", "Google Workspace", "Slack"]

    def show_section(current_selected, current_section):
        """Compute the UI updates for a click on one of the section buttons.

        As wired by the click handlers, ``current_selected`` is the name of the
        section whose button was clicked and ``current_section`` is the name
        stored in the state component. Clicking the section that is already
        open collapses everything; otherwise only the clicked section is
        shown and its button is highlighted.

        Returns:
            A tuple with one visibility update per section, then one
            ``elem_classes`` update per button, then the new state value
            ("" when everything is collapsed).
        """
        collapse_all = current_selected == current_section
        # No section name equals None, so nothing is shown when collapsing.
        active = None if collapse_all else current_selected

        panel_updates = [gr.update(visible=(name == active)) for name in section_names]
        button_updates = [
            gr.update(elem_classes=["selected"] if name == active else ["unselected"])
            for name in section_names
        ]
        new_state = "" if collapse_all else current_selected
        return tuple(panel_updates + button_updates + [new_state])

    
    
    # --- Chat area -----------------------------------------------------------
    gr.Markdown("## Personal Knowledge Assistant")

    # type="messages" means the history is a list of {"role", "content"} dicts,
    # matching what chat_with_rag appends.
    chatbot = gr.Chatbot(label="Chat", show_copy_button=True, type="messages")
    user_input = gr.Textbox(
        placeholder="Talk with your personal knowledge assistant...",
        label="Enter Message",
        lines=1
    )
    # Pressing Enter sends the message; chat_with_rag returns "" for the
    # textbox (clearing it) and the updated history for the chatbot.
    user_input.submit(
        fn=chat_with_rag,
        inputs=[user_input, chatbot],
        outputs=[user_input, chatbot]
    )

    # Horizontal rule separating the chat from the knowledge-source panels.
    gr.HTML("<hr style='border: none; height: 1px; background-color: #333;'>")

    # --- Section selector ----------------------------------------------------
    with gr.Row():
        # One toggle button per knowledge source; all start unselected.
        local_folder_show_up = gr.Button("Local folder", elem_id="local-folder-btn", elem_classes=["unselected"])
        gmail_show_up = gr.Button("Gmail",  elem_id="gmail-btn", elem_classes=["unselected"])
        outlook_show_up = gr.Button("Outlook", elem_id="outlook-btn", elem_classes=["unselected"])
        google_workspace_show_up = gr.Button("Google Workspace",  elem_id="google_workspace-btn", elem_classes=["unselected"])
        slack_show_up = gr.Button("Slack",  elem_id="Slack-btn", elem_classes=["unselected"])

        # Hidden textboxes holding each section's name. They are passed as
        # inputs to show_section and delete_knowledge to say which section
        # was acted on; the values match the entries of section_names.
        local_input = gr.Textbox(value="Local Folder", visible=False)
        gmail_input = gr.Textbox(value="Gmail", visible=False)
        outlook_input = gr.Textbox(value="Outlook", visible=False)
        workspace_input = gr.Textbox(value="Google Workspace", visible=False)
        slack_input = gr.Textbox(value="Slack", visible=False)

    # One initially hidden column per section; contents are filled in below.
    local_folder_section = gr.Column(visible=False)
    gmail_section = gr.Column(visible=False)
    outlook_section = gr.Column(visible=False)
    google_workspace_section = gr.Column(visible=False)
    slack_section = gr.Column(visible=False)


    # --- Local folder panel: extract documents from disk, or delete them -----
    with local_folder_section:
        gr.Markdown("### Local Documents Extractor")

        with gr.Row():
            local_folder_input = gr.Textbox(label="Folder Path", info="All subfolders under the selected folder will be extracted.", value="local-knowledge-base")
        with gr.Row():
            # info="\u00A0" (a non-breaking space) renders an empty info line.
            # NOTE(review): presumably to align this box with the one above — confirm.
            local_exclude_folder_input = gr.Textbox(label="Folders to Exclude", info="\u00A0", placeholder="Join by comma. e.g. dir1, dir2")
        with gr.Row():
            local_extract_button = gr.Button("Extract Local Documents")
        with gr.Row():
            local_extract_log = gr.Textbox(label="Extraction Log", lines=15)

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            local_delete_button = gr.Button("Delete Local Knowledge", elem_classes=["gr-button-stop"])
        with gr.Row():
            local_delete_log = gr.Textbox(label="Delete Log", lines=1)

        # local_input carries the fixed value "Local Folder" to delete_knowledge.
        local_delete_button.click(fn=delete_knowledge, inputs=local_input, outputs=local_delete_log)
        local_extract_button.click(fn=extract_local_folder, inputs=[local_folder_input, local_exclude_folder_input], outputs=local_extract_log)
    
    # --- Gmail panel: login, extract emails for a date range, or delete ------
    with gmail_section:
        # Heading previously read "Local Documents Extractor" (copy-paste slip).
        gr.Markdown("### Gmail Email Extractor")

        with gr.Row():
            gmail_alias_input = gr.Textbox(label="Gmail Alias (e.g., zhufqiu)", placeholder="Gmail alias")
        with gr.Row():
            gmail_login_log = gr.Textbox(label="Login Status", lines=1)
        with gr.Row():
            gmail_login_btn = gr.Button("Login")

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            gmail_start_date = gr.Textbox(label="Start Date (YYYY/MM/DD)")
            gmail_end_date = gr.Textbox(label="End Date (YYYY/MM/DD)")
        with gr.Row():
            gmail_extract_btn = gr.Button("Extract Gmail Emails")
        with gr.Row():
            gmail_extract_log = gr.Textbox(label="Extraction Log", lines=15)

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            gmail_delete_button = gr.Button("Delete Gmail Knowledge", elem_classes=["gr-button-stop"])
        with gr.Row():
            gmail_delete_log = gr.Textbox(label="Delete Log", lines=1)

        # gmail_input carries the fixed value "Gmail" to delete_knowledge.
        gmail_delete_button.click(fn=delete_knowledge, inputs=gmail_input, outputs=gmail_delete_log)
        gmail_login_btn.click(fn=login_gmail, inputs=gmail_alias_input, outputs=gmail_login_log)
        gmail_extract_btn.click(fn=extract_gmail, inputs=[gmail_start_date, gmail_end_date], outputs=gmail_extract_log)
    
    # --- Outlook panel: two-step login, extract emails, or delete ------------
    with outlook_section:
        gr.Markdown("### Outlook Email Extractor")

        with gr.Row():
            outlook_alias = gr.Textbox(label="Outlook Alias(e.g., zhufqiu)", placeholder="Outlook alias")

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        # Step 1: request a verification code; instructions are shown here.
        with gr.Row():
            outlook_verify_info = gr.Textbox(label="Verification Instructions", lines=3)
        with gr.Row():
            outlook_start_login_btn = gr.Button("Get Verification Code")

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        # Step 2: finish the login once the code has been entered on the website.
        with gr.Row():
            outlook_login_log = gr.Textbox(label="Login Status", info="", lines=1)
        with gr.Row():
            outlook_finish_login_btn = gr.Button("Login")

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            outlook_start_date = gr.Textbox(label="Start Date (YYYY/MM/DD)")
            outlook_end_date = gr.Textbox(label="End Date (YYYY/MM/DD)")

        with gr.Row():
            outlook_extract_btn = gr.Button("Extract Outlook Emails")

        with gr.Row():
            outlook_log = gr.Textbox(label="Extraction Log", lines=15)

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            outlook_delete_button = gr.Button("Delete Outlook Knowledge", elem_classes=["gr-button-stop"])
        with gr.Row():
            outlook_delete_log = gr.Textbox(label="Delete Log", lines=1)

        # outlook_input carries the fixed value "Outlook" to delete_knowledge.
        outlook_delete_button.click(fn=delete_knowledge, inputs=outlook_input, outputs=outlook_delete_log)
        # start_outlook_login returns two values: the login flag and the instructions text.
        outlook_start_login_btn.click(fn=start_outlook_login, inputs=outlook_alias, outputs=[outlook_login_flag, outlook_verify_info])
        # finish_outlook_login takes no UI inputs.
        # NOTE(review): presumably it continues the flow kept in SESSION_STATE — confirm.
        outlook_finish_login_btn.click(fn=finish_outlook_login, outputs=outlook_login_log)
        outlook_extract_btn.click(fn=extract_outlook_emails, inputs=[outlook_start_date, outlook_end_date], outputs=outlook_log)

    # --- Google Workspace panel: login, extract a Drive folder, or delete ----
    with google_workspace_section:
        gr.Markdown("### Google Workspace Extractor")

        with gr.Row():
            google_workspace_alias_input = gr.Textbox(label="Google Account Alias (e.g., zhufqiu)", placeholder="Google Account alias")
        with gr.Row():
            google_workspace_login_log = gr.Textbox(label="Login Status", lines=1)
        with gr.Row():
            google_workspace_login_btn = gr.Button("Login")

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            # NOTE(review): labelled "Folder Path", but the value is looked up
            # as a folder *name* via get_folder_id_by_name — confirm whether the
            # label should say "Folder Name".
            google_workspace_folder_input = gr.Textbox(label="Folder Path", info="All files under the selected folder will be extracted.", value="google_workspace_knowledge_base")
        with gr.Row():
            google_workspace_extract_button = gr.Button("Extract Google Workspace Documents")

        with gr.Row():
            google_workspace_extract_log = gr.Textbox(label="Extraction Log", lines=15)

        gr.HTML("<hr style='border: none; height: 1px; background-color: #9f9f9f;'>")

        with gr.Row():
            google_workspace_delete_button = gr.Button("Delete Google Workspace Knowledge", elem_classes=["gr-button-stop"])
        with gr.Row():
            google_workspace_delete_log = gr.Textbox(label="Delete Log", lines=1)

        # workspace_input carries the fixed value "Google Workspace" to delete_knowledge.
        google_workspace_delete_button.click(fn=delete_knowledge, inputs=workspace_input, outputs=google_workspace_delete_log)
        google_workspace_login_btn.click(fn=login_google_workspace, inputs=google_workspace_alias_input, outputs=google_workspace_login_log)
        google_workspace_extract_button.click(fn=extract_google_workspace_folder, inputs=google_workspace_folder_input, outputs=google_workspace_extract_log)
    
    # --- Slack panel: placeholder only, no extraction is wired up yet --------
    with slack_section:
        gr.Markdown("Slack part")
        gr.Markdown("To be developed")
    
    # Components updated by show_section, in the order it returns them:
    # the five section columns, the five section buttons, then the state.
    switch_outputs = [
        local_folder_section, gmail_section, outlook_section, google_workspace_section, slack_section,
        local_folder_show_up, gmail_show_up, outlook_show_up, google_workspace_show_up, slack_show_up,
        current_selected
    ]

    # Each button passes its own section name (from the hidden textbox) as the
    # first argument and the currently open section (state) as the second.
    gmail_show_up.click(fn=show_section, inputs=[gmail_input, current_selected], outputs=switch_outputs)
    local_folder_show_up.click(fn=show_section, inputs=[local_input, current_selected], outputs=switch_outputs)
    outlook_show_up.click(fn=show_section, inputs=[outlook_input, current_selected], outputs=switch_outputs)
    google_workspace_show_up.click(fn=show_section, inputs=[workspace_input, current_selected], outputs=switch_outputs)
    slack_show_up.click(fn=show_section, inputs=[slack_input, current_selected], outputs=switch_outputs)

### 7. Launch

In [None]:
# Log out every Gmail, Outlook and Google Workspace account before launch by
# wiping each provider's token directory and recreating it empty.
for _token_dir in (GMAIL_TOKEN_DIR, OUTLOOK_TOKEN_DIR, GOOGLE_WORKSPACE_TOKEN_DIR):
    if os.path.exists(_token_dir):
        shutil.rmtree(_token_dir)
    os.makedirs(_token_dir, exist_ok=True)

ui.launch()