## Part 1: Imports and Configuration


In [1]:
# Standard library imports
import os
import glob
import base64
import io
from datetime import datetime, timedelta
from pathlib import Path
from email.utils import parsedate_to_datetime

# Third-party imports
from dotenv import load_dotenv
import gradio as gr

# Google API imports
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Visualization imports
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# Document processing imports
import PyPDF2
from docx import Document as DocxDocument

print("All imports successful!")


All imports successful!


In [3]:
# Configuration
MODEL = "gpt-4o-mini"  # Cost-effective model
DB_NAME = "vector_db_gmail_drive"
CREDENTIALS_DIR = "credentials"
TOKENS_DIR = "tokens"

# Google API Scopes
SCOPES = [
    'https://www.googleapis.com/auth/gmail.readonly',
    'https://www.googleapis.com/auth/drive.readonly'
]

# Create necessary directories
Path(CREDENTIALS_DIR).mkdir(exist_ok=True)
Path(TOKENS_DIR).mkdir(exist_ok=True)
Path(DB_NAME).mkdir(exist_ok=True)

# Load environment variables
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

print(f"Configuration complete")
print(f"Model: {MODEL}")
print(f"Vector DB: {DB_NAME}")


Configuration complete
Model: gpt-4o-mini
Vector DB: vector_db_gmail_drive


## Part 2: Google Authentication

This cell will authenticate you with Google. On first run, it will open a browser window asking you to log in and grant permissions.


In [4]:
def authenticate_google():
    """
    Authenticate with Google APIs (Gmail + Drive)
    Returns credentials object
    """
    creds = None
    token_path = os.path.join(TOKENS_DIR, 'token.json')
    credentials_path = os.path.join(CREDENTIALS_DIR, 'google_credentials.json')
    
    # Check if credentials file exists
    if not os.path.exists(credentials_path):
        print("ERROR: google_credentials.json not found!")
        print(f"Please place your credentials file at: {credentials_path}")
        print("See setup instructions above for how to create it.")
        return None
    
    # Load existing token if available
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    
    # If no valid credentials, authenticate
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            print("Refreshing expired credentials...")
            creds.refresh(Request())
        else:
            print("Opening browser for authentication...")
            print("Please log in and grant permissions.")
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
            creds = flow.run_local_server(port=0)
        
        # Save credentials for next time
        with open(token_path, 'w') as token:
            token.write(creds.to_json())
        print("Credentials saved!")
    
    print("Authentication successful!")
    return creds

# Test authentication
print("Testing Google authentication...")
creds = authenticate_google()
if creds:
    print("Ready to access Gmail and Drive!")


Testing Google authentication...
Opening browser for authentication...
Please log in and grant permissions.
Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=229197449433-rvndi6blmjno531psng20kg84uqrc8m6.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A55908%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.readonly&state=oq0IjWvNDJpHLwiE290W9qk996fe6U&access_type=offline
Credentials saved!
Authentication successful!
Ready to access Gmail and Drive!


In [6]:
def fetch_gmail_emails(creds, max_results=100, days_back=180):
    """
    Fetch emails from Gmail
    
    Args:
        creds: Google credentials
        max_results: Maximum number of emails to fetch
        days_back: How many days back to search
    
    Returns:
        List of Document objects
    """
    print(f"\nFetching emails from last {days_back} days (max {max_results})...")
    
    try:
        service = build('gmail', 'v1', credentials=creds)
        
        # Calculate date for filtering
        after_date = datetime.now() - timedelta(days=days_back)
        query = f"after:{after_date.strftime('%Y/%m/%d')}"
        
        # Get list of messages
        results = service.users().messages().list(
            userId='me',
            maxResults=max_results,
            q=query
        ).execute()
        
        messages = results.get('messages', [])
        
        if not messages:
            print("No emails found.")
            return []
        
        print(f"Found {len(messages)} emails. Processing...")
        
        documents = []
        for i, message in enumerate(messages, 1):
            if i % 10 == 0:
                print(f"Processed {i}/{len(messages)} emails...")
            
            # Get full message
            msg = service.users().messages().get(
                userId='me',
                id=message['id'],
                format='full'
            ).execute()
            
            # Extract headers
            headers = msg['payload']['headers']
            subject = next((h['value'] for h in headers if h['name'] == 'Subject'), 'No Subject')
            sender = next((h['value'] for h in headers if h['name'] == 'From'), 'Unknown')
            date_str = next((h['value'] for h in headers if h['name'] == 'Date'), '')
            
            # Extract body
            body = ""
            if 'parts' in msg['payload']:
                for part in msg['payload']['parts']:
                    if part['mimeType'] == 'text/plain':
                        if 'data' in part['body']:
                            body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
                            break
            elif 'body' in msg['payload'] and 'data' in msg['payload']['body']:
                body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8', errors='ignore')
            
            # Create document
            content = f"Subject: {subject}\nFrom: {sender}\nDate: {date_str}\n\n{body}"
            
            doc = Document(
                page_content=content,
                metadata={
                    'source': 'Gmail',
                    'type': 'email',
                    'subject': subject,
                    'sender': sender,
                    'date': date_str,
                    'message_id': message['id']
                }
            )
            documents.append(doc)
        
        print(f"Successfully processed {len(documents)} emails")
        return documents
        
    except Exception as e:
        print(f"Error fetching Gmail: {str(e)}")
        return []

# Test Gmail fetching (adjust parameters as needed)
if creds:
    gmail_docs = fetch_gmail_emails(creds, max_results=50, days_back=90)
    print(f"\nGmail Summary: {len(gmail_docs)} emails loaded")
    if gmail_docs:
        print(f"Sample subject: {gmail_docs[0].metadata['subject'][:60]}...")



Fetching emails from last 90 days (max 50)...
Found 50 emails. Processing...
Processed 10/50 emails...
Processed 20/50 emails...
Processed 30/50 emails...
Processed 40/50 emails...
Processed 50/50 emails...
Successfully processed 50 emails

Gmail Summary: 50 emails loaded
Sample subject: Introducing the Providus Corporate Expense Card....


In [7]:
def extract_text_from_file(file_content, mime_type, file_name):
    """Extract text from various file types"""
    try:
        if mime_type == 'text/plain' or file_name.endswith(('.txt', '.md')):
            return file_content.decode('utf-8', errors='ignore')
        
        elif mime_type == 'application/pdf' or file_name.endswith('.pdf'):
            pdf_file = io.BytesIO(file_content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text
        
        elif mime_type in ['application/vnd.openxmlformats-officedocument.wordprocessingml.document', 
                           'application/msword'] or file_name.endswith('.docx'):
            doc_file = io.BytesIO(file_content)
            doc = DocxDocument(doc_file)
            text = "\n".join([para.text for para in doc.paragraphs])
            return text
        
        elif mime_type == 'application/vnd.google-apps.document':
            return "[Google Doc - will be exported as plain text]"
        
        else:
            return f"[Unsupported file type: {mime_type}]"
            
    except Exception as e:
        return f"[Error extracting text: {str(e)}]"

def fetch_drive_files(creds, max_results=50):
    """
    Fetch files from Google Drive
    
    Args:
        creds: Google credentials
        max_results: Maximum number of files to fetch
    
    Returns:
        List of Document objects
    """
    print(f"\nFetching files from Google Drive (max {max_results})...")
    
    try:
        service = build('drive', 'v3', credentials=creds)
        
        # Query for text-based files
        query = "(mimeType='text/plain' or mimeType='application/pdf' or "
        query += "mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document' or "
        query += "mimeType='application/vnd.google-apps.document' or "
        query += "mimeType='text/markdown') and trashed=false"
        
        # Get list of files
        results = service.files().list(
            pageSize=max_results,
            q=query,
            fields="files(id, name, mimeType, createdTime, modifiedTime, size)"
        ).execute()
        
        files = results.get('files', [])
        
        if not files:
            print("No supported files found.")
            return []
        
        print(f"Found {len(files)} files. Processing...")
        
        documents = []
        for i, file in enumerate(files, 1):
            if i % 5 == 0:
                print(f"Processed {i}/{len(files)} files...")
            
            try:
                # Download file content
                if file['mimeType'] == 'application/vnd.google-apps.document':
                    # Export Google Docs as plain text
                    request = service.files().export_media(
                        fileId=file['id'],
                        mimeType='text/plain'
                    )
                else:
                    # Download regular files
                    request = service.files().get_media(fileId=file['id'])
                
                file_content = io.BytesIO()
                downloader = MediaIoBaseDownload(file_content, request)
                
                done = False
                while not done:
                    status, done = downloader.next_chunk()
                
                # Extract text
                content = extract_text_from_file(
                    file_content.getvalue(),
                    file['mimeType'],
                    file['name']
                )
                
                # Skip if no content or too short
                if len(content.strip()) < 50:
                    continue
                
                # Create document
                doc = Document(
                    page_content=content,
                    metadata={
                        'source': 'Google Drive',
                        'type': 'file',
                        'name': file['name'],
                        'mime_type': file['mimeType'],
                        'created': file.get('createdTime', ''),
                        'modified': file.get('modifiedTime', ''),
                        'file_id': file['id']
                    }
                )
                documents.append(doc)
                
            except Exception as e:
                print(f"Skipped {file['name']}: {str(e)}")
                continue
        
        print(f"Successfully processed {len(documents)} files")
        return documents
        
    except Exception as e:
        print(f"Error fetching Drive files: {str(e)}")
        return []

# Test Drive fetching
if creds:
    drive_docs = fetch_drive_files(creds, max_results=30)
    print(f"\nDrive Summary: {len(drive_docs)} files loaded")
    if drive_docs:
        print(f"Sample file: {drive_docs[0].metadata['name']}")



Fetching files from Google Drive (max 30)...
Found 30 files. Processing...
Processed 5/30 files...
Processed 10/30 files...
Processed 15/30 files...
Processed 20/30 files...
Processed 25/30 files...
Processed 30/30 files...
Successfully processed 5 files

Drive Summary: 5 files loaded
Sample file: Philip_Omoigui_New_Resume_UN.pdf


In [8]:
# Combine all documents
all_documents = []

if 'gmail_docs' in globals():
    all_documents.extend(gmail_docs)
if 'drive_docs' in globals():
    all_documents.extend(drive_docs)

print(f"\nTotal documents collected: {len(all_documents)}")
print(f"- Gmail emails: {len([d for d in all_documents if d.metadata['source'] == 'Gmail'])}")
print(f"- Drive files: {len([d for d in all_documents if d.metadata['source'] == 'Google Drive'])}")



Total documents collected: 55
- Gmail emails: 50
- Drive files: 5


In [9]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_documents(all_documents)
print(f"\nSplit into {len(chunks)} chunks")

# Show chunk distribution
gmail_chunks = len([c for c in chunks if c.metadata['source'] == 'Gmail'])
drive_chunks = len([c for c in chunks if c.metadata['source'] == 'Google Drive'])
print(f"- Gmail chunks: {gmail_chunks}")
print(f"- Drive chunks: {drive_chunks}")



Split into 451 chunks
- Gmail chunks: 425
- Drive chunks: 26


In [10]:
# Create embeddings and vector store
print("\nCreating embeddings and vector store...")
print("This may take a minute depending on the number of chunks...")

#embeddings = OpenAIEmbeddings()

# Alternative: Use free HuggingFace embeddings
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete existing vector store if present
if os.path.exists(DB_NAME):
    print("Deleting old vector store...")
    Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

# Create new vector store
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=DB_NAME
)

print(f"Vector store created with {vectorstore._collection.count()} embeddings")

# Get embedding dimensions
sample_embedding = vectorstore._collection.get(limit=1, include=["embeddings"])["embeddings"][0]
print(f"Embedding dimensions: {len(sample_embedding):,}")



Creating embeddings and vector store...
This may take a minute depending on the number of chunks...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Deleting old vector store...
Vector store created with 451 embeddings
Embedding dimensions: 384


## Part 6: 3D Visualization

Visualize your knowledge base in 3D space. Blue points = Gmail, Green points = Drive files.


In [11]:
def visualize_vectorstore_3d(vectorstore):
    """Create 3D visualization of vector store"""
    print("\nCreating 3D visualization...")
    
    # Get all embeddings and metadata
    collection = vectorstore._collection
    result = collection.get(include=['embeddings', 'documents', 'metadatas'])
    
    vectors = np.array(result['embeddings'])
    documents = result['documents']
    metadatas = result['metadatas']
    
    # Extract sources for coloring
    sources = [meta['source'] for meta in metadatas]
    colors = ['blue' if s == 'Gmail' else 'green' for s in sources]
    
    # Reduce to 3D using t-SNE
    print("Reducing dimensions with t-SNE (this may take a moment)...")
    tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors)-1))
    reduced_vectors = tsne.fit_transform(vectors)
    
    # Create hover text
    hover_texts = []
    for meta, doc in zip(metadatas, documents):
        if meta['source'] == 'Gmail':
            text = f"Email<br>Subject: {meta.get('subject', 'N/A')[:50]}<br>From: {meta.get('sender', 'N/A')[:30]}"
        else:
            text = f"File<br>Name: {meta.get('name', 'N/A')[:50]}"
        text += f"<br>Preview: {doc[:100]}..."
        hover_texts.append(text)
    
    # Create 3D scatter plot
    fig = go.Figure(data=[go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            color=colors,
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=hover_texts,
        hoverinfo='text'
    )])
    
    fig.update_layout(
        title='3D Knowledge Base Visualization (Blue=Gmail, Green=Drive)',
        scene=dict(
            xaxis_title='Dimension 1',
            yaxis_title='Dimension 2',
            zaxis_title='Dimension 3'
        ),
        width=900,
        height=700,
        margin=dict(r=20, b=10, l=10, t=40)
    )
    
    print("Visualization ready!")
    return fig

# Create visualization
if len(chunks) > 0:
    viz_fig = visualize_vectorstore_3d(vectorstore)
    viz_fig.show()



Creating 3D visualization...
Reducing dimensions with t-SNE (this may take a moment)...
Visualization ready!


In [None]:
# Create LLM and conversational chain
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Alternative: Ollama locally
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# Set up conversation memory
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer'
)

# Create retriever with more context
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10}  # Retrieve top 10 relevant chunks
)

# Create conversational chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    verbose=False
)

print("Conversational RAG chain ready!")
print(f"Model: {MODEL}")


Conversational RAG chain ready!
Model: gpt-4o-mini
Retrieval: Top 10 similar chunks
Memory: Conversation buffer


## Part 8: Test Queries

Test the RAG system with some example queries.


In [None]:
# Test the RAG system
def test_query(question):
    """Test a query and show sources"""
    print(f"\nQuestion: {question}")
    result = conversation_chain.invoke({"question": question})
    print(f"\nAnswer: {result['answer']}")
    print(f"\nSources used: {len(result['source_documents'])} documents")
    for i, doc in enumerate(result['source_documents'][:3], 1):
        source = doc.metadata['source']
        if source == 'Gmail':
            print(f"{i}. Email: {doc.metadata.get('subject', 'N/A')[:50]}...")
        else:
            print(f"{i}. File: {doc.metadata.get('name', 'N/A')[:50]}")
    return result

# Example queries
print("Testing RAG system with sample queries...")

#test_query("What are the most recent emails about?")
# test_query("What files do I have related to Python or programming?")
#test_query("Summarize important information from my emails")

print("\nReady for Gradio interface!")


Testing RAG system with sample queries...

Question: Summarize important information from my emails

Answer: I'm sorry, but I don't have access to your emails to summarize any information.

Sources used: 10 documents
1. Email: You still have 2 ‚Ç¶0 ùêàùê≠ùêûùê¶ùê¨ unclaimed!...
2. Email: To Retire 7 Years Earlier, Stop Buying These 7 Thi...
3. Email: Using Metadata as a Weapon Against AI Fakes | Jeff...

Ready for Gradio interface!


## Part 9: Gradio Interface

Launch the interactive chat interface.


In [16]:
def chat_with_sources(question, history):
    """
    Chat function that returns answer with sources
    """
    try:
        result = conversation_chain.invoke({"question": question})
        answer = result['answer']
        
        # Add source information
        sources = result['source_documents'][:3]
        if sources:
            answer += "\n\n---\n**Sources:**\n"
            for i, doc in enumerate(sources, 1):
                if doc.metadata['source'] == 'Gmail':
                    answer += f"{i}. Email: {doc.metadata.get('subject', 'N/A')[:60]}...\n"
                else:
                    answer += f"{i}. File: {doc.metadata.get('name', 'N/A')[:60]}\n"
        
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

def get_stats():
    """Get knowledge base statistics"""
    total_chunks = vectorstore._collection.count()
    gmail_count = len([c for c in chunks if c.metadata['source'] == 'Gmail'])
    drive_count = len([c for c in chunks if c.metadata['source'] == 'Google Drive'])
    
    stats = f"""
**Knowledge Base Statistics**

Total Chunks: {total_chunks}
Gmail Emails: {gmail_count} chunks
Drive Files: {drive_count} chunks

Model: {MODEL}
Embedding Dimensions: {len(sample_embedding):,}
Retrieval: Top 10 relevant chunks per query
    """
    return stats

# Create Gradio interface
with gr.Blocks(title="Personal Knowledge Worker", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Personal Productivity Knowledge Worker
    
    Search across your Gmail emails and Google Drive files using AI.
    
    **Ask questions like:**
    - "What emails did I receive about [topic]?"
    - "Find information about [subject] in my files"
    - "Summarize recent communications about [project]"
    - "What documents do I have related to [keyword]?"
    """)
    
    with gr.Tab("Chat"):
        chatbot = gr.ChatInterface(
            chat_with_sources,
            type="messages",
            examples=[
                "What are my most recent emails about?",
                "What files do I have in my Drive?",
                "Find information about projects or assignments",
                "Summarize important communications"
            ],
            title="Chat with Your Knowledge Base"
        )
    
    with gr.Tab("Statistics"):
        gr.Markdown(get_stats())
        if 'viz_fig' in globals():
            gr.Plot(viz_fig)
    
    with gr.Tab("Info"):
        gr.Markdown("""
        ## How It Works
        
        This system uses **Retrieval-Augmented Generation (RAG)** to:
        1. Index your Gmail emails and Drive files
        2. Convert them to vector embeddings
        3. Search semantically (by meaning, not just keywords)
        4. Generate contextual answers using GPT-4o-mini
        
        ## Privacy
        - Your data is processed through OpenAI's API for embeddings and responses
        - Vectors are stored locally in ChromaDB
        - Original files remain in Gmail/Drive untouched
        
        ## Tips
        - Be specific in your questions for better results
        - The system searches both emails and files simultaneously
        - Conversation history is maintained during your session
        - Sources are shown below each answer
        
        ## Updating Your Knowledge Base
        - Re-run the Gmail and Drive fetch cells to get new content
        - Rebuild the vector store to include updated data
        - The system will remember conversation context until you restart
        """)

print("\nLaunching Gradio interface...")
demo.launch(inbrowser=True)



Launching Gradio interface...
* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


