## Gmail RAG assistant

In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
# NEW IMPORTS FOR GMAIL
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from datetime import datetime
import base64
from email.mime.text import MIMEText
import re

In [None]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Cost matters for this project, so default to an inexpensive chat model.
MODEL = 'gpt-4o-mini'

# Directory in which the Chroma vector store is persisted.
db_name = 'vector_db'

In [None]:
# Pull settings from a local .env file (override=True is passed to load_dotenv).
load_dotenv(override=True)

# Make sure OPENAI_API_KEY is always present; fall back to a placeholder
# string when neither the .env file nor the environment provides one.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Gmail API configuration
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']  # scope requested during OAuth
CREDENTIALS_FILE = 'credentials.json'  # OAuth client secrets, downloaded from Google Cloud Console
TOKEN_FILE = 'token.json'  # where the authorized user token is cached between runs

In [None]:
# Read emails through the Gmail API and convert them into LangChain documents
# IMPORTANT: set the received-date range, which is hard-coded near the end of this cell

def authenticate_gmail():
    """Return an authorized Gmail API (v1) service object.

    Cached credentials are read from ``TOKEN_FILE`` when that file exists.
    If they are missing or not valid, they are refreshed (when expired and a
    refresh token is available) or obtained through the local-server OAuth
    flow based on ``CREDENTIALS_FILE``. Newly obtained or refreshed
    credentials are written back to ``TOKEN_FILE``.
    """
    creds = (
        Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
        if os.path.exists(TOKEN_FILE)
        else None
    )

    # Cached credentials are still good: nothing to refresh or persist.
    if creds and creds.valid:
        return build('gmail', 'v1', credentials=creds)

    can_refresh = creds and creds.expired and creds.refresh_token
    if can_refresh:
        creds.refresh(Request())
    else:
        # No usable credentials: run the interactive OAuth flow.
        oauth_flow = InstalledAppFlow.from_client_secrets_file(CREDENTIALS_FILE, SCOPES)
        creds = oauth_flow.run_local_server(port=0)

    # Cache the credentials for the next run.
    with open(TOKEN_FILE, 'w') as token_file:
        token_file.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)

def _decode_body_data(data):
    """Decode a base64url-encoded body payload into text."""
    # Restore any stripped '=' padding so the decoder does not raise.
    padded = data + '=' * (-len(data) % 4)
    # Bodies are not guaranteed to be UTF-8; replace undecodable bytes
    # rather than discarding the whole message.
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def _find_plain_text(parts):
    """Return the first non-empty text/plain body found depth-first in *parts*."""
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            # A text/plain part may carry no inline data (e.g. only an
            # attachment reference); skip it and keep looking.
            data = part.get('body', {}).get('data')
            if data:
                return _decode_body_data(data)
        # Multipart containers nest their children under their own 'parts'.
        text = _find_plain_text(part.get('parts', []))
        if text:
            return text
    return ""


def _header_value(headers, name, default):
    """Return the value of the first header called *name*, ignoring case."""
    wanted = name.lower()
    return next((h['value'] for h in headers if h['name'].lower() == wanted), default)


def get_email_content(service, message_id):
    """Fetch one Gmail message and extract its headers and plain-text body.

    Args:
        service: Gmail API service object (as returned by ``authenticate_gmail``).
        message_id: ID of the message to fetch.

    Returns:
        A dict with keys ``subject``, ``sender``, ``date``, ``body`` and
        ``id``, or ``None`` if the message could not be fetched or parsed
        (the error is printed). ``body`` is the first text/plain part found
        anywhere in the MIME tree, with all whitespace runs collapsed to a
        single space; it is an empty string when no such part exists.
    """
    try:
        message = service.users().messages().get(userId='me', id=message_id, format='full').execute()
        payload = message['payload']

        # Extract basic info (header names are matched case-insensitively)
        headers = payload.get('headers', [])
        subject = _header_value(headers, 'Subject', 'No Subject')
        sender = _header_value(headers, 'From', 'Unknown Sender')
        date = _header_value(headers, 'Date', 'Unknown Date')

        # Extract body
        if 'parts' in payload:
            body = _find_plain_text(payload['parts'])
        else:
            # Single-part message: the body sits directly on the payload.
            data = payload.get('body', {}).get('data')
            body = _decode_body_data(data) if data else ""

        # Clean up body text
        body = re.sub(r'\s+', ' ', body).strip()

        return {
            'subject': subject,
            'sender': sender,
            'date': date,
            'body': body,
            'id': message_id
        }
    except Exception as e:
        # Best effort: one bad message must not abort the whole import.
        print(f"Error processing message {message_id}: {str(e)}")
        return None

def load_gmail_documents(start_date, end_date, max_emails=100):
    """Load emails from Gmail between two dates as LangChain documents.

    Args:
        start_date: ``datetime`` used for the Gmail ``after:`` search operator.
        end_date: ``datetime`` used for the Gmail ``before:`` search operator.
            NOTE(review): Gmail's ``before:`` appears to exclude the given
            day itself -- confirm against the Gmail search-operator docs.
        max_emails: Upper bound on the number of messages fetched. The
            message list is paged, so values above a single API page are
            honoured.

    Returns:
        A list of ``Document`` objects, one per message that could be read
        and has a non-empty body. ``page_content`` holds the subject, sender
        and date lines followed by the body; the same fields plus the
        message id are stored in ``metadata``.
    """
    service = authenticate_gmail()

    # Format dates for Gmail API (YYYY/MM/DD)
    start_date_str = start_date.strftime('%Y/%m/%d')
    end_date_str = end_date.strftime('%Y/%m/%d')

    # Build query
    query = f'after:{start_date_str} before:{end_date_str}'

    # Get message list, following nextPageToken until max_emails is reached.
    page_limit = 500  # largest maxResults accepted by users.messages.list
    messages = []
    page_token = None
    while len(messages) < max_emails:
        list_kwargs = {
            'userId': 'me',
            'q': query,
            'maxResults': min(max_emails - len(messages), page_limit),
        }
        if page_token:
            list_kwargs['pageToken'] = page_token
        result = service.users().messages().list(**list_kwargs).execute()
        messages.extend(result.get('messages', []))
        page_token = result.get('nextPageToken')
        if not page_token:
            break
    messages = messages[:max_emails]

    print(f"Found {len(messages)} emails between {start_date_str} and {end_date_str}")

    # Convert to LangChain documents
    documents = []
    for i, message in enumerate(messages):
        print(f"Processing email {i+1}/{len(messages)}")
        email_data = get_email_content(service, message['id'])

        # Skip messages that failed to load or have no usable body text.
        if not email_data or not email_data['body']:
            continue

        content = (
            f"Subject: {email_data['subject']}\n"
            f"From: {email_data['sender']}\n"
            f"Date: {email_data['date']}\n"
            "\n"
            f"{email_data['body']}"
        )

        documents.append(Document(
            page_content=content,
            metadata={
                "doc_type": "email",
                "subject": email_data['subject'],
                "sender": email_data['sender'],
                "date": email_data['date'],
                "message_id": email_data['id'],
            },
        ))

    return documents

# Date window for the emails to index -- edit these two values as needed.
start_date = datetime(year=2025, month=6, day=20)
end_date = datetime(year=2025, month=6, day=26)

# Fetch up to 200 emails from that window as LangChain documents.
documents = load_gmail_documents(start_date, end_date, max_emails=200)


In [None]:
# Email bodies have had all whitespace collapsed to single spaces, so a
# splitter that only breaks on blank lines cannot divide a long body and
# chunk_size would be ignored. The recursive splitter falls back to
# progressively smaller separators, so chunks stay within chunk_size.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {doc_types}")

In [None]:
# Embed every chunk and keep the vectors in a Vector Store.
# Chroma is a popular open source Vector Database based on SQLite.

embeddings = OpenAIEmbeddings()

# To use the free HuggingFace sentence-transformers embeddings instead,
# replace the line above with:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start clean: drop the collection left behind by a previous run, if any.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Build the vector store from the chunks and persist it under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Inspect the stored vectors: how many there are and how wide each one is.

collection = vectorstore._collection
count = collection.count()

# Fetch a single stored embedding to measure its dimensionality.
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

## Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [None]:
# Prework (with thanks to Jon R for identifying and fixing a bug in this!)
# Pull every stored vector with its chunk text and metadata out of Chroma.
# NOTE: `documents` is rebound here to the list of chunk texts.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']

# Color each point by sender. Senders are sorted so the sender -> color
# assignment is the same on every run, and looked up through a dict rather
# than a list scan per point. Colors repeat once there are more senders
# than palette entries.
senders = [metadata.get('sender', 'unknown') for metadata in metadatas]
unique_senders = sorted(set(senders))
sender_colors = ['blue', 'green', 'red', 'orange', 'purple', 'brown', 'pink', 'gray']
color_by_sender = {
    sender: sender_colors[i % len(sender_colors)]
    for i, sender in enumerate(unique_senders)
}
colors = [color_by_sender[sender] for sender in senders]

In [None]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# t-SNE requires perplexity < number of samples; cap it so that small
# mailboxes (30 chunks or fewer) do not raise.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(vectors) - 1))
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering shows the sender and the first
# 100 characters of the chunk.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Sender: {s}<br>Text: {d[:100]}..." for s, d in zip(senders, documents)],
    hoverinfo='text'
)])

# 2D axis titles belong directly on the layout; `scene` only affects 3D plots.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Let's try 3D!

# t-SNE requires perplexity < number of samples; cap it so that small
# mailboxes (30 chunks or fewer) do not raise.
tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors) - 1))
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot; hovering shows the sender and the first
# 100 characters of the chunk.
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Sender: {s}<br>Text: {d[:100]}..." for s, d in zip(senders, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## LangChain and Gradio to prototype a chat with the LLM


In [None]:

from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# Replace the LLM's standard prompt (the standard prompt makes the answer
# default to 'I don't know' too often, especially when using a small LLM).
# Adjacent string literals are used so that no source indentation leaks into
# the prompt, and the context and question sit on their own lines.
qa_prompt = PromptTemplate.from_template(
    "Use the following pieces of context to answer the user's question. "
    "Answer as best you can given the information you have; "
    "if you have a reasonable idea of the answer, then explain it and mention that you're unsure. "
    "But if you don't know the answer, don't make it up.\n\n"
    "{context}\n\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

# Wrap into a StuffDocumentsChain, matching the variable name 'context'.
# NOTE: the conversation chain below does not use this object; it receives
# qa_prompt through combine_docs_chain_kwargs instead.
combine_docs_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=qa_prompt,
    document_variable_name="context"
)

# set up the conversation memory for the chat
# output_key tells the memory which output to store, since the chain also
# returns source documents.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer'
)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# putting it together: set up the conversation chain with the LLM, the
# vector store retriever, the memory and the custom prompt
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    return_source_documents=True
)


def chat(question, history):
    """Answer *question* through the conversation chain.

    ``history`` is part of the callback signature used by
    ``gr.ChatInterface`` but is not read here: the conversation state lives
    in the chain's ``memory`` object.
    """
    result = conversation_chain.invoke({"question": question})
    return result["answer"]


view = gr.ChatInterface(chat, type="messages").launch(inbrowser=False)