Extract emails from an `mbox` file. In Apple Mail, you can right-click on a mailbox and choose "Export" to save it as an mbox file.

In [None]:
# Path of the exported mbox file to read; replace the placeholder before running.
source_mbox = '[your extracted mailbox]'
# Directory that receives one email_<n>.eml file per message; replace the placeholder.
extracted_mail_dir = '[a temporary location]'

import mailbox 

# copied from https://stackoverflow.com/questions/26567843/reading-the-mail-content-of-an-mbox-file-using-python-mailbox
def getbody(message):
    """Return the plain-text body of an email message as bytes.

    Visits the MIME parts in document order and returns the payload of the
    first ``text/plain`` part that is not marked as an attachment. Taking the
    first match (rather than the last) keeps a ``.txt`` attachment from
    replacing the actual body text.

    Args:
        message: An ``email.message.Message`` such as the items yielded when
            iterating a ``mailbox.mbox``.

    Returns:
        The transfer-decoded payload as ``bytes``, or ``None`` when the
        message has no suitable ``text/plain`` part.
    """
    # walk() yields the message itself and every nested part, so one pass
    # covers single-part and arbitrarily nested multipart messages alike.
    for part in message.walk():
        if part.is_multipart():
            # Containers carry no text of their own.
            continue
        if part.get_content_type() != 'text/plain':
            continue
        if part.get_content_disposition() == 'attachment':
            # A plain-text attachment is not the body.
            continue
        return part.get_payload(decode=True)
    return None

def extract_emails(mbox_file, output_folder):
    """Write every message of an mbox file to its own text file.

    Message number ``n`` (counting from 1) is written to
    ``<output_folder>/email_<n>.eml`` as ``From:``, ``To:`` and ``Date:``
    lines followed by the plain-text body returned by ``getbody``.

    Args:
        mbox_file: Path of the mbox file to read.
        output_folder: Directory to write the files to; created if missing.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
    """
    import os  # local import so this function does not rely on other cells

    # create=False: a wrong path raises instead of silently creating an
    # empty mailbox and extracting nothing.
    mbox = mailbox.mbox(mbox_file, create=False)
    try:
        os.makedirs(output_folder, exist_ok=True)
        for i, message in enumerate(mbox, start=1):
            body = getbody(message)
            # getbody returns raw bytes (or None). Calling str() on bytes
            # would write the "b'...'" repr, so decode explicitly. The part's
            # charset is not available here; undecodable bytes are replaced.
            if isinstance(body, bytes):
                text = body.decode("utf-8", errors="replace")
            elif body is None:
                text = ""
            else:
                text = str(body)
            headers = (
                f"From: {message.get('From')}\n"
                f"To: {message.get('To')}\n"
                f"Date: {message.get('Date')}\n"
            )
            target = os.path.join(output_folder, f"email_{i}.eml")
            with open(target, "w", encoding="utf-8") as f:
                f.write(headers + text)
    finally:
        # Release the file handle held by the mailbox.
        mbox.close()

# Write each message in source_mbox to its own file in extracted_mail_dir.
extract_emails(source_mbox, extracted_mail_dir)

These are the LlamaIndex imports. See `README.md` for how to install them.

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

Start the local LLM and select the embedding model.
Make sure to run the model in Ollama first:

`$ ollama run llama3.2`

In [5]:
import openai
# Placeholder value only; the LLM and embedding model configured below are
# Ollama and Hugging Face ones, not OpenAI.
openai.api_key="not used -- this is a local model"
# Use the llama3.2 model through Ollama as the LLM.
# request_timeout is set to 360.0 (presumably seconds -- TODO confirm).
Settings.llm = Ollama(model="llama3.2", request_timeout=360.0)
# Use the BAAI/bge-base-en-v1.5 Hugging Face model to compute embeddings.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

Set up the database

In [3]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# constants: replace both placeholders before running this cell
collection="[what you want to call your Chroma collection]"
chroma_db_path="[where to store the database]" # note: must exist

# initialize a persistent Chroma client that keeps its data under chroma_db_path
db = chromadb.PersistentClient(path=chroma_db_path)

# get the named collection, or create it on first use
chroma_collection = db.get_or_create_collection(collection)

# wrap the collection as a vector store and build the storage context on top of it;
# both names are used by the indexing cells further down
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

HEADS UP -- if you have already added your emails to the store, skip this step. It can take a _VERY LONG TIME_, so start with a small subset (see the commented-out slice in the code).

In [None]:
# load the extracted emails from extracted_mail_dir as documents
documents = SimpleDirectoryReader(extracted_mail_dir).load_data()

# if you have a lot of messages, or you want to try things out, add a slice, for example
# documents = documents[0:500]

# create your index from the documents; storage_context is the one built
# around the Chroma vector store, and show_progress=True asks for progress output
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)

HEADS UP again -- if you just created the index in the step above, skip this step.

In [13]:
# load your index from the vectors already stored in the Chroma vector store,
# instead of re-reading the emails
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)


Now for the fun part: ask some questions about your email :)

In [None]:
# Build a streaming query engine on the index, then ask a first question
# and print the answer as it streams in.
query_engine = index.as_query_engine(
    streaming=True,
    response_mode="tree_summarize",
    verbose=True,
)
response = query_engine.query("What are the most common subjects in these emails?")
response.print_response_stream()

In [None]:
# Ask another question with the same query_engine and stream the answer.
response = query_engine.query("Which sender uses the shortest sentences in their emails and what do they write about?")
response.print_response_stream()