In [40]:
pip install matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [34]:
import poplib
import logging
from email.parser import BytesParser
from email.utils import parsedate_to_datetime

def connect_to_pop3(email_host, email_port, email_user, email_pass):
    try:
        mail = poplib.POP3_SSL(email_host, email_port)
        mail.user(email_user)
        mail.pass_(email_pass)
        logging.info(f"Connected to {email_host}, Total emails: {len(mail.list()[1])}")
        return mail
    except Exception as e:
        logging.error("Failed to connect to POP3 server: " + str(e))
        return None

In [36]:
def get_message_date(mail, msg_num):
    try:
        # Use TOP command to retrieve headers only
        resp, lines, octets = mail.top(msg_num, 0)  # 0 lines of body
        raw_email_bytes = b"\n".join(lines)
        msg = BytesParser(policy=default).parsebytes(raw_email_bytes)
        email_date_str = msg["date"]
        email_date_parsed = parsedate_to_datetime(email_date_str)
        
        if email_date_parsed.tzinfo:
            email_date = email_date_parsed.astimezone().replace(tzinfo=None)
        else:
            email_date = email_date_parsed
        return email_date
    except Exception as e:
        logging.error(f"Error retrieving headers for message {msg_num}: {e}")
        return None

In [37]:
def find_low(mail, start_date, total_messages):
    left = 1
    right = total_messages
    low = None
    while left <= right:
        mid = (left + right) // 2
        email_date = get_message_date(mail, mid)
        if not email_date:
            right = mid - 1  # Adjust and continue
            continue
        if email_date >= start_date:
            low = mid
            right = mid - 1
        else:
            left = mid + 1
    return low


In [63]:
def fetch_and_save_emails(mail, start_date, end_date, output_dir="emails", num_messages=None):
    if not mail:
        return []
    os.makedirs(output_dir, exist_ok=True)
    total_messages = len(mail.list()[1])
    email_file_info = []
    
    # Find the low and high bounds using binary search
    low = find_low(mail, start_date, total_messages)
    high = find_high(mail, end_date, total_messages)
    
    if low is None or high is None or low > high:
        logging.info("No messages found within the specified date range.")
        return email_file_info
    
    scanned = 0
    # Iterate from high to low (newest to oldest within the range)
    for i in range(high, low - 1, -1):
        if num_messages and scanned >= num_messages:
            break
        try:
            resp, lines, octets = mail.retr(i)
            raw_email_bytes = b"\n".join(lines)
            msg = BytesParser(policy=default).parsebytes(raw_email_bytes)
            email_date_str = msg["date"]
            email_date_parsed = parsedate_to_datetime(email_date_str)
            if email_date_parsed.tzinfo:
                email_date = email_date_parsed.astimezone().replace(tzinfo=None)
            else:
                email_date = email_date_parsed
            # Double-check date is within range (in case of out-of-order dates)
            if not (start_date <= email_date <= end_date):
                logging.info(f"Skipping email {i} outside date range: {email_date}")
                continue
            subject = msg["subject"] or "No Subject"
            logging.info(f"Saving email {i}: {subject} on {email_date_str}")
            eml_filename = os.path.join(output_dir, f"email_{i}.eml")
            with open(eml_filename, "wb") as f:
                f.write(raw_email_bytes)
            email_file_info.append((eml_filename, email_date.isoformat(), msg["Subject"]))
            scanned += 1
        except Exception as e:
            logging.error(f"Error processing email {i}: {e}")
    return email_file_info


In [39]:
def extract_email_content(eml_file):
    try:
        with open(eml_file, "rb") as f:
            msg = BytesParser(policy=policy.default).parse(f)
        subject = msg["subject"] if msg["subject"] else "No Subject"
        sender = msg["from"] if msg["from"] else "Unknown Sender"
        body = None
        if msg.is_multipart():
            for part in msg.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition"))
                if content_type == "text/plain" and "attachment" not in content_disposition:
                    charset = part.get_content_charset() or "utf-8"
                    body = part.get_payload(decode=True).decode(charset, errors="ignore")
                    break
        if body is None:
            if msg.get_body(preferencelist=("plain",)):
                body = msg.get_body(preferencelist=("plain",)).get_content()
            elif msg.get_body(preferencelist=("html",)):
                body = "HTML Email: " + msg.get_body(preferencelist=("html",)).get_content()
            else:
                body = "No readable content available."
        logging.info(f"Extracted Email: {subject} from {sender}")
        return {"subject": subject, "sender": sender, "body": body}
    except Exception as e:
        logging.error(f"Error processing {eml_file}: {e}")
        return {"subject": "Error", "sender": "Error", "body": "Could not process email."}


In [64]:
mail  = connect_to_pop3(
    "mail.streamax.com", 
    995, 
    "kmhuang@streamax.com", 
    "atstreamaM4;"
    )

print(mail)

2025-04-17 17:46:25,607 - INFO - Connected to mail.streamax.com, Total emails: 3096


<poplib.POP3_SSL object at 0x11f52e6c0>


In [65]:
from datetime import datetime

In [66]:
output = fetch_and_save_emails(
    mail, 
    start_date=datetime(2025, 4, 15), 
    end_date=datetime(2025, 4, 16), 
    output_dir="emails", 
    num_messages=None
    )

2025-04-17 17:46:27,158 - INFO - Saving email 3057: 【汇报类】紫金锌业项目—个人工作日报20250415 on Tue, 15 Apr 2025 23:58:48 +0800
2025-04-17 17:46:27,169 - INFO - Saving email 3056: 【工作日报】20250415-紫金锌业矿山项目 on Tue, 15 Apr 2025 23:34:07 +0800
2025-04-17 17:46:27,178 - INFO - Saving email 3055: 【汇报类】新疆喀什紫金锌业矿山项目日报4.15 on Tue, 15 Apr 2025 22:37:28 +0800
2025-04-17 17:46:27,185 - INFO - Saving email 3054: 【汇报类】彭昌杰4月15日沙特公交项目工作日志 on Tue, 15 Apr 2025 22:30:39 +0800
2025-04-17 17:46:27,334 - INFO - Saving email 3053: Re: Re: 今日工作日报 on Tue, 15 Apr 2025 21:40:19 +0800
2025-04-17 17:46:27,342 - INFO - Saving email 3052:  【项目日报】深圳出租项目 on Tue, 15 Apr 2025 20:50:28 +0800
2025-04-17 17:46:27,435 - INFO - Saving email 3051: 4月15号 深圳出租夜班运维值班日报 on Tue, 15 Apr 2025 20:46:22 +0800
2025-04-17 17:46:27,449 - INFO - Saving email 3050: US Video Block Data 20250414 on Tue, 15 Apr 2025 12:28:28 +0000
2025-04-17 17:46:27,506 - INFO - Saving email 3049: 【汇报类】产品质量(双)周报（3月31日-4月11日） on Tue, 15 Apr 2025 19:20:10 +0800
2025-04-17 17

In [67]:
output

[('emails/email_3057.eml',
  '2025-04-15T23:58:48',
  '【汇报类】紫金锌业项目—个人工作日报20250415'),
 ('emails/email_3056.eml', '2025-04-15T23:34:07', '【工作日报】20250415-紫金锌业矿山项目'),
 ('emails/email_3055.eml', '2025-04-15T22:37:28', '【汇报类】新疆喀什紫金锌业矿山项目日报4.15'),
 ('emails/email_3054.eml', '2025-04-15T22:30:39', '【汇报类】彭昌杰4月15日沙特公交项目工作日志'),
 ('emails/email_3053.eml', '2025-04-15T21:40:19', 'Re: Re: 今日工作日报'),
 ('emails/email_3052.eml', '2025-04-15T20:50:28', ' 【项目日报】深圳出租项目'),
 ('emails/email_3051.eml', '2025-04-15T20:46:22', '4月15号 深圳出租夜班运维值班日报'),
 ('emails/email_3050.eml',
  '2025-04-15T20:28:28',
  'US Video Block Data 20250414'),
 ('emails/email_3049.eml',
  '2025-04-15T19:20:10',
  '【汇报类】产品质量(双)周报（3月31日-4月11日）'),
 ('emails/email_3048.eml',
  '2025-04-15T18:50:45',
  '【汇报类】工作日报--深圳出租--SIM卡--深圳公交'),
 ('emails/email_3047.eml', '2025-04-15T18:46:18', '【汇报类】深圳机场-国际货站车辆项目-宋宇工作日报'),
 ('emails/email_3046.eml', '2025-04-15T18:45:43', '20250415_徐家辉_日报'),
 ('emails/email_3045.eml',
  '2025-04-15T18:32:44',
  '回复:  回复

In [71]:
pip install ace-tools

Collecting ace-tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace-tools
Successfully installed ace-tools-0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [74]:
pip install ace_tools_open

Collecting ace_tools_open
  Downloading ace_tools_open-0.1.0-py3-none-any.whl.metadata (1.1 kB)
Collecting itables (from ace_tools_open)
  Downloading itables-2.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading ace_tools_open-0.1.0-py3-none-any.whl (3.0 kB)
Downloading itables-2.3.0-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: itables, ace_tools_open
Successfully installed ace_tools_open-0.1.0 itables-2.3.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [75]:
import pandas as pd
import ace_tools_open as tools

In [77]:
# Create a pandas DataFrame
df = pd.DataFrame(output, columns=["email_file", "email_date", "subject"])

# Convert the 'email_date' column to datetime
df['email_date'] = pd.to_datetime(df['email_date'])

tools.display_dataframe_to_user(name="Email Data", dataframe=df) 

# df.head()  # Show the first few rows to confirm

Email Data


email_file,email_date,subject
Loading ITables v2.3.0 from the internet... (need help?),,


In [79]:
df

Unnamed: 0,email_file,email_date,subject
0,emails/email_3057.eml,2025-04-15 23:58:48,【汇报类】紫金锌业项目—个人工作日报20250415
1,emails/email_3056.eml,2025-04-15 23:34:07,【工作日报】20250415-紫金锌业矿山项目
2,emails/email_3055.eml,2025-04-15 22:37:28,【汇报类】新疆喀什紫金锌业矿山项目日报4.15
3,emails/email_3054.eml,2025-04-15 22:30:39,【汇报类】彭昌杰4月15日沙特公交项目工作日志
4,emails/email_3053.eml,2025-04-15 21:40:19,Re: Re: 今日工作日报
5,emails/email_3052.eml,2025-04-15 20:50:28,【项目日报】深圳出租项目
6,emails/email_3051.eml,2025-04-15 20:46:22,4月15号 深圳出租夜班运维值班日报
7,emails/email_3050.eml,2025-04-15 20:28:28,US Video Block Data 20250414
8,emails/email_3049.eml,2025-04-15 19:20:10,【汇报类】产品质量(双)周报（3月31日-4月11日）
9,emails/email_3048.eml,2025-04-15 18:50:45,【汇报类】工作日报--深圳出租--SIM卡--深圳公交


In [78]:
get_message_date(mail, msg_num=25)

2025-04-17 17:55:16,769 - ERROR - Error retrieving headers for message 25: -ERR EOF


# Vector database test

## Built vector database

In [80]:
import os
import gc
import uuid
import shutil
import json
import time
import base64
import poplib
import logging
import tempfile
import concurrent.futures
from datetime import datetime
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from email.policy import default

import streamlit as st
from openai import OpenAI
import openai
import time

import threading
import re

import tcvectordb
from tcvectordb.model.enum import FieldType, IndexType, MetricType, ReadConsistency
from tcvectordb.model.index import Index, VectorIndex, FilterIndex, HNSWParams
from tcvectordb.model.document import Document, SearchParams, Filter
from tcvectordb.model.collection import Embedding

In [None]:
def build_vectordb_from_emails(emails):
    # st.info("Building vector index for emails...")
    # Create or reuse a database
    db = tencent_vectordb_client.create_database(database_name='email_db')
    # Define index fields and embedding configuration
    index = Index(
        FilterIndex(name='id', field_type=FieldType.String, index_type=IndexType.PRIMARY_KEY),
        VectorIndex(
            name='vector',
            dimension=1024,  # Adjust based on your model's dimension
            index_type=IndexType.HNSW,
            metric_type=MetricType.COSINE,
            params=HNSWParams(m=16, efconstruction=200)
        ),
        FilterIndex(name='subject', field_type=FieldType.String, index_type=IndexType.FILTER)
    )
    embedding_conf = Embedding(vector_field='vector', field='text', model_name='BAAI/bge-m3')
    collection = db.create_collection(
        name='email_emb',
        shard=1,
        replicas=1,
        description='Collection for email embeddings',
        embedding=embedding_conf,
        index=index
    )
    docs = []
    for i, email in enumerate(emails):
        combined_text = (
            f"Subject: {email['subject']}\n"
            f"From: {email['sender']}\n"
            f"Date: {email['date']}\n"
            f"Body:\n{email['body']}"
        )
        # Use chunk_text to split if the combined text is too long
        if len(combined_text) > 8192:
            chunks = chunk_text(combined_text, max_length=8192, overlap=500)
            # st.info(f"Email {i} split into {len(chunks)} chunks.")
        else:
            chunks = [combined_text]
        for j, chunk in enumerate(chunks):
            doc = Document(
                id=f"email_{i}_chunk_{j}",
                text=chunk,
                subject=email['subject'],
                sender=email['sender'],
                date=email['date']
            )
            docs.append(doc)
    # st.info(f"Total documents (chunks) to upsert: {len(docs)}")
    # st.info("Upserting documents in batches...")
    # st.info(f"Upserting {len(docs)} email chunks into vector DB in batches...")
    parallel_batch_upsert(docs, batch_size=5, timeout=1000, delay=3)
    # st.info("Vector index built successfully.")
    return collection