In [None]:
import os
import json
import time
from tqdm import tqdm
from typing import List
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from tenacity import retry, stop_after_attempt, wait_random_exponential
import tiktoken

# Alias kept for backward compatibility; nothing in the visible code references it.
RateLimitError = Exception

# ====== CONFIG ======
SOURCE_JSON = "parsed_course_data.json"  # input: list of parsed course records
CHROMA_DIR = "persist/chroma_data"  # Chroma persistence directory
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # raises KeyError early if the key is not set

EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
MAX_TOKENS_PER_BATCH = 250000  # token budget used when grouping documents into batches
BATCH_DELAY_SECONDS = 1.5  # pause after each batch is written
# =====================

# Initialise the embedding client and the tokenizer from the SAME model name, so the
# token counts used for batching always describe the model that actually embeds the text.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME, api_key=OPENAI_API_KEY)
tokenizer = tiktoken.encoding_for_model(EMBEDDING_MODEL_NAME)

# Load the course records.
with open(SOURCE_JSON, "r", encoding="utf-8") as f:
    raw_courses = json.load(f)


# ✅ 時間欄位轉 metadata（含字串與布林標記）
def build_time_metadata(time_slots):
    """Flatten a course's time slots into metadata fields.

    Args:
        time_slots: A list of time-slot identifiers. Any non-list value
            (e.g. ``None``) is treated as "no time slots".

    Returns:
        A dict with a ``"time_slots"`` key holding the comma-joined slots
        (empty string when there are none) plus one ``"ts_<slot>": True``
        flag per slot.
    """
    if not isinstance(time_slots, list):
        return {"time_slots": ""}

    # Stringify once and reuse for both the joined field and the flag keys, so a
    # non-string slot (e.g. an int) no longer makes ``str.join`` raise TypeError
    # while the flag keys would have accepted it.
    slot_names = [str(ts) for ts in time_slots]
    time_flags = {f"ts_{name}": True for name in slot_names}
    return {"time_slots": ",".join(slot_names), **time_flags}


# ✅ 建立 Document 列表
documents = []
for c in raw_courses:
    # Optional fields fall back to an empty string; title, description and
    # course_id are required and raise KeyError when absent.
    instructor = c.get("instructor", "")
    course_url = c.get("course_url", "")

    # Text that gets embedded: one labelled line per field.
    content = "\n".join(
        [
            f"課程名稱：{c['title']}",
            f"課程介紹：{c['description']}",
            f"授課老師：{instructor}",
            f"課程網址：{course_url}",
        ]
    )

    # Metadata stored alongside the text, extended with the time-slot fields.
    metadata = {
        "course_id": c["course_id"],
        "title": c["title"],
        "instructor": instructor,
        "course_url": course_url,
    }
    metadata.update(build_time_metadata(c.get("time_slots", [])))

    documents.append(Document(page_content=content, metadata=metadata))


# ✅ 計算 token 數
def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


# ✅ 按 token 數分批
def batch_by_token_limit(docs: List[Document], max_tokens: int):
    """Yield consecutive groups of documents that fit a token budget.

    Documents are taken in order and accumulated until adding the next one
    would push the running token count above ``max_tokens``; the accumulated
    group is then yielded and a new one is started.

    A document is always placed in some group, so a single document whose own
    token count exceeds ``max_tokens`` is yielded in a group by itself.

    Args:
        docs: Documents to group; token counts come from ``page_content``.
        max_tokens: Token budget per group.

    Yields:
        Non-empty lists of documents.
    """
    current = []
    used = 0
    for doc in docs:
        cost = count_tokens(doc.page_content)
        would_overflow = used + cost > max_tokens
        # Only flush a non-empty group, so an oversized document never yields an empty batch.
        if current and would_overflow:
            yield current
            current = []
            used = 0
        current.append(doc)
        used += cost

    # Emit whatever is left after the last document.
    if current:
        yield current


# ✅ 安全封裝的 add_texts，內建 retry
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(min=2, max=10),
)
def safe_add_texts(vectordb, texts, metadatas):
    """Write one batch to the vector store, retrying on failure.

    The call is attempted at most 5 times with a randomized exponential wait
    (configured with ``min=2``, ``max=10``) between attempts. No ``retry=``
    predicate is given, so tenacity's default retry condition applies.

    Args:
        vectordb: Vector store exposing ``add_texts(texts=..., metadatas=...)``.
        texts: Texts to embed and store.
        metadatas: Metadata dicts, one per text.
    """
    vectordb.add_texts(metadatas=metadatas, texts=texts)


# ✅ 初始化 Chroma 向量庫
vectordb = Chroma(embedding_function=embedding_model, persist_directory=CHROMA_DIR)

# Embed and write the documents batch by batch.
# NOTE(review): no ids are passed to the store, so re-running this cell against an
# existing CHROMA_DIR presumably appends the same courses again -- confirm.
batches = list(batch_by_token_limit(documents, MAX_TOKENS_PER_BATCH))
for i, batch in enumerate(tqdm(batches, desc="Embedding batches")):
    texts, metadatas = [], []
    for doc in batch:
        texts.append(doc.page_content)
        metadatas.append(doc.metadata)

    safe_add_texts(vectordb, texts, metadatas)
    # Pause after every batch before sending the next one.
    time.sleep(BATCH_DELAY_SECONDS)

vectordb.persist()
print(f"✅ 完成：總共處理 {len(documents)} 筆課程，結果已儲存於 {CHROMA_DIR}")

  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding_model)
Embedding batches: 100%|██████████| 8/8 [01:55<00:00, 14.44s/it]

✅ 完成：總共處理 1300 筆課程，結果已儲存於 ../persist/chroma_data



  vectordb.persist()


In [None]:
import zipfile
import os


def zip_chroma_data(folder_path, output_zip="chroma_data.zip"):
    """Compress every file under ``folder_path`` into a deflated zip archive.

    Archive member names are relative to ``folder_path``, so extracting the
    archive recreates the folder's contents without the leading directories.

    Args:
        folder_path: Directory whose files are archived (recursively).
        output_zip: Path of the zip file to create (overwritten if present).

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
            Checked before the archive is opened, so a wrong path no longer
            leaves behind an empty zip that looks like a successful export.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Folder to zip does not exist: {folder_path!r}")

    # The archive is created before the walk starts; if it lives inside
    # folder_path it would otherwise be added to itself.
    output_abs = os.path.abspath(output_zip)

    with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            for file in sorted(files):
                filepath = os.path.join(root, file)
                if os.path.abspath(filepath) == output_abs:
                    continue
                arcname = os.path.relpath(filepath, start=folder_path)
                zipf.write(filepath, arcname)


# Archive the Chroma folder; change the path below to your own Chroma data directory.
zip_chroma_data(folder_path="persist/chroma_data")
print("✅ 已壓縮為 chroma_data.zip")

✅ 已壓縮為 chroma_data.zip


In [8]:
from azure.storage.blob import BlobServiceClient
import os
from dotenv import load_dotenv

# Load the storage credentials from .env / the process environment.
load_dotenv()
account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
container_name = os.getenv("AZURE_BLOB_CONTAINER")

# Fail fast on missing settings instead of building a connection string that
# contains the literal text "None".
if not all([account_name, account_key, container_name]):
    raise ValueError("❌ 請確認 .env 中帳號資訊是否齊全")

connection_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connection_str)

try:
    container_client = blob_service_client.get_container_client(container_name)
    # list_blobs() returns a lazy pager; consume it first so the success message
    # is only printed once the service has actually answered.
    blob_names = [blob.name for blob in container_client.list_blobs()]
    print("✅ 成功連線，列出 container 中的 blob:")
    for blob_name in blob_names:
        print(" -", blob_name)
except Exception as e:
    # Diagnostic cell: report the failure instead of propagating it.
    print("❌ 連線失敗，請檢查 account name/key/container name")
    print("錯誤訊息：", e)

✅ 成功連線，列出 container 中的 blob:


In [9]:
from azure.storage.blob import BlobClient
from azure.storage.blob import ContentSettings

# Smoke test: upload a tiny text file to confirm that writes to the container work.
test_blob_name = "test-upload.txt"
test_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=test_blob_name,
)

# Create the local file that will be uploaded.
with open("test-upload.txt", "w", encoding="utf-8") as f:
    f.write("這是一個測試檔案")

# Upload it as text/plain, replacing any existing blob with the same name.
plain_text_settings = ContentSettings(content_type="text/plain")
with open("test-upload.txt", "rb") as data:
    test_blob_client.upload_blob(data, content_settings=plain_text_settings, overwrite=True)
print("✅ 測試檔案已上傳")

✅ 測試檔案已上傳


In [17]:
import os
from dotenv import load_dotenv
from azure.storage.blob import BlobClient, ContentSettings
from tqdm import tqdm

# --------------- Upload settings ----------------
LOCAL_FILE = "chroma_data.zip"  # local archive to upload
BLOB_NAME = "course_vector.zip"  # destination blob name
CONTENT_TYPE = "application/zip"
MAX_SINGLE_PUT_SIZE = 16 * 1024**2  # 16MB
MAX_BLOCK_SIZE = 4 * 1024**2  # 4MB
TIMEOUT = 600
MAX_CONCURRENCY = 4

# --------------- Credentials ----------------
load_dotenv()
ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
ACCOUNT_KEY = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
CONTAINER_NAME = os.getenv("AZURE_BLOB_CONTAINER")

# Every setting must be present and non-empty before a connection string is built.
if not (ACCOUNT_NAME and ACCOUNT_KEY and CONTAINER_NAME):
    raise ValueError("❌ 請確認 .env 中帳號資訊是否齊全")

# --------------- Connection string ----------------
AZURE_CONN_STR = ";".join(
    [
        "DefaultEndpointsProtocol=https",
        f"AccountName={ACCOUNT_NAME}",
        f"AccountKey={ACCOUNT_KEY}",
        "EndpointSuffix=core.windows.net",
    ]
)


# --------------- 📦 準備上傳檔案 ----------------
class TqdmUploadWrapper:
    """File-like wrapper that advances a tqdm progress bar as data is read.

    ``read`` is intercepted to update the bar; every other attribute is
    forwarded to the wrapped file object.

    Attributes are documented here rather than inline:
        file: The wrapped binary file object.
        progress_bar: tqdm bar measured in bytes; the caller is responsible
            for closing it.
    """

    def __init__(self, file, total):
        """Wrap ``file`` and create a byte-scaled progress bar of size ``total``."""
        self.file = file
        self.progress_bar = tqdm(total=total, unit="B", unit_scale=True, desc="📤 Uploading")

    def read(self, size=-1):
        """Read up to ``size`` bytes (everything when omitted) and record progress.

        The default mirrors the standard file ``read`` signature, so callers
        that invoke ``read()`` without an argument keep working.
        """
        data = self.file.read(size)
        self.progress_bar.update(len(data))
        return data

    def __getattr__(self, attr):
        """Forward any attribute not defined on the wrapper to the wrapped file."""
        # __getattr__ only runs when normal lookup fails. If ``file`` itself is
        # missing (instance created without __init__), forwarding would look up
        # ``self.file`` again and recurse without end.
        if attr in ("file", "progress_bar"):
            raise AttributeError(attr)
        return getattr(self.file, attr)


# --------------- Create the BlobClient and upload ----------------
# The file size is the progress bar's total.
file_size = os.path.getsize(LOCAL_FILE)

blob_client = BlobClient.from_connection_string(
    conn_str=AZURE_CONN_STR,
    container_name=CONTAINER_NAME,
    blob_name=BLOB_NAME,
    max_single_put_size=MAX_SINGLE_PUT_SIZE,
    max_block_size=MAX_BLOCK_SIZE,
)

with open(LOCAL_FILE, "rb") as f:
    wrapped = TqdmUploadWrapper(f, total=file_size)
    try:
        blob_client.upload_blob(
            data=wrapped,
            blob_type="BlockBlob",
            overwrite=True,
            content_settings=ContentSettings(content_type=CONTENT_TYPE),
            max_concurrency=MAX_CONCURRENCY,
            timeout=TIMEOUT,
        )
    finally:
        # Close the bar even when the upload raises, so a failed run does not
        # leave a dangling progress bar in the notebook output.
        wrapped.progress_bar.close()

print(f"\n✅ 上傳完成：{BLOB_NAME} 至 container {CONTAINER_NAME}")



[A[A



[A[A

[A[A

📤 Uploading: 100%|██████████| 23.8M/23.8M [01:03<00:00, 378kB/s]


✅ 上傳完成：course_vector.zip 至 container course-data



