In [1]:
def meta_func(obj: dict, default_meta: dict) -> dict:
    """Build the metadata dict for one recipe record.

    The result starts as a copy of ``default_meta`` and is then filled with
    recipe fields read from ``obj``. A recipe field overrides a key of the
    same name already present in ``default_meta``.

    Args:
        obj: One parsed recipe record; only ``.get`` lookups are performed.
        default_meta: Base metadata entries. This dict is not modified.

    Returns:
        A new dict. Text fields absent from ``obj`` are ``None``; count
        fields absent from ``obj`` are ``0``.
    """
    # Fields copied under the same key, None when absent.
    text_fields = ("recipe_name", "recipe_link", "user_name")
    # (output key, key in obj) pairs, 0 when absent. The two user counters
    # are stored in obj without the "_count" suffix.
    count_fields = (
        ("user_recipe_count", "user_recipe"),
        ("user_fans_count", "user_fans"),
        ("view_count", "view_count"),
        ("like_count", "like_count"),
        ("together_count", "together_count"),
        ("comment_count", "comment_count"),
    )

    metadata = dict(default_meta)
    for field in text_fields:
        metadata[field] = obj.get(field)
    for out_key, src_key in count_fields:
        metadata[out_key] = obj.get(src_key, 0)
    metadata["date"] = obj.get("date")
    return metadata

In [2]:
import json
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def _recipe_page_content(obj):
    """Join one recipe record's text fields into a newline-separated string."""
    ingredient_lines = []
    for ing in (obj.get("ingredients") or []):
        name = ing.get("name")
        amount = ing.get("amount")
        # A null name/amount becomes blank instead of the literal text "None",
        # matching how the other fields treat missing values.
        ingredient_lines.append(
            f"{'' if name is None else name} {'' if amount is None else amount}"
        )

    # Sort steps by the integer value of their key so "10" comes after "9";
    # keys must be convertible with int().
    ordered_steps = sorted(
        (obj.get("steps") or {}).items(), key=lambda kv: int(kv[0])
    )

    content_parts = [
        obj.get("recipe_name") or "",
        obj.get("description") or "",
        ", ".join(obj.get("tags") or []),
        "\n".join(ingredient_lines),
        "\n".join(f"{k}. {v}" for k, v in ordered_steps),
    ]
    return "\n".join(content_parts)


def build_docs_from_jsonl(path, meta_func):
    """Read a JSON Lines file and build one ``Document`` per record.

    Each non-blank line is parsed as a JSON object. Its recipe name,
    description, tags, ingredients and steps are joined into ``page_content``;
    ``metadata`` is whatever ``meta_func`` returns for the record.

    Args:
        path: Path of the UTF-8 encoded JSONL file.
        meta_func: Callable ``(obj, default_meta) -> dict`` where
            ``default_meta`` is ``{"source": path, "seq_num": <line index>}``.

    Returns:
        A list of ``Document`` objects in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Blank lines (e.g. a trailing empty line at end of file) are not
            # records; skip them rather than let json.loads raise. seq_num
            # stays the raw line index, so existing numbering is unchanged.
            if not line.strip():
                continue
            obj = json.loads(line)
            # 1. Assemble the page_content string.
            page_content = _recipe_page_content(obj)
            # 2. Build the metadata.
            default_meta = {"source": path, "seq_num": i}
            metadata = meta_func(obj, default_meta)
            docs.append(Document(page_content=page_content, metadata=metadata))
    return docs

In [3]:
# Load every recipe record as a Document, then preview the first one:
# the first 100 characters of its text followed by its metadata.
docs = build_docs_from_jsonl(path="./output/data_0510_2130.jsonl", meta_func=meta_func)
print(docs[0].page_content[:100], docs[0].metadata, sep="\n")

かにの雪Q餅
描述小時候的奇福餅乾與其他材料混合的口感，真是會讓人一口接一口，停不下來喔！配茶，配咖啡都很讚的。
素食, 蔬食
棉花糖 200g
奶油 85g
蔓越莓乾 60g
奶粉 85g
奇福餅乾
{'source': './output/data_0510_2130.jsonl', 'seq_num': 0, 'recipe_name': 'かにの雪Q餅', 'recipe_link': 'https://icook.tw/recipes/235470', 'user_name': 'かに小玉の幸福食堂', 'user_recipe_count': 279, 'user_fans_count': 862, 'view_count': 6293, 'like_count': 213, 'together_count': 1, 'comment_count': 1, 'date': '2018-01-10'}


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Split each document into chunks of at most 1000 characters, with a
# 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

# Embed the chunks and store them in the "recipes" Chroma collection under
# ./recipe_chroma_db.
# NOTE(review): the sample recipe text is Chinese/Japanese; confirm this
# embedding model handles it well, or consider a multilingual model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
vectordb = Chroma.from_documents(
    collection_name="recipes",
    persist_directory="./recipe_chroma_db",
    embedding=embeddings,
    documents=chunks,
)





  vectordb.persist()
