## 嵌入模型
***
- embed_documents
- embed_query
- 嵌入缓存
- 使用国产的嵌入模型

#### embed_documents
****

In [1]:
from langchain_openai import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings()
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1536)

#### embed_query
****

In [2]:
query_embedding = embeddings_model.embed_query("What is the meaning of life?")
print(query_embedding)

[0.004411248955875635, -0.0296554546803236, -0.00819849781692028, -0.003228119807317853, -0.026195652782917023, -0.018806500360369682, -0.01920190639793873, 0.011454419232904911, -0.021265432238578796, -0.0014070895267650485, 0.0019152481108903885, 0.016829470172524452, 0.014024559408426285, -0.006524200085550547, 0.014667093753814697, -0.0031817832496017218, 0.040454983711242676, -0.007846339605748653, 0.0035802784841507673, -0.012547964230179787, 0.0031756050884723663, 0.003774892305955291, -0.002380159217864275, -0.008464161306619644, -0.012455291114747524, 0.005019803531467915, 0.01802804507315159, -0.030347416177392006, 0.023464879021048546, -0.024292759597301483, 0.026566345244646072, -0.01581624336540699, -0.020524045452475548, -0.0062678041867911816, -0.013048400171101093, -0.0004958021454513073, 0.011744796298444271, 0.0011645944323390722, -0.006412992253899574, -0.0051341005600988865, 0.025355413556098938, 0.020178066566586494, 0.0008147527114488184, 0.019276045262813568, -0.

### 缓存嵌入结果
****


In [3]:
! pip install --upgrade --quiet  langchain-openai faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

underlying_embeddings = OpenAIEmbeddings()

store = LocalFileStore("/tmp/langchain_cache")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)

In [5]:
list(store.yield_keys())

[]

In [6]:
raw_documents = TextLoader("meow.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

创建向量存储

In [7]:
%%time
db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 21.3 ms, sys: 11.5 ms, total: 32.8 ms
Wall time: 2.58 s


再次创建将读取缓存，从而加快速度降低成本

In [10]:
%%time
db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 1.75 ms, sys: 1.28 ms, total: 3.03 ms
Wall time: 2.49 ms


查看缓存

In [11]:
list(store.yield_keys())[:5]

['text-embedding-ada-00263c0fca5-f3da-5691-8a86-2beceea627f5']

#### 使用国产嵌入模型

****

筛选模型： https://cloud.siliconflow.cn/models?types=embedding

In [12]:
from langchain_openai import OpenAIEmbeddings
import os
embeddings_model = OpenAIEmbeddings(
    model="BAAI/bge-m3",
    api_key=os.environ.get("DEEPSEEK_API_KEY"),
    base_url=os.environ.get("DEEPSEEK_API_BASE")+"/v1",
)

In [13]:
embeddings = embeddings_model.embed_documents(
    [
        "床前明月光",
        "疑是地上霜",
        "举头望明月",
        "低头思故乡",
        "李白《静夜思》"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1024)