In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os

In [2]:
docs = TextLoader('../tests/state_of_the_union.txt',encoding='utf-8').load()

In [3]:
print(docs[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blockin

In [4]:
documents  = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0).split_documents(docs)

In [5]:
documents

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', metadata={'source': '../tests/state_of_the_union.txt'}),
 Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', metadata={'source': '../tests/state_of_the_union.txt'}),
 Document(page_content='With a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.', metadata={'source': '../tests/state_of_the_union.txt'}),
 Document(page_content='Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.', metadata={'source': '../tests/state_of_the_union.txt'}),
 Document

In [6]:
type(documents)

list

In [6]:
[document.page_content for document in documents]

['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.',
 'Last year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.',
 'With a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.',
 'Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.',
 'He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people.',
 'From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.',
 'Groups of citizens blocking tanks with 

In [7]:
embeddings_model = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"),base_url=os.getenv("OPENAI_BASE_URL"))

In [8]:
embeddings_model.embed_query(documents[0].page_content)

[-0.020120255503122006,
 -0.004595722038485556,
 -0.024042108853084974,
 -0.022164217966931226,
 -0.01102462309796452,
 0.023569444023445203,
 0.011733622205069317,
 0.006559842218552228,
 0.009664110769256022,
 -0.012768378388637248,
 0.035616047421991766,
 -0.00806726509440507,
 -0.03415972342146964,
 -0.01912382417454404,
 0.008910398865668318,
 -0.014461034803979258,
 0.026469314278858423,
 -0.01860005953825098,
 0.02631601858418884,
 -0.007933129498924048,
 0.017590852326687126,
 -0.004103893384366949,
 0.017130961517388105,
 0.015904582921515493,
 -0.010692479011331009,
 -0.009114795298313752,
 0.013311305918086571,
 -0.026622611836173142,
 0.0067386892135323055,
 -0.010168713443715382,
 -0.002200453526209127,
 -0.020835643483042315,
 -0.014333287149991181,
 -0.03045504145581543,
 -0.0136945488800508,
 -0.015086998122256317,
 -0.022994577717853723,
 0.003458767871425549,
 -0.013017486686443022,
 -0.015725737323519267,
 0.011880532752213659,
 -0.007434913834635064,
 0.002859950743

In [9]:
db = Chroma.from_documents(documents[:2], embeddings_model)

In [10]:
documents[:2]

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', metadata={'source': '../tests/state_of_the_union.txt'}),
 Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', metadata={'source': '../tests/state_of_the_union.txt'})]

In [11]:
#使用文本进行语义相似度搜索
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
print(docs[0].page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.


In [12]:
#使用嵌入向量进行语义相似度搜索
query = "What did the president say about Ketanji Brown Jackson"
embedding_vector = embeddings_model.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.


In [13]:
from langchain.document_loaders import PyPDFLoader

In [14]:
loaders = [
    PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture02.pdf"),
    PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture03.pdf"),
    PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture04.pdf"),
]

In [15]:
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [16]:
# 加载 PDF
loaders_chinese = [
    # 故意添加重复文档，使数据混乱
    PyPDFLoader("docs/matplotlib/第一回：Matplotlib初相识.pdf"),
    PyPDFLoader("docs/matplotlib/第一回：Matplotlib初相识.pdf"),
    PyPDFLoader("docs/matplotlib/第二回：艺术画笔见乾坤.pdf"),
    PyPDFLoader("docs/matplotlib/第三回：布局格式定方圆.pdf")
]
docs_chinese = []
for loader in loaders_chinese:
    docs_chinese.extend(loader.load())

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,  # 每个文本块的大小。这意味着每次切分文本时，会尽量使每个块包含 1500 个字符。
    chunk_overlap = 150  # 每个文本块之间的重叠部分。
)

In [18]:
splits = text_splitter.split_documents(docs)

In [19]:
splits

[Document(page_content="MachineLearning-Lecture01  \nInstructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine \nlearning class. So what I wanna do today is ju st spend a little time going over the logistics \nof the class, and then we'll start to  talk a bit about machine learning.  \nBy way of introduction, my name's  Andrew Ng and I'll be instru ctor for this class. And so \nI personally work in machine learning, and I' ve worked on it for about 15 years now, and \nI actually think that machine learning is th e most exciting field of all the computer \nsciences. So I'm actually always excited about  teaching this class. Sometimes I actually \nthink that machine learning is not only the most exciting thin g in computer science, but \nthe most exciting thing in all of human e ndeavor, so maybe a little bias there.  \nI also want to introduce the TAs, who are all graduate students doing research in or \nrelated to the machine learni ng and all aspects of machin e learn

In [20]:
splits_chinese = text_splitter.split_documents(docs_chinese)

In [21]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [22]:
embedding1 = embeddings_model.embed_query(sentence1)
embedding2 = embeddings_model.embed_query(sentence2)
embedding3 = embeddings_model.embed_query(sentence3)

In [23]:
sentence1_chinese = "我喜欢狗"
sentence2_chinese = "我喜欢犬科动物"
sentence3_chinese = "外面的天气很糟糕"

In [24]:
embedding1_chinese = embeddings_model.embed_query(sentence1_chinese)
embedding2_chinese = embeddings_model.embed_query(sentence2_chinese)
embedding3_chinese = embeddings_model.embed_query(sentence3_chinese)

In [25]:
import numpy as np

In [26]:
np.dot(embedding1, embedding2)

0.9631227500523609

In [27]:
np.dot(embedding1, embedding3)

0.7703257495981695

In [28]:
np.dot(embedding2, embedding3)

0.7591627401108028

In [29]:
np.dot(embedding1_chinese, embedding2_chinese)

0.9438907110861763

In [30]:
np.dot(embedding1_chinese, embedding3_chinese)

0.791947385075386

In [31]:
np.dot(embedding2_chinese, embedding3_chinese)

0.7800345305212735

In [7]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/cs229_lectures/'
embedding = OpenAIEmbeddings()

vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

  warn_deprecated(


In [8]:
# 可以看见包含了我们之前进行分割的209个文档
print(vectordb._collection.count())

0
