In [1]:
%pip install sentence-transformers langchain langchain-community faiss-cpu

Collecting sentence-transformers
  Using cached sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting langchain
  Using cached langchain-0.1.15-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community
  Using cached langchain_community-0.0.32-py3-none-any.whl.metadata (8.5 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0-cp38-cp38-win_amd64.whl.metadata (3.8 kB)
Collecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.2.2-cp38-cp38-win_amd64.whl.metadata (26 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Using cached huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.29-cp38-cp38-win_amd64.whl.metadata (9.8 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.9.3-cp38

#### Описание сценариев использования

1. Получение индекса — `index = Store.getIndex(name='index')`. Если имя не указано, используется имя по умолчанию `'index'`.

2. Загрузка документов - `index.loadDocuments(documents)`. Индекс сохраняется автоматически после загрузки документов.

3. Получение ретривера - `retriever = index.retriever()`.

4. Отправка запросов - `retriever.invoke(query)`.

In [2]:
# Prepare the sample texts used by the usage examples below.

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Read the sample file into a list of LangChain documents.
raw_documents = TextLoader('example.txt').load()

# First split: target chunk size 1000 characters, overlap 10.
# NOTE(review): the recorded output shows many chunks longer than chunk_size;
# presumably the splitter's separator does not occur inside those passages —
# confirm whether oversized chunks are acceptable for the index.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=10,
)
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 1489, which is longer than the specified 1000
Created a chunk of size 1231, which is longer than the specified 1000
Created a chunk of size 1740, which is longer than the specified 1000
Created a chunk of size 1948, which is longer than the specified 1000
Created a chunk of size 1105, which is longer than the specified 1000
Created a chunk of size 2218, which is longer than the specified 1000
Created a chunk of size 1160, which is longer than the specified 1000
Created a chunk of size 1546, which is longer than the specified 1000
Created a chunk of size 1103, which is longer than the specified 1000
Created a chunk of size 3071, which is longer than the specified 1000
Created a chunk of size 1055, which is longer than the specified 1000
Created a chunk of size 2011, which is longer than the specified 1000
Created a chunk of size 1232, which is longer than the specified 1000
Created a chunk of size 1513, which is longer than the specified 1000
Created a chunk of s

#### Пример использования векторного хранилища

In [3]:
import sys
from pathlib import Path

# `from ..\api\vectorstore import Store` is not valid Python: an import
# statement takes a dotted module name, not a file-system path, and a notebook
# has no parent package for a relative import to resolve against.
# NOTE(review): assumes the notebook's working directory sits next to the `api`
# package (i.e. ../api/vectorstore.py) — confirm the project layout.
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.insert(0, str(PROJECT_ROOT))

from api.vectorstore import Store

In [4]:
%%time

# Get the index, attempting to load it. The default name is 'index'.
index = Store.getIndex("my_index")

  from .autonotebook import tqdm as notebook_tqdm


CPU times: user 3.44 s, sys: 735 ms, total: 4.18 s
Wall time: 5.03 s


In [5]:
%%time

# Load the documents. The index is saved at this point so that no data is lost.
index.loadDocuments(documents)

CPU times: user 8.15 s, sys: 282 ms, total: 8.43 s
Wall time: 7.23 s


In [6]:
%%time

# Request the index named "my_index" a second time and bind it to `index2`.
# NOTE(review): presumably this picks up the index saved by loadDocuments
# above — confirm against Store.getIndex.
index2 = Store.getIndex("my_index")

CPU times: user 305 ms, sys: 212 ms, total: 517 ms
Wall time: 3.37 s


In [7]:
%%time

# A different split of the same source documents, with much smaller chunks.
# NOTE: this rebinds `text_splitter` and `documents`, so the following cells
# see the chunk_size=100 split instead of the chunk_size=1000 one.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 449, which is longer than the specified 100
Created a chunk of size 477, which is longer than the specified 100
Created a chunk of size 1489, which is longer than the specified 100
Created a chunk of size 146, which is longer than the specified 100
Created a chunk of size 289, which is longer than the specified 100
Created a chunk of size 239, which is longer than the specified 100
Created a chunk of size 235, which is longer than the specified 100
Created a chunk of size 139, which is longer than the specified 100
Created a chunk of size 647, which is longer than the specified 100
Created a chunk of size 183, which is longer than the specified 100
Created a chunk of size 1231, which is longer than the specified 100
Created a chunk of size 1740, which is longer than the specified 100
Created a chunk of size 322, which is longer than the specified 100
Created a chunk of size 682, which is longer than the specified 100
Created a chunk of size 774, which is longer 

CPU times: user 255 ms, sys: 36.2 ms, total: 291 ms
Wall time: 265 ms


In [8]:
%%time

# Load some more documents into the index.
index2.loadDocuments(documents)

CPU times: user 12.1 s, sys: 30.9 ms, total: 12.1 s
Wall time: 9.42 s


In [10]:
# Obtain a retriever from the index.
retriever = index2.retriever()

In [11]:
# Send a query to the retriever (roughly: "How can I take part in your process");
# the recorded output is a list of Document objects.
retriever.invoke("Как поучаствовать в вашем процессе")

[Document(page_content='Решение Комиссии о допуске участников закупки к участию в процедуре\nповышения стартовой цены оформляется протоколом.', metadata={'source': 'example.txt'}),
 Document(page_content='Причины, по которым конкурентная закупка признана несостоявшейся, в\nслучае ее признания таковой.', metadata={'source': 'example.txt'}),
 Document(page_content='Участники закупки, допущенные к участию в конкурентном отборе с\nповышением стартовой цены, регистрируются для участия в процедуре повышения\nстартовой посредством направления уведомления непосредственно перед ее началом.', metadata={'source': 'example.txt'}),
 Document(page_content='В случае подачи только заявки на участие в запросе котировок,\nКомиссия вправе признать запрос котировок несостоявшимся.', metadata={'source': 'example.txt'})]