In [None]:
%pip install langchain langchain-community langchain-text-splitters langchain_huggingface

In [None]:
%pip install chromadb tiktoken pypdf sentence-transformers

In [None]:
import warnings
# Ignore every warning raised from this point on
# (presumably to keep library noise out of the cell outputs).
warnings.filterwarnings("ignore")

In [None]:
from langchain_core.documents import Document
import textwrap

def preview_doc(doc: Document, max_length=100):
  '''Return a compact one-line summary of a document for display.

  The string form of ``doc.metadata`` and the text in ``doc.page_content`` are
  each collapsed and truncated by ``textwrap.shorten``; whatever is cut off is
  replaced by the "…" placeholder.

  Args:
    doc: Object exposing ``metadata`` and ``page_content`` attributes.
    max_length: Width limit applied separately to each of the two parts.

  Returns:
    A string of the form
    ``Document(metadata=<short metadata>, page_content=<short content>)``.
  '''
  def _squeeze(text):
    # Both parts share the same width limit and placeholder.
    return textwrap.shorten(text, width=max_length, placeholder="…")

  meta_part = _squeeze(str(doc.metadata))
  body_part = _squeeze(doc.page_content)
  return f"Document(metadata={meta_part}, page_content={body_part})"

### Подготовка данных

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF file from its URL; `pages` holds what the loader returns
# (presumably one Document per PDF page — the metadata shown below carries page info).
loader = PyPDFLoader('https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf')
pages = loader.load()
# Show a shortened preview of the first loaded item as the cell output.
preview_doc(pages[0])

"Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_content=Online Tracking: A 1-million-site Measurement and Analysis Steven Englehardt Princeton University…)"

In [None]:
from unstructured.cleaners.core import clean

# Strip junk from the text: extra whitespace, dashes and bullets are cleaned,
# while letter case is left untouched (lowercase=False).
# Note: this overwrites `page_content` of every item in `pages` in place.
for page in pages:
  page.page_content = clean(page.page_content, extra_whitespace=True, dashes=True, bullets=True, lowercase=False)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the PDF pages into chunks: chunk_size=500 with no overlap between
# neighbouring chunks; the resulting chunks are kept in `splits`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(pages)

In [None]:
# Print a shortened preview of the first five chunks as a sanity check.
for doc in splits[:5]:
  print(preview_doc(doc))

Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_content=Online Tracking: A 1 million site Measurement and Analysis Steven Englehardt Princeton University…)
Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_content=(ﬁngerprinting based) tracking, the eﬀect of browser privacy tools, and the exchange of tracking…)
Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_content=underlying browser, and comprehensive browser instrumentation. We demonstrate our platform’s…)
Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_content=other hand, web privacy measurement presents formidable engineering and methodological challenges.…)
Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate':…, page_con

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Choose the model used to create embeddings.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# Run the model on the CPU.
model_kwargs = {'device': 'cpu'}
# Embeddings are left un-normalized.
encode_kwargs = {'normalize_embeddings': False}

# Embedding wrapper shared by every vector store created below.
embedding_model = HuggingFaceEmbeddings(
  model_name=model_name,
  model_kwargs=model_kwargs,
  encode_kwargs=encode_kwargs
)

In [None]:
from langchain_chroma import Chroma

# Initialize the vector store from the chunks, embedding them with `embedding_model`.
# No persist_directory is passed in this call.
db = Chroma.from_documents(documents=splits, embedding=embedding_model)

In [None]:
# Run a test query against the vector store.
query = 'third party'
docs = db.similarity_search(query)

# Print every returned document in full, prefixed with its position in the result list.
for idx, doc in enumerate(docs):
  print(idx, doc)

0 page_content='question: how many third parties are there? In short, a lot: the total num ber of third parties present on at least two ﬁrst parties is over 81,000. What is more surprising is that the prevalence of third parties quickly drops oﬀ: only 123 of these 81,000 are present on more than 1% of sites. This suggests that the number of third parties that a regular user will encounter on a daily basis is relatively small. The eﬀect is accentuated when we consider that diﬀerent third parties may be owned by' metadata={'creationdate': '2016-10-25T21:39:52+00:00', 'total_pages': 20, 'keywords': '', 'page': 7, 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'title': '', 'subject': '', 'trapped': '/False', 'page_label': '8', 'creator': 'LaTeX with hyperref package', 'moddate': '2016-10-25T21:39:52+00:00', 'producer': 'pdfTeX-1.40.16', 'author': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live

In [None]:
# Run the same test query, this time asking for a score alongside each hit.
query = 'third party'
scored_hits = db.similarity_search_with_score(query)

print(query)

# Each result pairs a document (first) with its score (last).
for idx, (hit, score) in enumerate(scored_hits):
  print(idx, score, preview_doc(hit))

# Drop the scores, leaving only the plain documents in `docs`.
docs = [hit for hit, _ in scored_hits]

third party
0 0.9131031036376953 Document(metadata={'title': '', 'subject': '', 'trapped': '/False', 'page': 7, 'producer': 'pdfTeX-1.40.16',…, page_content=question: how many third parties are there? In short, a lot: the total num ber of third parties…)
1 0.9142496585845947 Document(metadata={'keywords': '', 'trapped': '/False', 'creator': 'LaTeX with hyperref package', 'page_label': '10',…, page_content=by Ghostery as a function of the prominence of the third party. As deﬁned earlier, a third party’s…)
2 0.96332848072052 Document(metadata={'producer': 'pdfTeX-1.40.16', 'subject': '', 'title': '', 'moddate': '2016-10-25T21:39:52+00:00',…, page_content=parties ordered by ﬁrst party count and third parties ordered by prominence.)
3 1.0424506664276123 Document(metadata={'moddate': '2016-10-25T21:39:52+00:00', 'producer': 'pdfTeX-1.40.16', 'creationdate':…, page_content=support of third party domains. A large number of third party domains are HTTP only (54%). However,…)


### ChromaDB

In [None]:
# Write the database to disk.
# Index every chunk (`splits`): by this point `docs` has been rebound to the
# handful of hits returned by the last similarity search, so persisting `docs`
# would store only those few documents instead of the whole corpus.
Chroma.from_documents(splits, embedding_model, persist_directory='./chroma_db')

<langchain_chroma.vectorstores.Chroma at 0x2489d21d450>

In [None]:
# Read the store back from disk: open the same directory with the same embedding model.
Chroma(persist_directory='./chroma_db', embedding_function=embedding_model)

<langchain_chroma.vectorstores.Chroma at 0x2489b486e10>

In [None]:
from uuid import uuid4

# Demo store used by the CRUD examples that follow.
# - All chunks (`splits`) are indexed; `docs` only holds the hits of the last search.
# - A dedicated collection name keeps these records out of the default collection
#   that `db` was built in, so the same text is not stored twice under different ids.
# - Every record gets an explicit UUID id.
example_db = Chroma.from_documents(
  splits,
  embedding_model,
  ids=[str(uuid4()) for _ in splits],
  collection_name='crud_example',
)

In [None]:
# Number of records currently held by the store's underlying collection.
example_db._collection.count()

238

In [None]:
# Fetch records from the store without any filter.
# NOTE(review): this displays the whole store as cell output — consider limiting it.
example_db.get()

### ChromaDB CRUD-операции

#### Create

In [None]:
# A new record: a fresh UUID id, a metadata dict and the text itself.
new_document_id = str(uuid4())
new_document_source = {'source': 'my_thoughts'}
new_document = 'At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.'

# Insert the record through the store's underlying collection, supplying an
# embedding computed with the same embedding model.
# NOTE(review): `_collection` is a private attribute — consider the store's public add method.
example_db._collection.add(
  embeddings=embedding_model.embed_documents([new_document]),
  documents=[new_document],
  metadatas=[new_document_source],
  ids=[new_document_id]
)

In [None]:
# Record count after the insert, for comparison with the earlier count.
example_db._collection.count()

239

#### Read

In [None]:
# Fetch the newly added record by its id.
example_db.get(new_document_id)

{'ids': ['50f88f26-c6c3-4bb4-84f9-5030d9162162'],
 'embeddings': None,
 'documents': ['At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'my_thoughts'}]}

#### Update

In [None]:
# Search by the wording of the added text; the hits are kept in `docs` for the edit below.
docs = example_db.similarity_search('dark side of the Internet')
print(docs)

[Document(id='50f88f26-c6c3-4bb4-84f9-5030d9162162', metadata={'source': 'my_thoughts'}, page_content='At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.'), Document(id='79728553-5d15-4628-bfd4-f4a4d451ee71', metadata={'author': '', 'producer': 'pdfTeX-1.40.16', 'keywords': '', 'title': '', 'moddate': '2016-10-25T21:39:52+00:00', 'page': 14, 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'creator': 'LaTeX with hyperref package', 'subject': '', 'creationdate': '2016-10-25T21:39:52+00:00', 'trapped': '/False', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'page_label': '15', 'total_pages': 20}, page_content='the raiders of the lost trackers: An archaeological study of web tracking from 1996 to 2016. In Proceedings of USENIX Security), 2016. [30] J. Leyden. Sites pulling sneaky ﬂash cookie snoop. http

In [None]:
# Replace the metadata of the first search hit (the object is modified in place).
docs[0].metadata = { 'source': "not_my_thoughts" }

# Epstein files reference ( ˶°ㅁ°) !!
# Replace the text as well, with the tail of the sentence blacked out.
docs[0].page_content = 'At the moment, protection against digital surveillance is popularized ██████████████████████████████'

# Store the edited document under the id of the record created earlier.
example_db.update_document(new_document_id, docs[0])

# Read the record back by id to confirm the new text and metadata.
print(example_db._collection.get(ids=[new_document_id]))

{'ids': ['50f88f26-c6c3-4bb4-84f9-5030d9162162'], 'embeddings': None, 'documents': ['At the moment, protection against digital surveillance is popularized ██████████████████████████████'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'not_my_thoughts'}]}


#### Delete

In [None]:
# Record count before the deletion (the printed label means "was").
print('Было:', example_db._collection.count())

# Remove the record identified by new_document_id from the underlying collection.
example_db._collection.delete(ids=new_document_id)

# Record count after the deletion (the printed label means "became").
print('Стало:', example_db._collection.count())

Было: 239
Стало: 238


#### Filtering

In [None]:
# Filter by document text: fetch the records whose content contains the given substring.
example_db.get(where_document={'$contains': 'third party’s prominence'})

{'ids': ['759efb05-0ab3-478c-bf8e-e6341f87475e',
  '03f26fb0-c76f-4b70-8dbb-d8299015e633'],
 'embeddings': None,
 'documents': ['by Ghostery as a function of the prominence of the third party. As deﬁned earlier, a third party’s prominence is the sum of the inverse ranks of the sites it appears on. We also tested Ghostery, and found that it is eﬀective at reducing the number of third parties and ID cookies (Fig ure 11 in the Appendix). The average number of third party includes went down from 17.7 to 3.3, of which just 0.3 had third party cookies (0.1 with IDs). We examined the promi nent third parties that are not',
  'by Ghostery as a function of the prominence of the third party. As deﬁned earlier, a third party’s prominence is the sum of the inverse ranks of the sites it appears on. We also tested Ghostery, and found that it is eﬀective at reducing the number of third parties and ID cookies (Fig ure 11 in the Appendix). The average number of third party includes went down from 17.7 

<img src="https://media.tenor.com/5X5MilxS3xUAAAAi/theresa-civilight-eterna.gif" width="600" height="300" />