In [None]:
%pip install langchain langchain-community langchain-text-splitters langchain_huggingface

In [None]:
# Supporting libraries: chromadb (vector store), pypdf (PDF parsing),
# sentence-transformers (embedding models). tiktoken is not referenced by the
# visible cells — presumably an optional tokenizer dependency; verify.
%pip install chromadb tiktoken pypdf sentence-transformers

### Подготовка данных

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF straight from its URL, one Document per page.
# `load()` is used instead of `load_and_split()`: the latter first runs a
# default text splitter, so the explicit chunking done in the next step would
# re-split text that had already been cut into (overlapping) pieces.
PDF_URL = 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf'

loader = PyPDFLoader(PDF_URL)
pages = loader.load()

# Show the first page as a sanity check
pages[0]

Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2016-10-25T21:39:52+00:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2016-10-25T21:39:52+00:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='Online Tracking:\nA 1-million-site Measurement and Analysis\nSteven Englehardt\nPrinceton University\nste@cs.princeton.edu\nArvind Narayanan\nPrinceton University\narvindn@cs.princeton.edu\nThis is an extended version of our paper that appeared at ACM CCS 2016.\nABSTRACT\nWe present the largest and most detailed measurement of\nonline tracking conducted to date, based on a crawl of the\ntop 1 million websites. We make 15 types of measurements\non each site,

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 500 characters,
# with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
splits = text_splitter.split_documents(pages)

In [None]:
# Preview only the first few chunks instead of dumping the whole list
# into the notebook output.
splits[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2016-10-25T21:39:52+00:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2016-10-25T21:39:52+00:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='Online Tracking:\nA 1-million-site Measurement and Analysis\nSteven Englehardt\nPrinceton University\nste@cs.princeton.edu\nArvind Narayanan\nPrinceton University\narvindn@cs.princeton.edu\nThis is an extended version of our paper that appeared at ACM CCS 2016.\nABSTRACT\nWe present the largest and most detailed measurement of\nonline tracking conducted to date, based on a crawl of the\ntop 1 million websites. We make 15 types of measurements\non each site

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Choose the model used to create embeddings.
# NOTE(review): the 'ko-' checkpoint name suggests a Korean-language model,
# while the indexed paper is in English — confirm this is the intended choice.
model_name = 'jhgan/ko-sroberta-multitask'
model_kwargs = dict(device='cpu')                  # run inference on the CPU
encode_kwargs = dict(normalize_embeddings=False)   # keep raw, non-normalised vectors

embedding_model = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
from langchain_chroma import Chroma

# Initialise the in-memory vector store from the chunks.
# Stable, position-based ids make this cell idempotent: re-running it writes
# the same records again instead of appending a second copy of every chunk
# (the duplicated hits in the recorded search output are likely caused by that).
db = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    ids=[f'chunk-{position}' for position in range(len(splits))],
)

In [None]:
# Run a test query against the store and print every hit with its rank.
query = 'third party'
docs = db.similarity_search(query)

for rank, hit in enumerate(docs):
    print(rank, hit)

0 page_content='third party has the technical capability to use an uninten-
tionally shared ID for any purpose, including tracking the
user or sharing data. However, the results should be in-' metadata={'subject': '', 'moddate': '2016-10-25T21:39:52+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'author': '', 'creationdate': '2016-10-25T21:39:52+00:00', 'trapped': '/False', 'page': 17, 'total_pages': 20, 'title': '', 'creator': 'LaTeX with hyperref package', 'keywords': '', 'page_label': '18', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'producer': 'pdfTeX-1.40.16'}
1 page_content='third party has the technical capability to use an uninten-
tionally shared ID for any purpose, including tracking the
user or sharing data. However, the results should be in-' metadata={'keywords': '', 'page_label': '18', 'trapped': '/False', 'ptex.fullbanner': 'This is pd

In [None]:
# Run the same test query, this time asking for (Document, score) pairs.
query = 'third party'
scored_hits = db.similarity_search_with_score(query)

print(query)

for rank, pair in enumerate(scored_hits):
    # pair[-1] is the score; the whole pair is printed after it
    print(rank, pair[-1], pair)

# Keep only the Document objects; later cells read them from `docs`.
docs = [pair[0] for pair in scored_hits]

third party
0 88.93215942382812 (Document(id='bc921521-ab35-475e-bb8e-f2f96990d9cb', metadata={'subject': '', 'page': 17, 'producer': 'pdfTeX-1.40.16', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'creationdate': '2016-10-25T21:39:52+00:00', 'keywords': '', 'total_pages': 20, 'author': '', 'creator': 'LaTeX with hyperref package', 'title': '', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/OpenWPM_1_million_site_tracking_measurement.pdf', 'trapped': '/False', 'page_label': '18', 'moddate': '2016-10-25T21:39:52+00:00'}, page_content='third party has the technical capability to use an uninten-\ntionally shared ID for any purpose, including tracking the\nuser or sharing data. However, the results should be in-'), 88.93215942382812)
1 88.93215942382812 (Document(id='63408adc-6744-4ec8-833b-93aeb2461769', metadata={'subject': '', 'page_label': '18', 'source': 'https://www.cs.princeton.edu/~arvindn/publications/Ope

### ChromaDB

In [None]:
# Write the database to disk under ./chroma_db.
# `splits` (the whole chunked paper) is persisted rather than `docs`: at this
# point `docs` only holds the few hits returned by the last similarity search.
Chroma.from_documents(splits, embedding_model, persist_directory='./chroma_db')

<langchain_chroma.vectorstores.Chroma at 0x189765d79d0>

In [None]:
# Read the store back from ./chroma_db, using the same embedding model
# for queries.
Chroma(
    embedding_function=embedding_model,
    persist_directory='./chroma_db',
)

<langchain_chroma.vectorstores.Chroma at 0x189724cff10>

In [None]:
from uuid import uuid4

# Separate store for the CRUD examples below, with explicitly generated ids.
# It indexes the full set of chunks (`splits`) under its own collection name:
# `docs` only holds a few search hits, and an in-memory store created without
# a collection name appears to share the default collection with `db`
# (the recorded counts include db's records).
example_db = Chroma.from_documents(
    splits,
    embedding_model,
    ids=[str(uuid4()) for _ in splits],
    collection_name='crud_example',
)

In [None]:
# Number of records currently in the store's collection
# (read through the private `_collection` attribute of the wrapper).
example_db._collection.count()

526

In [None]:
# Inspect the raw records (ids, documents, metadatas); limited to a few
# entries so the output stays readable instead of listing the whole collection.
example_db.get(limit=5)

{'ids': ['28fcc4d9-52cf-47b8-b450-c07123dd2e22',
  '5553cb5d-535f-4f72-b6ff-26928fe1ce29',
  'db766cdb-7a09-4069-8a2f-705284eeea08',
  'e16f4ccf-55e7-415e-9bf6-defcd09443a2',
  'cd5c59f1-7e82-4624-b6af-72b60063dd5e',
  '69c6aedf-9b98-4898-ad72-a0165813a659',
  'cf2afed5-0ac0-4d3c-8f3a-5eff44cf528e',
  'a05dca08-f68d-4f61-804b-725d4d48bf8c',
  '66cdbc72-d7bd-49e2-b95a-cd422f5e73ba',
  '287eb321-657f-4345-907c-3a4622ecb8cc',
  'f1bb2baa-cf28-426b-9be2-f85838035b4c',
  '8540bfb5-173b-4d45-8f85-34da14420b17',
  '9eee0451-7fa8-4727-b9a4-8b1fc235cb6d',
  '5715ffdb-1779-4fd5-b34d-faf9a34c777a',
  '5758e987-b21a-4846-b874-6439704f05e5',
  '09dac63b-b80e-4309-8c88-b75ea075c20c',
  'bf97a19d-37e9-4b2a-b17d-438bd29158c1',
  '9afbd89b-974d-4975-a500-5049423fcc3a',
  '61f707cb-7861-45b1-885f-5f3e018d9b9c',
  '84b6832a-71d0-4555-a2b7-0af6931d2d77',
  '648cbe64-80fd-4741-9f7c-6059b2b1d172',
  '8f0756d3-7164-4b14-8fe9-414823b0cee8',
  'ca3c5b99-1f60-4c21-af80-ed7e82818507',
  'c9d5c11d-057c-4629-8eed-

### ChromaDB CRUD-операции

#### Create

In [None]:
# Create: add one hand-written record straight to the collection.
new_document = 'At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.'
new_document_source = {'source': 'my_thoughts'}
new_document_id = str(uuid4())  # fresh random id, reused by the Read/Update/Delete cells

# The vector is computed explicitly with the notebook's embedding model
# and passed alongside the text, metadata and id.
new_document_embeddings = embedding_model.embed_documents([new_document])

example_db._collection.add(
    ids=[new_document_id],
    documents=[new_document],
    metadatas=[new_document_source],
    embeddings=new_document_embeddings,
)

In [None]:
# Record count after the insert above.
example_db._collection.count()

527

#### Read

In [None]:
# Read: fetch the record by the id generated in the Create step.
example_db.get(new_document_id)

{'ids': ['651a0c91-8dc3-473f-a140-1ed6381cb03c'],
 'embeddings': None,
 'documents': ['At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'my_thoughts'}]}

#### Update

In [None]:
# Look up the freshly added record by meaning. Only the best hit is needed
# (the update step works on docs[0]), so request a single result and print
# that Document rather than the whole result list.
docs = example_db.similarity_search('dark side of the Internet', k=1)
print(docs[0])

page_content='At the moment, protection against digital surveillance is popularized and rooted in the dark side of the Internet.' metadata={'source': 'my_thoughts'}


In [None]:
# Update: rewrite the metadata and the text of the top search hit, then store
# it under the id of the record created earlier.
# NOTE(review): this assumes docs[0] is the record with new_document_id — confirm.
updated_doc = docs[0]
updated_doc.metadata = { 'source': "not_my_thoughts" }

# The tail of the sentence is replaced with a "redacted" bar
updated_doc.page_content = 'At the moment, protection against digital surveillance is popularized ██████████████████████████████'

example_db.update_document(new_document_id, updated_doc)

# Read the record back to show the stored values
print(example_db._collection.get(ids=[new_document_id]))

{'ids': ['8f761e18-d409-4aee-b4b0-83e9cb159100'], 'embeddings': None, 'documents': ['At the moment, protection against digital surveillance is popularized ██████████████████████████████'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'not_my_thoughts'}]}


#### Delete

In [None]:
# Delete: remove the record created earlier and show the count before/after.
print('Было:', example_db._collection.count())

# ids is passed as a list, matching the add/get calls elsewhere in the notebook
example_db._collection.delete(ids=[new_document_id])

print('Стало:', example_db._collection.count())

Было: 264
Стало: 263


#### Filtering

In [None]:
# Filtering: return only the records whose text contains the given substring.
example_db.get(
    where_document={'$contains': 'cookie-syncing parties'},
)

{'ids': ['ed38893e-cf24-40ed-93f0-ce7beff59935',
  '5ed14bfe-2d06-4a99-a4e6-ff145d841d1b'],
 'embeddings': None,
 'documents': ['other third parties (this includes both events where it is a\nreferer and where it is a receiver). We present details of the\ntop cookie-syncing parties in Appendix 13.3.\nMore interestingly, we ﬁnd that the vast majority of top\nthird parties sync cookies with at least one other party: 45\nof the top 50, 85 of the top 100, 157 of the top 200, and\n460 of the top 1,000. This adds further evidence that cookie\nsyncing is an under-researched privacy concern.',
  'other third parties (this includes both events where it is a\nreferer and where it is a receiver). We present details of the\ntop cookie-syncing parties in Appendix 13.3.\nMore interestingly, we ﬁnd that the vast majority of top\nthird parties sync cookies with at least one other party: 45\nof the top 50, 85 of the top 100, 157 of the top 200, and\n460 of the top 1,000. This adds further evidence that 

<img src="https://media.tenor.com/5X5MilxS3xUAAAAi/theresa-civilight-eterna.gif" width="600" height="300" />