# Vector Search

### Setup

In [1]:
import os
import ast
import json
import requests
import weaviate

import pandas as pd

from dotenv import load_dotenv, find_dotenv
from langchain_weaviate.vectorstores import WeaviateVectorStore

In [2]:
_ = load_dotenv(find_dotenv()) # read local .env file

weaviate_url = os.getenv("WEAVIATE_URL") 
weaviate_key = os.getenv("WEAVIATE_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Connect to local Weaviate instance running in docker
weaviate_client = weaviate.Client(
    url=weaviate_url,  
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=weaviate_key),  
    additional_headers={
        "X-OpenAI-Api-Key": openai_key
    }
)
weaviate_client.is_ready()

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


True

In [4]:
# delete existing items from the database
weaviate_client.schema.delete_all()

### Document loading and splitting

Load the manifesto PDF documents and split them into chunks.

In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
pdf_loader = PyPDFLoader(file_path="../data/Conservative-Manifesto-GE2024.pdf")
pdf_chunks = pdf_loader.load()

print(f"Loads PDFs and creates {len(pdf_chunks)} chunks. One per page of the manifesto.")

Loads PDF and creates 80 chunks. One per page.


This next step further splits the PDF text into 169 smaller documents with some overlap, while maintaining the metadata (source and page number).

In [32]:
pdf_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size = 1000,
    chunk_overlap = 100,
    is_separator_regex=False,
)

pdf_splits = pdf_text_splitter.split_documents(pdf_chunks)
print(f"Creates {len(pdf_splits)} splits")


Creates 235 splits


### Embeddings and database loading

First, create the embeddings

In [33]:
from langchain.vectorstores import Weaviate
from langchain_openai import OpenAIEmbeddings

from weaviate.classes.query import Filter

In [34]:
embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002", api_key=openai_key)

Then load these into the vector store

In [35]:
weaviate_instance = Weaviate(client=weaviate_client, index_name="test", text_key="test")

In [36]:
weaviate_db = Weaviate.from_documents(pdf_splits, embeddings, client=weaviate_client)

### Vector search

In [37]:
search_query = "What is Conversative housing policy?"
docs = weaviate_db.similarity_search(search_query)

In [38]:
docs[0]

Document(page_content='51 \nThe Conservative and Unionist Party Manifesto 2024Our plan to build \nmore houses in the \nright places', metadata={'page': 52, 'source': '../data/Conservative-Manifesto-GE2024.pdf'})

In [42]:
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")


Document 1:
51 
The Conservative and Unionist Party Manifesto 2024Our plan to build 
more houses in the 
right p...

Document 2:
54 We will support those who want to build or 
commission their own home by making the 
planning pro...

Document 3:
support local people into home ownership and 
create a dedicated taskforce in Homes England 
to deli...

Document 4:
52  We will deliver a secure future 
for communities by giving more 
people a better chance of livin...


Add filters

In [41]:
source_filter = Filter.by_property("source").equal("../data/Conservative-Manifesto-GE2024.pdf")
filtered_search_results = weaviate_db.similarity_search(search_query, filters=source_filter)

for i, result in enumerate(filtered_search_results):
    print(f"\nDocument {i+1}:")
    print(f"Page: {result.metadata['page']}")
    print(result.page_content[:100] + "...")



Document 1:
Page: 52
51 
The Conservative and Unionist Party Manifesto 2024Our plan to build 
more houses in the 
right p...

Document 2:
Page: 55
54 We will support those who want to build or 
commission their own home by making the 
planning pro...

Document 3:
Page: 67
support local people into home ownership and 
create a dedicated taskforce in Homes England 
to deli...

Document 4:
Page: 53
52  We will deliver a secure future 
for communities by giving more 
people a better chance of livin...


In [43]:
page_filter = Filter.by_property("page").equal(52)
page_search_results = weaviate_db.similarity_search(search_query, filters=page_filter, k=3)

for i, result in enumerate(page_search_results):
    print(f"\nDocument {i+1}:")
    print(f"Page: {result.metadata['page']}")
    print(result.page_content[:100] + "...")


Document 1:
Page: 52
51 
The Conservative and Unionist Party Manifesto 2024Our plan to build 
more houses in the 
right p...

Document 2:
Page: 55
54 We will support those who want to build or 
commission their own home by making the 
planning pro...

Document 3:
Page: 67
support local people into home ownership and 
create a dedicated taskforce in Homes England 
to deli...


In [45]:
schemas = weaviate_client.schema.get()
schemas

{'classes': [{'class': 'LangChain_09c839fb259744c3af543e4a977e32fd',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'autoTenantCreation': False, 'enabled': False},
   'properties': [{'dataType': ['text'],
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Wed Jun 12 08:06:26 2024",
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'source',
     'tokenization': 'word'},
    {'dataType': ['number'],
     'description': "This property was generated by Weaviate's auto-schema feature on Wed Jun 12 08:06:26 2024",
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'page'}],
   'replicationConfig': {'factor': 1},
   'shardingConfig': 

### Vector search with different retrieval methods

Try alternative retrieval methods, such as MMR and SelfQuery

### Chat with memory

Explore LangChain's memory module in order to involve chat history in requests to GPT