# Document loading

### Setup

In [48]:
import os
import weaviate

import pandas as pd

from dotenv import load_dotenv, find_dotenv
from langchain_weaviate.vectorstores import WeaviateVectorStore

from langchain.vectorstores import Weaviate
from langchain_openai import OpenAIEmbeddings

In [49]:
_ = load_dotenv(find_dotenv()) # read local .env file

weaviate_url = os.getenv("WEAVIATE_URL") 
weaviate_key = os.getenv("WEAVIATE_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

In [50]:
# Connect to local Weaviate instance running in docker
weaviate_client = weaviate.Client(
    url=weaviate_url,  
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=weaviate_key),  
    additional_headers={
        "X-OpenAI-Api-Key": openai_key
    }
)
weaviate_client.is_ready()

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


True

In [51]:
# delete existing items from the database
weaviate_client.schema.delete_all()

In [52]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=openai_key)

In [54]:
# find out what index name and test key are for
weaviate_instance = Weaviate(client=weaviate_client, index_name="test", text_key="test", embedding=embeddings)

In [66]:
weaviate_instance = Weaviate.from_documents(pdf_splits, embeddings, client=weaviate_client)
type(weaviate_instance)

langchain_community.vectorstores.weaviate.Weaviate

In [81]:
len(pdf_splits)

235

In [88]:
docs_added = weaviate_instance.add_documents(pdf_splits[10:100])
len(docs_added)

In [80]:
weaviate_client.schema.delete_all()

In [89]:
weaviate_client.get_meta()

{'hostname': 'http://[::]:8080',
 'modules': {'generative-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'Generative Search - OpenAI'},
  'qna-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'OpenAI Question & Answering Module'},
  'text2vec-openai': {'documentationHref': 'https://platform.openai.com/docs/guides/embeddings/what-are-embeddings',
   'name': 'OpenAI Module'}},
 'version': '1.25.1'}

In [90]:
len(weaviate_client.schema.get()['classes'])

1

In [91]:
weaviate_client.schema.get()['classes']

[{'class': 'Test',
  'description': "This property was generated by Weaviate's auto-schema feature on Thu Jun 13 08:06:46 2024",
  'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
   'cleanupIntervalSeconds': 60,
   'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
  'multiTenancyConfig': {'autoTenantCreation': False, 'enabled': False},
  'properties': [{'dataType': ['text'],
    'description': "This property was generated by Weaviate's auto-schema feature on Thu Jun 13 08:06:46 2024",
    'indexFilterable': True,
    'indexSearchable': True,
    'name': 'test',
    'tokenization': 'word'},
   {'dataType': ['text'],
    'description': "This property was generated by Weaviate's auto-schema feature on Thu Jun 13 08:06:46 2024",
    'indexFilterable': True,
    'indexSearchable': True,
    'name': 'source',
    'tokenization': 'word'},
   {'dataType': ['number'],
    'description': "This property was generated by Weaviate's auto-schema feature on Thu Jun 13 08:06

In [101]:
len(weaviate_client.data_object.get()['objects'])

25

In [102]:
weaviate_client.data_object.get()['objects'][0]

{'class': 'Test',
 'creationTimeUnix': 1718266019626,
 'id': '0039e1b7-2381-4f00-8b31-aafbf4859ffc',
 'lastUpdateTimeUnix': 1718266019626,
 'properties': {'page': 15,
  'source': '../data/Conservative-Manifesto-GE2024.pdf',
  'test': 'massive simplification of the tax system which \nmeans that 93% of self-employed people – \nfour million of them – will no longer pay self-\nemployed National Insurance. \nAs well as cutting National Insurance for 29 \nmillion people, we will also not raise the rate \nof income tax or VAT.\nIncreasing pay for working \npeople\nWe introduced the National Living Wage in \n2016 and this year achieved our ambition of \nraising it to two thirds of median earnings – \nincreasing it to £11.44 per hour and extending it \nto cover all workers aged over 21. This ended \nlow pay for those on the National Living Wage, \nwith someone working full-time receiving a pay \nrise worth £1,800. As a result of our personal \ntax cuts and increases in the minimum wage, \nthe t

### Document loading and splitting

Load the manifesto PDF documents and split them into chunks.

In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
pdf_loader = PyPDFLoader(file_path="../data/Conservative-Manifesto-GE2024.pdf")
pdf_chunks = pdf_loader.load()

print(f"Loads PDFs and creates {len(pdf_chunks)} chunks. One per page of the manifesto.")

Loads PDF and creates 80 chunks. One per page.


In [47]:
type(pdf_chunks[0])

langchain_core.documents.base.Document

This next step further splits the PDF text into 169 smaller documents with some overlap, while maintaining the metadata (source and page number).

In [32]:
pdf_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size = 1000,
    chunk_overlap = 100,
    is_separator_regex=False,
)

pdf_splits = pdf_text_splitter.split_documents(pdf_chunks)
print(f"Creates {len(pdf_splits)} splits")


Creates 235 splits


### Embeddings and database loading

First, create the embeddings

In [34]:
embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002", api_key=openai_key)

Then load these into the vector store

In [35]:
weaviate_instance = Weaviate(client=weaviate_client, index_name="test", text_key="test")

In [36]:
weaviate_db = Weaviate.from_documents(pdf_splits, embeddings, client=weaviate_client)

### Add other documents

Follow the same process for the other manifestos to add them as embeddings to the vector store.

Convert the code above into functions/classes for reusability.

In [1]:
from pathlib import Path
from manifesto_qa.app import weaviate_instance
from manifesto_qa.document_loader import load_and_split_pdf, add_documents_to_store

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [2]:
data_dir = Path("/Users/longbe01/Documents/projects/llm-rag/data")
data_dir.as_posix()

'/Users/longbe01/Documents/projects/llm-rag/data'

In [15]:
weaviate_instance._client.schema.delete_all()

In [21]:
for file in data_dir.glob("*.pdf"):

    pdf_splits = load_and_split_pdf(file) 
    print(f"Split file {file.name} into {len(pdf_splits)} chunks.")

    docs_added = add_documents_to_store(weaviate_instance, pdf_splits)
    print(f"Added {len(docs_added)} chunks to vector db for file {file.name}.\n")


Split file Green-Party-2024-General-Election-Manifesto-Long-version_imprint.pdf into 166 chunks.
Added 166 chunks to vector db for file Green-Party-2024-General-Election-Manifesto-Long-version_imprint.pdf.

Split file Change-Labour-Party-Manifesto-2024-large-print.pdf into 252 chunks.
Added 252 chunks to vector db for file Change-Labour-Party-Manifesto-2024-large-print.pdf.

Split file Reform_UK_Contract_With_The_People.pdf into 73 chunks.
Added 73 chunks to vector db for file Reform_UK_Contract_With_The_People.pdf.

Split file Conservative-Manifesto-GE2024.pdf into 235 chunks.
Added 235 chunks to vector db for file Conservative-Manifesto-GE2024.pdf.

Split file For_a_Fair_Deal_-_Liberal_Democrat_Manifesto_2024.pdf into 229 chunks.
Added 229 chunks to vector db for file For_a_Fair_Deal_-_Liberal_Democrat_Manifesto_2024.pdf.



### View the database schema

Understand how the documents have been loaded (and how this relates to creating collections directly using the weaviate API). 

Sense check the documents have all been loaded correctly, and view an example record.

In [2]:
from manifesto_qa.client import read_all_objects

In [7]:
weaviate_client = weaviate_instance._client
all_records = read_all_objects(weaviate_client, "ManifestoQa", 20)

print(f"There are {len(all_records)} objects in the vector database \n")

print("Example object: \n ", all_records[0])

There are 955 objects in the vector database 

Example record: 
  {'_additional': {'id': '002b5fa2-58e9-4670-afa8-c26a8e0520c4'}, 'page': 34, 'source': '/Users/longbe01/Documents/projects/llm-rag/data/For_a_Fair_Deal_-_Liberal_Democrat_Manifesto_2024.pdf', 'text': '35•  Implement a ten-year plan to invest in hospitals and the primary care estate to \nend the scandal of crumbling roofs, dangerous concrete and life-expired buildings.\n•  Create a new ‘Patients Charter’ to harness lived experience of patients and embed patient voice, partnership and safety standards across health and care settings, including:\n• A new legal right to a second opinion.\n• A new legal right to maintain contact in all health and care settings.\n• Protecting patient data and patients’ rights to opt out of data sharing.\n•  Implement the recommendations of the Infected Blood Inquiry in full, including delivering full and fair compensation to all victims of the scandal in a timely and transparent manner.\n•  Int

In [26]:
weaviate_client.schema.get()['classes'][0]["properties"]

[{'dataType': ['text'],
  'description': "This property was generated by Weaviate's auto-schema feature on Mon Jun 17 08:00:35 2024",
  'indexFilterable': True,
  'indexSearchable': True,
  'name': 'text',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'description': "This property was generated by Weaviate's auto-schema feature on Mon Jun 17 08:00:35 2024",
  'indexFilterable': True,
  'indexSearchable': True,
  'name': 'source',
  'tokenization': 'word'},
 {'dataType': ['number'],
  'description': "This property was generated by Weaviate's auto-schema feature on Mon Jun 17 08:00:35 2024",
  'indexFilterable': True,
  'indexSearchable': False,
  'name': 'page'}]

Notes

In [31]:
weaviate_client.data_object.get()['totalResults']

25

In [42]:

cursor = None
def get_batch_with_cursor(collection, batch_size, cursor):
    query = (
        weaviate_client.query.get(
            collection,
            ["page", "source", "text"],
        )
        .with_additional(["id"])
        .with_limit(batch_size)
    )

    if cursor is not None:
        result = query.with_after(cursor).do()
    else:
        result = query.do()
    
    return result["data"]["Get"][collection]

In [45]:
cursor = None
results = []

while True:
    next_batch = get_batch_with_cursor("ManifestoQa", 10, cursor)

    if len(next_batch) == 0:
        break

    results.append(next_batch)
    cursor = next_batch[-1]["_additional"]["id"]

{'data': {'Get': {'ManifestoQa': [{'_additional': {'id': '002b5fa2-58e9-4670-afa8-c26a8e0520c4'}, 'page': 34, 'source': '/Users/longbe01/Documents/projects/llm-rag/data/For_a_Fair_Deal_-_Liberal_Democrat_Manifesto_2024.pdf', 'text': '35•  Implement a ten-year plan to invest in hospitals and the primary care estate to \nend the scandal of crumbling roofs, dangerous concrete and life-expired buildings.\n•  Create a new ‘Patients Charter’ to harness lived experience of patients and embed patient voice, partnership and safety standards across health and care settings, including:\n• A new legal right to a second opinion.\n• A new legal right to maintain contact in all health and care settings.\n• Protecting patient data and patients’ rights to opt out of data sharing.\n•  Implement the recommendations of the Infected Blood Inquiry in full, including delivering full and fair compensation to all victims of the scandal in a timely and transparent manner.\n•  Introduce truly independent complai

In [49]:
results[0]

list