# Document loading

### Setup

Import the existing weaviate instance. Remove any existing schemas before we start.

In [1]:
from pathlib import Path

from manifesto_qa.app import WEAVIATE_INDEX_NAME, TEXT_EMBEDDINGS_MODEL, GENERATIVE_MODEL

from manifesto_qa.vectordb import VectorDB
from manifesto_qa.document_loader import load_and_split_pdf, add_documents_to_store

2024-08-16 11:51:43.377 
  command:

    streamlit run /Users/longbe01/Documents/projects/llm-rag/venv-llm-rag/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-08-16 11:51:43.741 Session state does not function when running a script without `streamlit run`


In [2]:
vector_db = VectorDB(WEAVIATE_INDEX_NAME, TEXT_EMBEDDINGS_MODEL, GENERATIVE_MODEL)

In [7]:
vector_db.delete_all_schemas()
vector_db.reset_manifesto_schema()

### Document loading and splitting

Load each of the manifesto PDF documents and split them into chunks. Then, add them to the vector database (using OpenAI's text embeddings model to generate embeddings).

In [8]:
data_dir = Path("/Users/longbe01/Documents/projects/llm-rag/data")
data_dir.as_posix()

'/Users/longbe01/Documents/projects/llm-rag/data'

In [9]:
for file in data_dir.glob("*.pdf"):

    pdf_splits = load_and_split_pdf(file) 
    print(f"Split file {file.name} into {len(pdf_splits)} chunks.")

    docs_added = add_documents_to_store(vector_db.instance, pdf_splits)
    print(f"Added {len(docs_added)} chunks to vector db for file {file.name}.\n")


Split file Plaid_Cymru_Maniffesto_2024_ENGLISH.pdf into 166 chunks.
Added 166 chunks to vector db for file Plaid_Cymru_Maniffesto_2024_ENGLISH.pdf.

Split file Green-Party-2024-General-Election-Manifesto-Long-version_imprint.pdf into 166 chunks.
Added 166 chunks to vector db for file Green-Party-2024-General-Election-Manifesto-Long-version_imprint.pdf.

Split file Change-Labour-Party-Manifesto-2024-large-print.pdf into 252 chunks.
Added 252 chunks to vector db for file Change-Labour-Party-Manifesto-2024-large-print.pdf.

Split file 2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf into 76 chunks.
Added 76 chunks to vector db for file 2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf.

Split file Reform_UK_Contract_With_The_People.pdf into 73 chunks.
Added 73 chunks to vector db for file Reform_UK_Contract_With_The_People.pdf.

Split file Conservative-Manifesto-GE2024.pdf into 235 chunks.
Added 235 chunks to vector db for file Conservative-Manifesto-GE2024.pd

### View the database schema

Understand how the documents have been loaded (and how this relates to creating collections directly using the weaviate API). 

Sense check the documents have all been loaded correctly, and view an example record.

In [3]:
all_records = vector_db.read_all_objects()

print(f"There are {len(all_records)} objects in the vector database \n")
print("Example object: \n ", all_records[0])

There are 1197 objects in the vector database 

Example object: 
  {'_additional': {'id': '0006dacf-185b-405f-b50a-3125fbb62310'}, 'page': 115, 'source': '/Users/longbe01/Documents/projects/llm-rag/data/Change-Labour-Party-Manifesto-2024-large-print.pdf', 'text': 'With Labour, the Wales Office will once again become \nan advocate for Wales at home and abroad and \nfacilitate closer collaboration between our governments. \nThe Wales Office will ensure on issues under the \ncompetence of the UK government the voice of Wales is \nproperly heard. \nNorthern Ireland\nNorthern Ireland needs stability and long-term certainty \nafter the challenges of recent years. Labour will work'}


In [4]:
vector_db.client.schema.get()['classes'][0]["properties"]

[{'dataType': ['text'],
  'description': 'The document content chunked',
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'dimensions': 1536,
    'model': 'text-embedding-3-small',
    'skip': False,
    'tokenization': 'lowercase',
    'type': 'text',
    'vectorizePropertyName': True}},
  'name': 'text',
  'tokenization': 'word'},
 {'dataType': ['text'],
  'description': 'The source document (PDF)',
  'indexFilterable': True,
  'indexSearchable': True,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'tokenization': 'whitespace',
    'vectorizePropertyName': False}},
  'name': 'source',
  'tokenization': 'word'},
 {'dataType': ['number'],
  'description': 'Page number',
  'indexFilterable': True,
  'indexSearchable': False,
  'moduleConfig': {'text2vec-openai': {'skip': False,
    'tokenization': 'whitespace',
    'vectorizePropertyName': False}},
  'name': 'page'}]

In [5]:
vector_db.client.schema.get()['classes'][0]

{'class': 'ManifestoQa',
 'description': 'Index storing political party GE 2024 manifesto documents.',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
  'text2vec-openai': {'baseURL': 'https://api.openai.com',
   'model': 'ada',
   'vectorizeClassName': True}},
 'multiTenancyConfig': {'autoTenantCreation': False, 'enabled': False},
 'properties': [{'dataType': ['text'],
   'description': 'The document content chunked',
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'dimensions': 1536,
     'model': 'text-embedding-3-small',
     'skip': False,
     'tokenization': 'lowercase',
     'type': 'text',
     'vectorizePropertyName': True}},
   'name': 'text',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'description': 'The source document (PDF)',
   'indexF

In [6]:
vector_db.client.schema.get()

{'classes': [{'class': 'ManifestoQa',
   'description': 'Index storing political party GE 2024 manifesto documents.',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
    'text2vec-openai': {'baseURL': 'https://api.openai.com',
     'model': 'ada',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'autoTenantCreation': False, 'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The document content chunked',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'dimensions': 1536,
       'model': 'text-embedding-3-small',
       'skip': False,
       'tokenization': 'lowercase',
       'type': 'text',
       'vectorizePropertyName': True}},
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['text'],
