<a href="https://colab.research.google.com/github/geosword/devclub-weaviate/blob/main/weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai==0.28.0 weaviate-client langchain langchain-community pypdf

In [None]:
from IPython.display import HTML, display

def set_css(*_event_args):
    """Emit a CSS rule that makes ``<pre>`` output wrap long lines.

    This function is registered below as an IPython ``pre_run_cell`` callback,
    so the style block is displayed again before each cell executes.

    Args:
        *_event_args: Ignored. IPython's documentation describes
            ``pre_run_cell`` callbacks as receiving an ``info`` argument;
            accepting and discarding positional arguments lets the hook be
            invoked either with or without it.
    """
    wrap_style = '''
    <style>
    pre {
        white-space: pre-wrap;
    }
    </style>
    '''
    display(HTML(wrap_style))


# Re-apply the wrapping style ahead of every cell execution.
get_ipython().events.register('pre_run_cell', set_css)


In [None]:
import openai
import os
from google.colab import userdata
from typing import List, Optional
import weaviate
import datetime
import json
from weaviate.auth import Auth
from weaviate.util import generate_uuid5
from weaviate.classes.query import Filter
import weaviate.classes as wvc
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, UnstructuredURLLoader

In [None]:
def call_gpt4(system_prompt, user_prompt, chunks=None, *, model="gpt-4-1106-preview"):
    """Send a chat-completion request to OpenAI and return the reply text.

    Args:
        system_prompt: Content of the leading ``system`` message.
        user_prompt: Content of the ``user`` message (the question).
        chunks: Optional list of context strings. When non-empty they are
            joined and appended as a trailing ``system`` message wrapped in
            ``<Context>`` tags.
        model: Name of the chat model passed to the API (keyword-only).

    Returns:
        The ``content`` of the first choice in the API response.

    Raises:
        ValueError: If no API key is found in the ``OPENAI_API_KEY``
            environment variable or in the Colab secret of the same name.
    """
    # Consult the Colab secret store only when the environment variable is
    # unset, so a failing secret lookup cannot affect a run that already has
    # the variable configured.
    openai.api_key = os.getenv("OPENAI_API_KEY") or userdata.get('OPENAI_API_KEY')
    if not openai.api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Retrieved context, if any, is supplied as an extra system message.
    if chunks:
        all_chunks = "---------\n".join(chunks)
        messages.append({"role": "system", "content": f"Use the following context to answer the question <Context>{all_chunks}</Context>"})

    # Show the exact payload so the prompt can be inspected in the notebook.
    print(json.dumps(messages, indent=4))
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages
    )
    return response['choices'][0]['message']['content']

In [None]:
def init_pdf_collection(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """(Re)create the collection that stores PDF chunks.

    Any existing collection called ``collection_name`` is deleted first, so
    calling this function discards previously ingested data.

    Args:
        client: Connected Weaviate client (v4 ``collections`` API).
        collection_name: Name of the collection to drop and recreate.
    """
    # Start from a clean slate.
    client.collections.delete(collection_name)

    # Property data types: https://weaviate.io/developers/weaviate/config-refs/datatypes
    client.collections.create(
        name=collection_name,
        # Use the OpenAI API ("text2vec-openai") for vector-related operations.
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
        properties=[
            # The chunk text itself; the property name is part of the vectorized input.
            wvc.config.Property(
                name="content",
                data_type=wvc.config.DataType.TEXT,
                vectorize_property_name=True,
                tokenization=wvc.config.Tokenization.LOWERCASE
            ),
            # Ingestion timestamp; property name excluded from vectorization.
            wvc.config.Property(
                name="date",
                data_type=wvc.config.DataType.DATE,
                vectorize_property_name=False
            ),
            # Page bookkeeping; property names excluded from vectorization.
            wvc.config.Property(
                name="page_number",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False
            ),
            wvc.config.Property(
                name="total_pages",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False
            ),
            # Source file; the property name is part of the vectorized input.
            wvc.config.Property(
                name="file_path",
                data_type=wvc.config.DataType.TEXT,
                vectorize_property_name=True
            ),
            # Chunk bookkeeping; property names excluded from vectorization.
            wvc.config.Property(
                name="chunk_number",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False
            ),
            wvc.config.Property(
                name="total_chunks",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False
            )
        ]
    )

In [None]:
def ingest(file_path, collection):
    """Load a PDF, split every page into chunks and insert them into ``collection``.

    One object is inserted per chunk. ``page_number`` and ``chunk_number`` are
    stored zero-based, while the progress message printed for each chunk is
    one-based.

    Args:
        file_path: Location of the PDF, passed to ``load_pdf`` and stored on
            every inserted object.
        collection: Target collection; must expose ``data.insert``.
    """
    pages = load_pdf(file_path)
    total_pages = len(pages)

    for page_index, page in enumerate(pages):
        chunks = chunk_text(page.page_content)
        total_chunks = len(chunks)

        for chunk_index, chunk in enumerate(chunks):
            print(f'Processing {file_path} page {page_index+1} chunk {chunk_index+1} of {total_chunks}')
            data_object = {
                # The trailing "Z" declares UTC, so the clock is read in UTC
                # rather than in the machine's local time zone.
                "date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                "content": chunk,
                "page_number": page_index,
                "file_path": file_path,
                "total_pages": total_pages,
                "chunk_number": chunk_index,
                "total_chunks": total_chunks
            }
            collection.data.insert(data_object)

In [None]:
def get_weaviate_client() -> weaviate.WeaviateClient:
    """
    Connect to a Weaviate Cloud cluster using Colab secrets.

    Reads the ``WEAVIATE_URL``, ``WEAVIATE_API_KEY`` and ``OPENAI_API_KEY``
    secrets via ``userdata.get``. The OpenAI key is forwarded as a request
    header alongside the Weaviate credentials.

    Returns:
        weaviate.WeaviateClient: The client returned by
        ``weaviate.connect_to_weaviate_cloud``. The caller is responsible for
        closing it.
    """
    weaviate_url = userdata.get('WEAVIATE_URL')
    api_key = userdata.get('WEAVIATE_API_KEY')
    openai_key = userdata.get('OPENAI_API_KEY')

    return weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(api_key),
        headers={'X-OpenAI-Api-key': openai_key}
    )

In [None]:
def load_pdf(file_path: str) -> list:
    """
    Load a PDF into a list of documents.

    Args:
        file_path: Location of the PDF; handed unchanged to ``PyPDFLoader``.
            No protocol or existence check is performed here.

    Returns:
        list: The documents produced by ``PyPDFLoader.load()`` (the single
        PDF is split into multiple documents, which ``ingest`` treats as
        pages).
    """
    # No temporary file is created by this function, so there is nothing to
    # clean up after loading.
    loader = PyPDFLoader(file_path)
    return loader.load()

In [None]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Splits text into overlapping chunks of roughly ``chunk_size`` characters.

    Each chunk nominally ends ``chunk_size`` characters after its start. When
    more text follows, the cut is moved to just after the first period found
    within 50 characters of that point; failing that, to the next space at or
    after it (however far away that is). The next chunk starts ``overlap``
    characters before the cut. Chunks are stripped of surrounding whitespace
    and chunks that are empty after stripping are dropped.

    Args:
        text (str): The input text to be chunked
        chunk_size (int): Target size of each chunk in characters (> 0)
        overlap (int): Number of characters to overlap between chunks
            (0 <= overlap < chunk_size)

    Returns:
        List[str]: List of text chunks (empty for empty or whitespace-only text)

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    boundary_window = 50  # how far around the nominal cut to look for a period
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size

        if end < text_length:
            # Never search at or before start + overlap: a cut there would
            # make the next start <= the current one (no progress). The max()
            # also keeps the lower bound non-negative, so str.find never
            # interprets it as an index relative to the end of the text.
            search_from = max(end - boundary_window, start + overlap + 1)
            next_period = text.find('.', search_from, end + boundary_window)
            if next_period != -1:
                end = next_period + 1
            else:
                next_space = text.find(' ', end)
                if next_space != -1:
                    end = next_space

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Once a chunk reaches the end of the text we are done; stepping back
        # by `overlap` here would only re-emit a tail already contained in it.
        if end >= text_length:
            break
        start = end - overlap

    return chunks

In [None]:
def search_weaviate(**kwargs):
    """
    Run a ``near_text`` query against a collection.

    Args:
        **kwargs: Keyword arguments
            search_term (str): The search query text (required)
            collection: Collection to query; must expose ``query.near_text`` (required)
            file_path (str, optional): When given, only objects whose
                ``file_path`` property equals this value are returned
            limit (int, optional): Maximum number of results to return (default: 10)

    Returns:
        The response of ``collection.query.near_text``; callers read the
        matches from its ``objects`` attribute.

    Raises:
        ValueError: If ``collection`` or ``search_term`` is missing.
    """
    search_term = kwargs.get('search_term')
    limit = kwargs.get('limit', 10)
    collection = kwargs.get('collection')
    file_path = kwargs.get('file_path')

    if collection is None:
        raise ValueError("search_weaviate requires a 'collection' argument")
    if not search_term:
        raise ValueError("search_weaviate requires a non-empty 'search_term' argument")

    # Only restrict by source file when the caller actually named one.
    filters = None
    if file_path is not None:
        filters = Filter.by_property("file_path").equal(file_path)

    return collection.query.near_text(
        query=search_term,
        limit=limit,  # honour the caller's limit instead of a fixed 10
        filters=filters
    )

In [None]:
# Smoke test: confirm the OpenAI call works before touching Weaviate.
llmtest = call_gpt4(
    system_prompt="You are a helpful assistant.",
    user_prompt="According to the Hitch Hikers Guide to the galaxy. What is the Answer to everything?",
)
print(llmtest)

In [None]:
# Open the Weaviate connection that the remaining cells share.
wclient = get_weaviate_client()

In [None]:
# Drop and recreate the collection; anything previously ingested under this
# name is deleted by init_pdf_collection.
collection_name="pdf"
init_pdf_collection(wclient, collection_name)

In [None]:
# Handle to the collection, used for both ingestion and search below.
collection = wclient.collections.get(collection_name)

In [None]:
# Chunk the sample PDF and insert its chunks into the collection.
file_path = "/content/sample_data/paris.pdf"
ingest(file_path, collection)


In [None]:
# Retrieve the chunks most relevant to the question, restricted to the ingested file.
results = search_weaviate(search_term="when did Paris become a tourist destination?", collection=collection, file_path=file_path)

# Matches live in `results.objects`; test that list for emptiness rather than
# the response object itself, so the "no results" branch can actually trigger.
matched_objects = results.objects if results else []
llm_chunks = [obj.properties.get('content') for obj in matched_objects]

if llm_chunks:
    for content in llm_chunks:
        print(content)
        print('----------------------------------')
else:
    print("No results found.")

In [None]:
# Ask the question again, this time grounding the model in the retrieved chunks.
answer = call_gpt4(
    "You are a helpful assistant.",
    "When did Paris become a tourist destination?",
    llm_chunks,
)
# Blank line, banner, then the model's reply -- one item per line.
print('', "LLMS ANSWER -------------------", answer, sep='\n')

In [None]:
# Release the connection through the client's public API instead of reaching
# into its private `_connection` attribute.
wclient.close()