In [None]:
import json
import os
import pickle
import random
from collections import namedtuple

import requests
import weaviate
from weaviate import classes as wvc
from weaviate.classes.query import MetadataQuery
from weaviate.classes.config import Property, DataType, ReferenceProperty

## Data Structure

In [None]:
# Immutable record with two fields, in order: `summary` and `document`.
Document = namedtuple("Document", "summary document")

## Constants

In [None]:
# Budget used when sizing chunks before they are sent to the embedding service.
# NOTE(review): 512 is presumably the embedding model's input limit in tokens — confirm against the deployed model.
EMBEDDING_UPPER_LIMIT = 512 - 15 # 15 reserved for special tokens (guessed)
METADATA_SIZE = 5 # For metadata.
EMBEDDING_ENDPOINT = "http://embedding:8080" # Endpoint for embedding service

In [None]:
# Connect to Weaviate instance
# connect_to_local() is called without arguments, so the client library's default local settings are used.
# NOTE(review): no client.close() is visible in this notebook — confirm the connection is released when finished.
client = weaviate.connect_to_local()

## Helper Functions

In [None]:
def slicer(document: str, chunk_length: int, overlap: int) -> list[str]:
    """
    Slices a document into overlapping chunks.

    Chunks start every ``chunk_length - overlap`` characters. Slicing stops as
    soon as a chunk reaches the end of the document, so the result never ends
    with a trailing chunk that is entirely contained in the previous one.

    Args:
        document: The input string to be sliced.
        chunk_length: The desired length of each chunk.
        overlap: The number of characters to overlap between consecutive chunks.

    Returns:
        A list of strings, where each string is a chunk of the original document.
        Every chunk except possibly the last has exactly ``chunk_length``
        characters. An empty document yields an empty list.

    Raises:
        TypeError: If 'document' is not a string.
        ValueError: If 'chunk_length' is not a positive integer, 'overlap' is
            not a non-negative integer, or 'overlap' >= 'chunk_length'.
    """
    if not isinstance(document, str):
        raise TypeError("Input 'document' must be a string.")
    if not isinstance(chunk_length, int) or chunk_length <= 0:
        raise ValueError("'chunk_length' must be a positive integer.")
    if not isinstance(overlap, int) or overlap < 0:
        raise ValueError("'overlap' must be a non-negative integer.")
    if overlap >= chunk_length:
        raise ValueError("'overlap' cannot be greater than or equal to 'chunk_length'.")

    doc_len = len(document)
    step = chunk_length - overlap  # Always >= 1 thanks to the validation above
    chunks = []

    for start_index in range(0, doc_len, step):
        end_index = start_index + chunk_length
        chunks.append(document[start_index:end_index])

        # This chunk already covers the end of the document. Any further chunk
        # would only repeat part of the overlap region, so stop here.
        if end_index >= doc_len:
            break

    return chunks

## Getting Document Data

This is just an example; you should implement your own logic to prepare the data.

The system requires each document to be paired with a summary in order to perform document matching.

In [None]:
# prepare data
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a file you trust.
with open("./all-doc-with-tr.pkl", "rb") as f:
    data = pickle.load(f)

# Each record unpacks into three fields; the first becomes `document`, the
# third becomes `summary`, and the middle one is not used here.
doc_collection: list[Document] = [
    Document(summary=doc_summarized, document=doc_orig)
    for doc_orig, _, doc_summarized in data
]

In [None]:
# Shuffle and take partial collection
DOC_DB_SIZE = 15000 # Maximum number of documents kept for insertion

# random.shuffle reorders doc_collection in place. No seed is set, so a
# different subset is selected on every run.
random.shuffle(doc_collection)
doc_db = doc_collection[:DOC_DB_SIZE]

## Querying Documents

In [None]:
user_query = "舉辦推廣Haskell語言活動"

# The search below runs against the "Chunk" collection.
document_collection = client.collections.get("Chunk")

# Hybrid search using the "summary" and "content" properties, with score
# metadata requested for each hit.
vd_query_response = document_collection.query.hybrid(
    query=user_query,
    query_properties=["summary", "content"],
    alpha=0.5,
    limit=100,
    return_metadata=MetadataQuery(score=True, explain_score=True),
)

# Keep only the "content" property of each returned object.
response_contents = [
    hit.properties["content"] for hit in vd_query_response.objects
]

print(response_contents)

### GraphQL Query

In [None]:
# Define the GraphQL payload, with a placeholder for the user query.
# The single %s sits inside a double-quoted GraphQL string, so whatever is
# substituted for it must already be escaped for that context.
# The query reads from the "Chunk" collection and follows its `orig` reference
# to fetch the `content` of the linked "Document" object.
# The `#` annotations inside the string are part of the payload text, not Python comments.
graphql_payload = """
{
  Get {
    Chunk( # Replace 'Document' if your collection name is different
      hybrid: {
        query: "%s",     # Your user query
        alpha: 0.5,                  # Balance between vector/keyword search
        properties: ["summary", "content"] # Properties for keyword (BM25) search
      },
      limit: 2 # Limit the number of results FROM Weaviate
    ) {
      # --- Specify properties you need for reranking or display ---
      content   # You specifically extracted this for the reranker
      summary   # You searched this property, might be useful context

      # --- Add any other properties from your 'Document' schema ---
      # Example: other_property

      # --- Request metadata ---
      _additional {
        score        # The hybrid search score
        explainScore # Breakdown of keyword/vector contribution
        id           # Useful for unique identification
      }
      orig {
        ... on Document {
          content
        }
      }
    }
  }
}
"""

In [None]:
user_query = "生涯規劃離職"

# The template places the query inside a double-quoted GraphQL string, so
# quotes, backslashes and control characters must be escaped first. json.dumps
# produces that escaping; [1:-1] drops the surrounding quotes because the
# template already supplies them. ensure_ascii=False keeps non-ASCII text as-is.
escaped_query = json.dumps(user_query, ensure_ascii=False)[1:-1]

response = requests.post(
    url="http://localhost:8080/v1/graphql",
    headers={"Content-Type": "application/json"},
    json={
        "query": graphql_payload % escaped_query,
        "variables": None,
        "operationName": None,
    },
    timeout=30, # Fail instead of waiting forever if the server does not answer
)

print(response.json())

## Define Document Collection

**WARNING**: This procedure will drop the existing collection and create a new one. All data will be lost. Do **NOT** run this if you just want to insert new data.

In [None]:
# Chunks are linked to their original document through a reference property.
# Only one direction is declared below: "Chunk" has an `orig` reference that
# targets "Document"; "Document" itself declares no reference back.

# Collection for the original documents
# Remove existing class if it exists
client.collections.delete("Document")

# Make a collection for the document class.
# It has a single text property and is configured with no vectorizer.
documents = client.collections.create(
    name="Document",
    properties=[
        Property(name="content", data_type=DataType.TEXT),
    ],
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
)

# Collection for the chunks
# Remove existing class if it exists
client.collections.delete("Chunk")

# Make a collection for the chunk (sliced document) class.
# NOTE: this rebinds `documents`, so after this cell the name refers to the
# "Chunk" collection rather than the "Document" collection created above.
documents = client.collections.create(
    name="Chunk",
    properties=[
        Property(name="summary", data_type=DataType.TEXT),
        Property(name="content", data_type=DataType.TEXT),
    ],
    # The text2vec_openai vectorizer is pointed at EMBEDDING_ENDPOINT via base_url.
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        base_url=EMBEDDING_ENDPOINT,  # The URL of the embedding service
    ),
    references=[
        # Each chunk points at the "Document" object it was sliced from.
        ReferenceProperty(
            name="orig",
            target_collection="Document",
        ),
    ],
)

### Add Data to Collection

In [None]:
# Add the test data to the collection
document_collection = client.collections.get("Document")
chunk_collection = client.collections.get("Chunk")

overlap = 150 # You can try different overlaps.

for i, doc in enumerate(doc_db):
    # Room left for chunk text after reserving space for metadata and for the
    # summary, which is stored alongside every chunk.
    # NOTE(review): the limit is described in tokens but len() counts
    # characters — confirm this approximation is acceptable for the model.
    chunk_size = EMBEDDING_UPPER_LIMIT - METADATA_SIZE - len(doc.summary)

    if chunk_size <= overlap:
        # Nothing has been inserted for this document yet, so skipping leaves no partial data.
        print(f"Document #{i}: Skipped. Chunk size {chunk_size} must be greater than overlap {overlap}. Possibly too long summary or too large overlap.")
        continue

    # Slice the document into chunks
    chunks = slicer(doc.document, chunk_length=chunk_size, overlap=overlap)

    # 1. Insert original document; the returned id is used to link its chunks.
    doc_id = document_collection.data.insert(
        properties={
            "content": doc.document, # Original document
        },
    )
    print(f"Document #{i}: Original document inserted.")

    # 2. Insert chunks
    for j, chunk in enumerate(chunks):
        chunk_collection.data.insert(
            properties={
                "summary": doc.summary,
                "content": chunk,
            },
            references={"orig": doc_id} # UUID reference to the original document
        )
        print(f"Document #{i}: Chunk #{j} inserted.")

    print(f"Document #{i}: Done.")
