In [None]:
# Load the environment variables for this project before anything else runs.
# The next cell reads OPENAI_APIKEY with os.getenv, so that variable is presumably
# expected to be provided here — confirm it is defined in your environment/.env file.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# install the Weaviate client
# pip install -U weaviate-client

import weaviate
import os

# Read the OpenAI key from the environment and fail fast with a clear message,
# rather than passing a missing (None) header value on to Weaviate.
openai_api_key = os.getenv("OPENAI_APIKEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_APIKEY is not set; define it in your environment or .env file "
        "before connecting to Weaviate."
    )

# Connect to an embedded Weaviate instance, forwarding the OpenAI key as a request
# header so the OpenAI-backed modules configured on the collection can use it.
client = weaviate.connect_to_embedded(
    headers={
        "X-OpenAI-Api-Key": openai_api_key
    }
)

In [None]:
# Define the collection that holds the vectors: one object per chunk of text
# taken from a blog article. OpenAI is used here, but another AI API could be
# configured instead.
import weaviate.classes as wvc

collection_name = "BlogArticleChunks"

# Schema: the source file name, the chunk text, and the chunk's position in the file
chunk_properties = [
    wvc.config.Property(name="filename", data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name="chunk", data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(name="chunk_index", data_type=wvc.config.DataType.INT),
]

# If the vectorizer were set to "none" you would always have to provide vectors
# yourself. Any other "text2vec-*" module could be used here as well.
openai_vectorizer = wvc.config.Configure.Vectorizer.text2vec_openai()

# The `generative-openai` module must be configured for generative queries
openai_generator = wvc.config.Configure.Generative.openai(
    # model="gpt-4-turbo",
    temperature=0.2,  # 0 is deterministic, 1 is random
)

# Start clean: drop the collection first when it already exists
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

blog_article_chunks = client.collections.create(
    name=collection_name,
    properties=chunk_properties,
    vectorizer_config=openai_vectorizer,
    generative_config=openai_generator,
)

In [None]:
import os, glob, re
from typing import List

# chunks the text into smaller chunk_size pieces
def chunk_text(text: str, chunk_size: int, overlap_size: int) -> List[str]:
    """Split ``text`` into overlapping, word-based chunks.

    The text is tokenised on runs of whitespace, so leading/trailing whitespace
    and repeated separators never produce empty "words". Words inside a chunk
    are re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Number of new words each chunk advances by (must be > 0).
        overlap_size: Number of words from the end of the previous chunk that
            are repeated at the start of the next one (must be >= 0). The first
            chunk has no predecessor and therefore no overlap.

    Returns:
        The chunks in document order; an empty list when ``text`` contains no
        non-whitespace characters.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size < 0:
        raise ValueError("overlap_size must not be negative")

    # \S+ is the exact complement of the whitespace split, minus the empty
    # strings that splitting un-stripped text would yield at either end.
    text_words = re.findall(r"\S+", text)

    chunks = []
    for start in range(0, len(text_words), chunk_size):
        # Reach back overlap_size words (clamped at the start of the text)
        window = text_words[max(start - overlap_size, 0): start + chunk_size]
        chunks.append(" ".join(window))

    return chunks

def load_and_process_file(
    collection_name: str,
    blog_file: str,
    chunk_size: int = 150,
    overlap_size: int = 25,
    encoding: str = "utf-8",
) -> None:
    """Chunk one text file and insert the chunks into a Weaviate collection.

    Each chunk is stored as an object with the properties ``filename`` (base
    name of ``blog_file``), ``chunk`` (the chunk text) and ``chunk_index``
    (0-based position of the chunk within the file).

    Args:
        collection_name: Name of the target collection, looked up on the
            module-level ``client``.
        blog_file: Path of the text file to load.
        chunk_size: Words per chunk, passed to ``chunk_text``.
        overlap_size: Overlapping words between chunks, passed to ``chunk_text``.
        encoding: Text encoding used to read the file; explicit so the result
            does not depend on the platform's default encoding.
    """
    # Read everything up front so the file handle is released before the network call
    with open(blog_file, mode="r", encoding=encoding) as file:
        blog_text = file.read().replace("\n", " ")

    filename = os.path.basename(blog_file)
    chunks_list = [
        {"filename": filename, "chunk": chunk, "chunk_index": index}
        for index, chunk in enumerate(chunk_text(blog_text, chunk_size, overlap_size))
    ]

    # Nothing to store for a file without text; skip the batch request entirely
    if not chunks_list:
        print(f"Skipping {filename}: no text to insert")
        return

    blog_article_chunks = client.collections.get(collection_name)
    result = blog_article_chunks.data.insert_many(chunks_list)

    # Per-object failures are reported on the returned object rather than raised,
    # so surface them instead of silently dropping chunks.
    if result.has_errors:
        print(
            f"Warning: {len(result.errors)} of {len(chunks_list)} chunks "
            f"from {filename} failed to insert"
        )

In [None]:
# load all the blog files and process them as chunks
# sorted() keeps the ingestion order reproducible between runs; glob on its own
# returns matches in whatever order the file system yields them.
blog_files = sorted(glob.glob("./blogs/*.txt"))
if not blog_files:
    # The pattern is relative to the current working directory, so an empty match
    # most likely means the notebook was started from a different folder.
    print("Warning: no files matched ./blogs/*.txt")

for blog_file in blog_files:
    load_and_process_file(collection_name, blog_file)

# print out the total number of chunks in the collection
response = blog_article_chunks.aggregate.over_all(total_count=True)
print(response.total_count)

In [None]:
# define the RAG query method
def rag_query(collection_name: str, question: str, group_task: str, max_results: int = 10):
    """Run a generative near-text search and print the generated answer.

    Looks up ``collection_name`` on the module-level ``client``, searches it
    with ``question`` (at most ``max_results`` results) and passes
    ``group_task`` as the grouped task for generation. The generated text is
    printed; nothing is returned.
    """
    collection = client.collections.get(collection_name)
    rag_response = collection.generate.near_text(
        query=question,
        limit=max_results,
        grouped_task=group_task,
    )
    print(rag_response.generated)

In [None]:
# Same question as the next cell, but with the answer requested as bullet points
rag_query(
    collection_name=collection_name,
    question="What is Solution Street?",
    group_task="Summarize the key information here in bullet points",
)

In [None]:
# Same question as the previous cell, with the answer requested in Q&A format
rag_query(
    collection_name=collection_name,
    question="What is Solution Street?",
    group_task="Summarize the key information here into question and answer format",
)

In [None]:

# A more specific question, with the answer requested as a bulleted list
rag_query(
    collection_name=collection_name,
    question="What are the top things that CEO Joel Nylund has learned over the years as an engineer?",
    group_task="Summarize the key information here into a bulleted list",
)

In [None]:
# The task restricts the answer to the supplied context and asks for
# "I do not know" when the answer is not known
rag_query(
    collection_name=collection_name,
    question="tell me about the Solution Street's cricket charity event",
    group_task="Summarize the key information here into a bulleted list using only the information I give you. If you do not know, respond with I do not know.",
)


In [None]:
# Ask for structured output: JSON with a question property and an answer property
rag_query(
    collection_name=collection_name,
    question="What are some tips and recommendations for improving client communication?",
    group_task="Summarize the key information here into question and answer format as JSON with a question property and an answer property",
)

In [None]:
# Asked before flat-earth.txt is added to the collection (that happens in the
# next code cell); the identical query is repeated afterwards for comparison
rag_query(
    collection_name=collection_name,
    question="Is the earth flat or round?",
    group_task="Summarize the key information here into an authorative paragraph. If the provided context does not contain information about the earth, respond with I do not know.",
)

In [None]:
# now let's add the flat-earth.txt file to the collection
# Reuse the shared loader instead of repeating its read/chunk/insert steps. It stores
# os.path.basename(path) as "filename" (here "flat-earth.txt") and chunks the text
# with the same 150-word chunk size and 25-word overlap used for the blog files.
load_and_process_file(collection_name, "./other-files/flat-earth.txt")

In [None]:
# Repeat the earlier earth query now that flat-earth.txt is in the collection
rag_query(
    collection_name=collection_name,
    question="Is the earth flat or round?",
    group_task="Summarize the key information here into an authorative paragraph. If the provided context does not contain information about the earth, respond with I do not know.",
)

In [None]:
# Close the client created by weaviate.connect_to_embedded earlier in the notebook.
# Note: the reset cell below also uses `client`, and it runs after this close when
# the notebook is executed top to bottom.
client.close()

In [None]:
###
### FOR RESETTING THE DATA
###
# delete collection - THIS WILL DELETE THE COLLECTION AND ALL ITS DATA
# The preceding cell closes the client, so when the notebook is run top to bottom
# this delete would be issued on a closed client. Reconnect only when needed, and
# restore the closed state afterwards so a manual mid-notebook reset keeps the
# caller's open connection untouched.
reconnected_for_reset = not client.is_connected()
if reconnected_for_reset:
    client.connect()
try:
    client.collections.delete(collection_name)  # Replace with your collection name
finally:
    if reconnected_for_reset:
        client.close()