In [14]:
# Load this project's environment variables before anything else reads them.
# load_dotenv() is called with its defaults, so the values come from wherever
# python-dotenv looks by default (typically a local .env file). A later cell
# reads OPENAI_APIKEY from the environment via os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
# Prerequisite: install the Weaviate Python client once, outside this cell:
# pip install -U weaviate-client

import weaviate
import os

# Connect to an embedded Weaviate instance and keep the handle in `client`;
# every later cell reaches the database through this variable.
# The OpenAI key is forwarded as a request header so the OpenAI vectorizer and
# generative configuration set on the collection can authenticate.
client = weaviate.connect_to_embedded(
    headers={
        # os.getenv returns None when OPENAI_APIKEY is unset, so make sure the
        # variable is defined in the environment before running this cell.
        "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")  # read from the environment, not hard-coded
    }
)

Started /Users/ghodum/.cache/weaviate-embedded: process ID 2464


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-06-10T12:16:21-04:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-06-10T12:16:21-04:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-06-10T12:16:21-04:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-06-10T12:16:21-04:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-06-10T12:16:21-04:00"}


{"action":"lsm_recover_from_active_wal_success","class":"BlogArticleChunks","index":"blogarticlechunks","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/VZb9CMJQDdQO/lsm/objects/segment-1718030953785184000.wal","shard":"VZb9CMJQDdQO","time":"2024-06-10T12:16:22-04:00"}
{"action":"lsm_recover_from_active_wal_success","class":"BlogArticles","index":"blogarticles","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticles/skmc3QErGi6e/lsm/objects/segment-1718030635095733000.wal","shard":"skmc3QErGi6e","time":"2024-06-10T12:16:22-04:00"}
{"action":"lsm_recover_from_active_wal_success","class":"BlogArticleChunks","index":"blogarticlechunks","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/VZb9CMJQDdQO/lsm/property__id/segment-1718030953789315000.wal","shard":"VZb9CMJQDd

In [7]:
# create a new collection to hold the vectors
# each vector will be associated with a chunk of text from the blog article
# we are using OpenAI here, but this can be changed to another AI API
import weaviate.classes as wvc

collection_name = "BlogArticleChunks"

# Start from a clean slate: drop any existing collection with this name so the
# cell can be re-run without leftovers from a previous run.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# One object per text chunk: the source file name, the chunk text, and the
# chunk's position within that file.
blog_article_chunks = client.collections.create(
    name=collection_name,
    properties=[
        wvc.config.Property(name=property_name, data_type=property_type)
        for property_name, property_type in (
            ("filename", wvc.config.DataType.TEXT),
            ("chunk", wvc.config.DataType.TEXT),
            ("chunk_index", wvc.config.DataType.INT),
        )
    ],
    # With a vectorizer of "none" you would have to supply vectors yourself;
    # any other "text2vec-*" vectorizer could be substituted here.
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
    # The generative configuration is what makes generative (RAG) queries
    # possible on this collection.
    generative_config=wvc.config.Configure.Generative.openai(
        # model = "gpt-4-turbo",  # optional override, currently disabled
        temperature=0.2,  # closer to 0 is more deterministic, closer to 1 more random
    ),
)

In [8]:
import os, glob, re
from typing import List

# chunks the text into smaller chunk_size pieces
def chunk_text(text: str, chunk_size: int, overlap_size: int) -> List[str]:
    """Split ``text`` into overlapping, word-based chunks.

    The text is tokenised on whitespace. A new chunk starts every
    ``chunk_size`` words; every chunk after the first is additionally prefixed
    with the ``overlap_size`` words that precede its start, so neighbouring
    chunks share some context.

    Args:
        text: The text to split. Runs of whitespace (including newlines) count
            as a single separator, and leading/trailing whitespace is ignored.
        chunk_size: Number of new words each chunk advances by. Must be positive.
        overlap_size: Number of preceding words repeated at the start of each
            chunk after the first. Must not be negative.

    Returns:
        The chunks in document order, each with its words joined by single
        spaces. An empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size < 0:
        # A negative overlap would push each chunk's start forward and silently
        # drop words between chunks.
        raise ValueError("overlap_size must not be negative")

    # str.split() with no argument collapses whitespace runs and never yields
    # empty strings, so stray leading/trailing whitespace cannot create empty
    # "words" that would shift the chunk boundaries.
    words = text.split()

    return [
        " ".join(words[max(start - overlap_size, 0): start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def load_and_process_file(collection_name: str, file: str,
                          chunk_size: int = 150, overlap_size: int = 25):
    """Chunk one text file and insert its chunks into a Weaviate collection.

    Each inserted object carries the file's base name (``filename``), the chunk
    text (``chunk``) and the chunk's position within the file (``chunk_index``).

    Args:
        collection_name: Name of the collection that receives the chunks.
        file: Path of the text file to load.
        chunk_size: Words per chunk, passed on to ``chunk_text``.
        overlap_size: Words of overlap between chunks, passed on to ``chunk_text``.
    """
    blog_article_chunks = client.collections.get(collection_name)

    # Read from the `file` argument rather than a notebook-level variable, so
    # the function loads whichever path the caller passes in.
    with open(file, mode="r") as source:
        blog_text = source.read().replace("\n", " ")

    filename = os.path.basename(file)
    chunks_list = [
        {
            "filename": filename,
            "chunk": chunk,
            "chunk_index": index
        }
        for index, chunk in enumerate(chunk_text(blog_text, chunk_size, overlap_size))
    ]

    # An empty batch has nothing to store, so skip the insert call entirely.
    if chunks_list:
        blog_article_chunks.data.insert_many(chunks_list)

In [9]:
# Ingest every .txt file under ./blogs: each file is chunked and its chunks are
# inserted into the collection named by collection_name.
blog_files = glob.glob("./blogs/*.txt")

for blog_file in blog_files:
    load_and_process_file(collection_name, blog_file)

# Sanity check: ask the collection for its total object count and print it.
# Relies on blog_article_chunks, the collection handle created in the
# collection-setup cell.
response = blog_article_chunks.aggregate.over_all(total_count=True)
print(response.total_count)

1180


In [16]:
# define the RAG method
def rag_query(collection_name: str, question: str, group_task: str, max_results: int = 10):
    """Run a retrieval-augmented generation query and print the generated text.

    Looks up the collection through the notebook-level ``client``, issues a
    generative ``near_text`` query, and prints the ``generated`` field of the
    response. Nothing is returned.

    Args:
        collection_name: Name of the collection to query.
        question: Text passed as the ``near_text`` query.
        group_task: Prompt passed as ``grouped_task`` for the generative step.
        max_results: Value passed as the query ``limit`` (default 10).
    """
    generate_api = client.collections.get(collection_name).generate
    generated_text = generate_api.near_text(
        query=question,
        limit=max_results,
        grouped_task=group_task,
    ).generated

    print(generated_text)

In [17]:
# Baseline query: search the collection for the question and have the
# generative step condense the retrieved chunks into bullet points.
rag_query(
  collection_name, 
  "What is Solution Street?", 
  "Summarize the key information here in bullet points"
)

- Solution Street is an information technology consultancy based in Northern Virginia
- They help private, government, and non-profit organizations achieve their business goals through web applications
- Employees are experts in J2EE, .NET, PHP, and Ruby on Rails technologies
- They offer competitive salary, bonus, health/dental benefits, life insurance, PTO, 401(k), and a fun working environment
- Solution Street sponsors NovaJug and partners with Capital Area Food Bank
- They have been in business for 20 years and have made the Inc 5000 list for the third consecutive year
- Solution Street is a Salesforce Cloud Alliance partner and offers career opportunities for developers in various technologies


In [45]:
# Same question as the bullet-point query, but the grouped task now asks for a
# question-and-answer layout: only the task prompt changes, not the retrieval.
rag_query(
  collection_name, 
  "What is Solution Street?", 
  "Summarize the key information here into question and answer format"
)

1. What is Solution Street and what do they do?
- Solution Street is an information technology consultancy based in Northern Virginia that helps private, government, and non-profit organizations achieve their business goals through the design and rapid deployment of web applications.

2. What technologies do Solution Street employees specialize in?
- Solution Street employees are experts in building large, highly scalable and well-performing web applications using J2EE, .NET, PHP, and Ruby on Rails technologies.

3. What benefits does Solution Street offer to its employees?
- Solution Street offers a relaxed, fun, flexible working environment with competitive salary, bonus, 100% paid health/dental, life insurance, 15 days PTO per year, long-term disability, and 401(k) contributions. They also provide snacks and drinks in the office and are located near restaurants and the W&OD Trail.

4. How did Solution Street come up with its name?
- The name "Solution Street" was inspired by the fou

In [46]:

# A narrower question about one person's lessons learned, with the answer
# again requested as a bulleted list.
rag_query(
  collection_name, 
  "What are the top things that CEO Joel Nylund has learned over the years as an engineer?",
  "Summarize the key information here into a bulleted list"
)

- Joel Nylund has experience in entrepreneurship, consulting, and software engineering
- He is a partner at Solution Street, where he has built custom software for customers
- Joel emphasizes the importance of being an engineer, not just a coder
- He values communication skills and being a good listener in software engineering
- Joel highlights the significance of knowing and effectively using tools in software development
- He stresses the importance of being nice, supportive, and holding teammates accountable in a team setting
- Joel discusses the importance of collaboration and communication in software development projects
- He shares insights on building a great technical staff, including providing the best equipment and promoting continuous learning
- Joel reflects on the alignment of personal and professional values in his career at Solution Street


In [47]:
# Guard-rail experiment: the task tells the model to use only the supplied
# information and to answer "I do not know" otherwise.
# NOTE(review): the recorded output lists general company facts; it neither
# mentions a cricket event nor says it does not know. Confirm whether this
# instruction is effective as worded.
rag_query(
  collection_name, 
  "tell me about the Solution Street's cricket charity event",
  "Summarize the key information here into a bulleted list using only the information I give you. If you do not know, respond with I do not know."
)


- Solution Street partners with Capital Area Food Bank for charity sponsorship
- Capital Area Food Bank hosts events like The Great American Milk Drive and Blue Jeans Ball
- Solution Street is collecting nonperishable items for donation
- Solution Street is an IT consultancy helping organizations with web applications
- Solution Street offers competitive salary, benefits, and a fun working environment
- Solution Street supports various charities chosen by company-wide vote
- Solution Street is a sponsor of NovaJug
- Solution Street offers opportunities for internships in web development
- Solution Street celebrated its 15th year in business in 2017


In [48]:
# Structured-output variant: the grouped task asks for JSON objects with a
# `question` and an `answer` property. rag_query only prints the generated
# text; nothing here parses or validates it as JSON.
rag_query(
  collection_name, 
  "What are some tips and recommendations for improving client communication?",
  "Summarize the key information here into question and answer format as JSON with a question property and an answer property"
)

[
    {
        "question": "What are some key communication skills that are important for successful communication?",
        "answer": "Making others feel comfortable, listening to what they are saying, talking with them, speaking simply and clearly, and discussing important details in person or by phone."
    },
    {
        "question": "How can over-communication be effective in a team setting?",
        "answer": "Over-communicating through different channels such as oral delivery, written memos, email memos, and PowerPoints can help emphasize, reiterate, and achieve consistency in communication."
    },
    {
        "question": "Why is it important to document findings in communication?",
        "answer": "Documenting findings is important to ensure clarity, understanding, and accountability in communication, and to avoid misunderstandings or misinterpretations."
    },
    {
        "question": "What are some common interactions that consultants and others might have on a pro

In [49]:
# Off-topic control question, placed before flat-earth.txt is added to the
# collection. The recorded output summarizes unrelated blog content rather
# than answering the question.
# NOTE(review): "authorative" is a typo for "authoritative"; it is left
# unchanged so this prompt stays identical to the one used in the follow-up
# query after flat-earth.txt is ingested.
rag_query(
  collection_name, 
  "Is the earth flat or round?",
  "Summarize the key information here into an authorative paragraph"
)

The book "Switch: How to Change Things When Change is Hard" discusses the concept of exhaustion being mistaken for laziness, highlighting the limited amount of self-control or willpower individuals have. The book uses the Rider and Elephant metaphor to explain how to achieve change by directing the rational side (Rider), motivating the emotional side (Elephant), and shaping the environment (Path). Additionally, the challenges of working remotely while traveling are explored, including the need to adjust to different time zones and plan for logistical hurdles. The importance of good planning, finding the right work environment, and managing work hours are emphasized for a successful remote work experience. Furthermore, the potential impact of technology on future activities like buying clothes is discussed, predicting a shift towards virtual reality experiences for shopping. Overall, the importance of understanding personal limitations, effective planning, and adapting to changing envir

In [50]:
# Add ./other-files/flat-earth.txt to the same collection, so the earth-shape
# question can be asked again with this material available for retrieval.
# The steps mirror load_and_process_file: read, flatten newlines, chunk with
# chunk_size 150 and overlap_size 25, then insert the chunks in one call.
with open("./other-files/flat-earth.txt", mode = "r") as file:
    chunks_list = list()
    blog_text = file.read().replace("\n", " ")
    chunked_text = chunk_text(blog_text, 150, 25)

    # One object per chunk, tagged with the source file name and its position.
    for index, chunk in enumerate(chunked_text):
        properties = {
            "filename": "flat-earth.txt",
            "chunk": chunk,
            "chunk_index": index
        }
        chunks_list.append(properties)
    # Relies on blog_article_chunks, the collection handle created in the
    # collection-setup cell.
    blog_article_chunks.data.insert_many(chunks_list)

In [51]:
# Repeat the earth-shape question with the exact same prompts, now that
# flat-earth.txt is in the collection. The recorded answer is drawn from that
# file: the generated text follows whatever is retrieved, not established fact.
# NOTE(review): "authorative" is a typo for "authoritative"; kept so the
# prompt matches the earlier run of this question.
rag_query(
  collection_name, 
  "Is the earth flat or round?",
  "Summarize the key information here into an authorative paragraph"
)

Charles K. Johnson, president of the International Flat Earth Research Society, firmly believes that the Earth is flat, with the known, inhabited world being circular and flat like a phonograph record. According to Johnson, the North Pole is at the center, with a wall of southern ice at the outer edge that no one has crossed. He argues that the sun and moon are only about 32 miles in diameter and circle above the Earth near the equator, with their rising and setting being tricks of perspective. Johnson's beliefs are grounded in the Bible, and he refutes common beliefs about the Earth's shape, space exploration, and the Shuttle program. He contends that the main purpose of the space program is to perpetuate the myth that the Earth is a globe, and he aims to bring about a declaration that the Earth is flat, making the United States the first flat Earth nation in recorded history.


{"action":"lsm_init_disk_segment_build_bloom_filter_primary","class":"BlogArticleChunks","index":"blogarticlechunks","level":"debug","msg":"building bloom filter took 250.466µs\n","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/VZb9CMJQDdQO/lsm/objects/segment-1718030772935671000.db","shard":"VZb9CMJQDdQO","time":"2024-06-10T10:49:13-04:00","took":250466}
{"action":"lsm_init_disk_segment_build_bloom_filter_secondary","class":"BlogArticleChunks","index":"blogarticlechunks","level":"debug","msg":"building bloom filter took 324.905µs\n","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/VZb9CMJQDdQO/lsm/objects/segment-1718030772935671000.db","secondary_index_position":0,"shard":"VZb9CMJQDdQO","time":"2024-06-10T10:49:13-04:00","took":324905}
{"action":"lsm_memtable_flush_complete","class":"BlogArticleChunks","index":"blogarticlechunks","level":"debug","msg":"flush and switch took 4.100085ms\n","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/VZb9CMJQ

In [4]:
# Close the Weaviate client once all queries are done.
# NOTE(review): cells that use `client` after this point presumably need a
# fresh connection (re-run the connect cell) - confirm.
client.close()

In [None]:
###
### FOR RESETTING THE DATA
###
# Optional maintenance cell - not part of the normal top-to-bottom flow.
# delete collection - THIS WILL DELETE THE COLLECTION AND ALL ITS DATA
# NOTE(review): this cell sits after client.close(); it presumably has to be
# run while the client is still connected (or after reconnecting) - confirm.
client.collections.delete(collection_name)  # collection_name is set in the collection-setup cell