In [1]:
# loads the environment variables for this project
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# install the Weaviate client
# pip install -U weaviate-client

import weaviate
import os

client = weaviate.connect_to_embedded(
    headers = {
        "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")  # Replace with your API key
    }
)

Started /Users/ghodum/.cache/weaviate-embedded: process ID 67563


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-06-10T10:17:56-04:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-06-10T10:17:56-04:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-06-10T10:17:56-04:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-06-10T10:17:56-04:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-06-10T10:17:56-04:00"}


{"action":"lsm_recover_from_active_wal_success","class":"BlogArticleChunks","index":"blogarticlechunks","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/oT4HrI2EH3Jy/lsm/objects/segment-1718029022701378000.wal","shard":"oT4HrI2EH3Jy","time":"2024-06-10T10:17:57-04:00"}
{"action":"lsm_recover_from_active_wal_success","class":"BlogArticles","index":"blogarticles","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticles/mBblO7faYogL/lsm/objects/segment-1718028733899343000.wal","shard":"mBblO7faYogL","time":"2024-06-10T10:17:57-04:00"}
{"action":"lsm_recover_from_active_wal_success","class":"BlogArticleChunks","index":"blogarticlechunks","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/ghodum/.local/share/weaviate/blogarticlechunks/oT4HrI2EH3Jy/lsm/property_chunk_index/segment-1718029022736032000.wal","shard":"oT

In [3]:
# create a new collection to hold the vectors
# we are using OpenAI here, but this can be changed to another AI API
import weaviate.classes as wvc

collection_name = "BlogArticles"

# If the collection already exists, delete it
if client.collections.exists(collection_name): 
    client.collections.delete(collection_name)

blog_articles = client.collections.create(
    name = collection_name,
    vectorizer_config = wvc.config.Configure.Vectorizer.text2vec_openai(),  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    generative_config = wvc.config.Configure.Generative.openai()  # Ensure the `generative-openai` module is used for generative queries
)

{"level":"info","msg":"Created shard blogarticles_skmc3QErGi6e in 1.805148ms","time":"2024-06-10T10:18:02-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-06-10T10:18:02-04:00","took":67276}


In [4]:
# now we load each blog article, create an embedding vector, and store it along with the blog metadata
import os
import glob

blog_articles = list()
blog_dir = glob.glob('blogs/*.txt')

for blog_file in blog_dir:
  blog_filename = os.path.basename(blog_file)
  with open(blog_file, mode = 'r') as file:
    blog_articles.append({
        "filename": blog_filename,
        "content": file.read().replace('\n', ' ')
    })

blog_articles_collection = client.collections.get(collection_name)
blog_articles_collection.data.insert_many(blog_articles)



BatchObjectReturn(all_responses=[UUID('c5bf0c01-5a2d-4729-a118-116655e0a6bb'), UUID('584dea00-1188-4974-a9dd-244f22861c96'), UUID('618e8ed0-d121-4630-bab9-32702235fb3f'), UUID('d5a1a468-4e15-4a18-86e8-7576fa60f913'), UUID('a2df472f-f66a-45ab-af53-84869cf322c5'), UUID('2833cbb0-df2c-45f7-b234-f3fb8bc939a5'), UUID('9bd527ff-ecb3-4485-9ab0-1ab1b056b29b'), UUID('116ecbd4-521d-4b1a-8f92-b4b0d910e14e'), UUID('0c1f260b-d50d-46a4-b922-4e6a0e55b544'), UUID('8b74feae-aa3d-4f5c-8424-5aa96a08c19e'), UUID('21a73d8f-1a52-4dea-94ca-358add6bfac8'), UUID('e0e51bf9-e686-4b90-a000-7752f4b8b3ac'), UUID('5cdbc440-59b1-4a0f-b758-85edeeac326a'), UUID('80397797-d760-4c25-ae1c-1576e096301b'), UUID('49629106-7de5-4a0a-85e0-f02c2231a515'), UUID('524b1a9a-8708-40bf-8890-a5bf8119b6ca'), UUID('1c9dc3f8-da12-4a66-8d83-4e93467e9220'), UUID('31dbd028-48bd-4b81-b2c1-f48732abb630'), UUID('f4042536-9d06-4bee-bb92-c431095bf0ef'), UUID('89f564cf-e12e-4ab7-b482-769cc1270b48'), UUID('51cbf92e-06cd-4af1-95d3-16055a188845'), U

In [5]:
from weaviate.classes.query import MetadataQuery

# given a blog article, let's find the top 5 similar articles using the weaviate client

for item in blog_articles_collection.iterator():
    filename = item.properties['filename']
    content = item.properties['content']

    response = blog_articles_collection.query.near_text(
        query = content,
        limit = 6, # we want the top 5 similar articles, but we also get the same article back, so we ask for 6
        return_metadata = MetadataQuery(distance = True)
    )
    
    print(f"Similar articles to {filename}:")

    for object in response.objects:
        similar_filename = object.properties['filename']
    
        # skip the same file
        if similar_filename == filename:
            continue

        distance = object.metadata.distance
        print(f"\t{similar_filename} (distance: {distance})")

    print("\n\n")


Similar articles to flutterflow-tutorial-part-2.txt:
	flutterflow-tutorial-part-1.txt (distance: 0.07137751579284668)
	lets-take-a-low-code-platform-for-a-test-drive.txt (distance: 0.17590725421905518)
	lets-take-a-low-code-platform-for-a-test-drive-part-2.txt (distance: 0.18758714199066162)
	rapid-development-of-mobile-apps-using-react-native-and-expo.txt (distance: 0.18983227014541626)
	learning-react-js-by-example.txt (distance: 0.19688403606414795)



Similar articles to rapid-development-of-mobile-apps-using-react-native-and-expo.txt:
	getting-react-on-the-rails.txt (distance: 0.14515560865402222)
	learning-react-js-by-example.txt (distance: 0.1489618420600891)
	making-sense-of-mobile-application-types-and-frameworks.txt (distance: 0.1522362232208252)
	learning-react-js-by-example-part-2.txt (distance: 0.1568998098373413)
	testing-react.txt (distance: 0.15968042612075806)



Similar articles to thankfulness.txt:
	what-i-learned-from-30-years-of-professional-software-engineering.tx

In [6]:
# close the client
client.close()

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-06-10T10:19:06-04:00"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-06-10T10:19:06-04:00"}


In [None]:
###
### FOR RESETTING THE DATA
###
# delete collection "BlogArticles" - THIS WILL DELETE THE COLLECTION AND ALL ITS DATA
client.collections.delete(collection_name)  # Replace with your collection name