In [None]:
# Load the environment variables for this project.
# load_dotenv() is called with no arguments, so python-dotenv uses its default
# lookup for the .env file.
# NOTE(review): the OPENAI_APIKEY variable read by a later cell is presumably
# defined in that .env file — confirm.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# install the Weaviate client
# pip install -U weaviate-client

import weaviate
import os

# The OpenAI key is read from the OPENAI_APIKEY environment variable (it is not
# hard-coded here). Fail early with a clear message when it is missing, instead
# of passing None as a header value to the client.
openai_api_key = os.getenv("OPENAI_APIKEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_APIKEY is not set; define it in your environment or .env file "
        "before connecting to Weaviate."
    )

# Start/connect to an embedded Weaviate instance, forwarding the OpenAI key so
# the OpenAI-backed modules can call the API on our behalf.
client = weaviate.connect_to_embedded(
    headers = {
        "X-OpenAI-Api-Key": openai_api_key
    }
)

In [None]:
# create a new collection to hold the vectors
# we are using OpenAI here, but this can be changed to another AI API
import weaviate.classes as wvc

collection_name = "BlogArticles"

# If the collection already exists, delete it so this cell can be re-run safely
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# Bind the collection handle to `blog_articles_collection` (the name the rest of
# the notebook uses for it) rather than `blog_articles`, which a later cell
# uses for the list of article dicts.
blog_articles_collection = client.collections.create(
    name = collection_name,
    vectorizer_config = wvc.config.Configure.Vectorizer.text2vec_openai(),  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    generative_config = wvc.config.Configure.Generative.openai()  # Ensure the `generative-openai` module is used for generative queries
)

In [None]:
# now we load each blog article and store it along with the blog metadata;
# the collection is expected to create the embedding vector for each object on insert
import os
import glob

blog_articles = list()
# sort the matches so articles are loaded in the same order on every run
# (glob returns paths in an arbitrary, filesystem-dependent order)
blog_dir = sorted(glob.glob('blogs/*.txt'))

for blog_file in blog_dir:
  blog_filename = os.path.basename(blog_file)
  # explicit encoding so the result does not depend on the platform default
  # NOTE(review): assumes the blog files are UTF-8 encoded — confirm
  with open(blog_file, mode = 'r', encoding = 'utf-8') as file:
    blog_articles.append({
        "filename": blog_filename,
        # flatten each article to a single line of text
        "content": file.read().replace('\n', ' ')
    })

# fail with a clear message instead of sending an empty batch to Weaviate
if not blog_articles:
  raise FileNotFoundError("No blog articles found matching 'blogs/*.txt'")

blog_articles_collection = client.collections.get(collection_name)
insert_result = blog_articles_collection.data.insert_many(blog_articles)

# insert_many reports per-object failures in its return value rather than
# raising, so surface them here instead of silently dropping articles
if insert_result.errors:
  for failed_index, error in insert_result.errors.items():
    print(f"Failed to insert {blog_articles[failed_index]['filename']}: {error.message}")



In [None]:
from weaviate.classes.query import MetadataQuery

# given a blog article, let's find the top 5 similar articles using the weaviate client
TOP_K = 5

for item in blog_articles_collection.iterator():
    filename = item.properties['filename']
    content = item.properties['content']

    response = blog_articles_collection.query.near_text(
        query = content,
        # the article itself is normally among the hits, so ask for one extra
        limit = TOP_K + 1,
        return_metadata = MetadataQuery(distance = True)
    )

    print(f"Similar articles to {filename}:")

    # drop the query article itself, then cap at TOP_K so we never print more
    # than requested even if the article was not among the returned hits
    similar_articles = [
        match for match in response.objects
        if match.properties['filename'] != filename
    ][:TOP_K]

    for match in similar_articles:
        similar_filename = match.properties['filename']
        distance = match.metadata.distance
        print(f"\t{similar_filename} (distance: {distance})")

    print("\n\n")


In [None]:
# close the client
# NOTE(review): cells executed after this one that still use `client` need an
# open connection, so this should be the last client operation in a run.
client.close()

In [None]:
###
### FOR RESETTING THE DATA
###
# delete collection "BlogArticles" - THIS WILL DELETE THE COLLECTION AND ALL ITS DATA
# When the notebook is run top to bottom the client has already been closed by
# the time this cell executes, so reconnect first if necessary and restore the
# closed state afterwards.
reconnected_for_reset = not client.is_connected()
if reconnected_for_reset:
    client.connect()

try:
    client.collections.delete(collection_name)  # Replace with your collection name
finally:
    if reconnected_for_reset:
        client.close()