In [None]:
%pip install "weaviate-client==3.*"

### Extract information from code files

In [None]:
import os
import json
import yaml

# Directory that is scanned for example .py files. The path is relative, so it
# is resolved against the kernel's current working directory.
DATA_DIR = "../data"

# Define a function to extract tags, description, and code from a file
def extract_information(file_path):
    """
    Extract the metadata and the code from a single example file.

    The first triple-double-quoted string in the file is taken as the
    docstring and parsed as YAML; it must be a mapping with 'tags' and
    'description' keys. Everything after the closing delimiter is the code.

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict with the keys 'file_name', 'tags', 'description' and 'code',
        or None (after printing an error) when the file has no complete
        docstring, the docstring is not valid YAML with both required keys,
        or no code follows the docstring.
    """
    delimiter = '"""'

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # str.find returns -1 on a miss, so the raw result has to be checked
    # *before* the delimiter length is added to it.
    opening = content.find(delimiter)
    if opening == -1:
        print(f"Error in file {file_path}: No docstring found.")
        return None
    docstring_start = opening + len(delimiter)

    docstring_end = content.find(delimiter, docstring_start)
    if docstring_end == -1:
        print(f"Error in file {file_path}: Docstring is not terminated.")
        return None

    # Parse the docstring as YAML
    try:
        metadata = yaml.safe_load(content[docstring_start:docstring_end])
    except yaml.YAMLError as error:
        print(f"Error in file {file_path}: Invalid YAML in docstring: {error}")
        return None

    # safe_load gives None for an empty document and a str/list for YAML that
    # is not a mapping, so make sure both keys can actually be looked up.
    if not isinstance(metadata, dict) or 'tags' not in metadata or 'description' not in metadata:
        print(f"Error in file {file_path}: Docstring must define 'tags' and 'description'.")
        return None

    # Everything after the closing delimiter is the code
    code = content[docstring_end + len(delimiter):].strip()
    if not code:
        print(f"Error in file {file_path}: No code found after docstring.")
        return None

    return {
        'file_name': os.path.basename(file_path),
        'tags': metadata['tags'],
        'description': metadata['description'],
        'code': code
    }

# Define the main function to iterate through the files and extract information
def main():
    """
    Print the extracted information of every Python file in DATA_DIR.

    Each regular file whose name ends in '.py' is passed to
    extract_information and the result is printed as indented JSON. Files
    for which nothing could be extracted are skipped.
    """
    for file_name in os.listdir(DATA_DIR):
        file_path = os.path.join(DATA_DIR, file_name)

        # Only regular Python files; this also skips directories named '*.py'
        if not (file_name.endswith('.py') and os.path.isfile(file_path)):
            continue

        file_data = extract_information(file_path)
        # A None result means the extraction failed; do not print "null" for it
        if file_data is None:
            continue

        print(json.dumps(file_data, indent=2))

# Preview step: print what is extracted from each example file in DATA_DIR
main()

### Set up the vector database

In [None]:
from weaviate import Client

# Initialize the Weaviate client against an instance on localhost:8080.
# NOTE(review): no OpenAI API key is passed here although the schema below uses
# the text2vec-openai vectorizer; presumably the key is configured on the
# Weaviate server itself - confirm.
client = Client("http://localhost:8080")

In [None]:
def create_schema():
    """
    Creates the schema for the 'code_example' class in Weaviate.

    The class is vectorized with the text2vec-openai module and has four
    properties: file_name, tags, description and code.

    WARNING: this calls client.schema.delete_all() first, which removes the
    whole existing schema of the connected instance, not only this class.
    """
    class_obj = {
        "class": "code_example",
        "vectorizer": "text2vec-openai",
        "moduleConfig": {
            "text2vec-openai": {}
        },
        "properties": [
            {
                "name": "file_name",
                "dataType": ["string"],
                "description": "File name associated with the code"
            },
            {
                "name": "tags",
                "dataType": ["string[]"],
                "description": "Tags associated with the code"
            },
            {
                "name": "description",
                "dataType": ["text"],
                "description": "Description of the code",
                "moduleConfig": {
                    "text2vec-openai": {
                        # Must be a real boolean: the string 'true' is
                        # serialized as a JSON string, not as JSON true.
                        "vectorizePropertyName": True
                    }
                },
            },
            {
                "name": "code",
                "dataType": ["text"],
                "description": "The actual code"
            }
        ]
    }

    client.schema.delete_all()  # Clear existing schema
    client.schema.create_class(class_obj)  # Create the new class

# (Re)create the schema. Note that create_schema() deletes the existing
# schema first, so re-running this cell starts from an empty class.
create_schema()

In [None]:
def query_collection():
    """
    Fetch at most two objects of the 'code_example' class.

    Returns:
        The raw response of the query, with the description, code,
        file_name and tags properties requested for each object.
    """
    requested_properties = ["description", "code", "file_name", "tags"]
    query = client.query.get("code_example", requested_properties)
    query = query.with_limit(2)
    return query.do()

# Sanity check: query the collection right after the schema was recreated.
# Nothing has been imported yet, so this should return 0 results.
query_result = query_collection()
print(query_result)

### Populate the Weaviate database

In [None]:
from weaviate.util import generate_uuid5 

def populate_weaviate(documents):
    """
    Imports the provided documents into the 'code_example' class in Weaviate.

    All documents are added through one client batch whose size is the number
    of documents. The UUID of each object is derived from its file name with
    generate_uuid5. This function does not delete anything.

    Args:
        documents: List of dicts, each with the keys 'file_name', 'tags',
            'description' and 'code'.

    Returns:
        None. A document that cannot be added is reported with a printed
        message and the import continues with the next one.
    """
    # batch_size must be a positive number, so an empty list cannot be used
    # to configure the batch; there is nothing to import in that case anyway.
    if not documents:
        print("No documents to import.")
        return

    # Configure batch
    client.batch.configure(batch_size=len(documents))

    # Initialize a batch process
    with client.batch as batch:
        # Batch import data
        for i, doc in enumerate(documents):
            print(f"Importing document: {i+1}")
            try:
                properties = {
                    "file_name": doc["file_name"],
                    "tags": doc["tags"],
                    "description": doc["description"],
                    "code": doc["code"]
                }
                batch.add_data_object(
                    data_object=properties,
                    class_name="code_example",
                    uuid=generate_uuid5(properties["file_name"])
                )
            except Exception as e:
                # Best effort: report the failing document and keep going
                print(f"Error adding document to Weaviate: {e}")

In [None]:
def process_files():
    """
    Extract every Python example in DATA_DIR and import the results into Weaviate.

    Each regular file whose name ends in '.py' is passed to
    extract_information. Files that yield no data are reported with a printed
    message and skipped; the remaining documents are handed to
    populate_weaviate in one call.
    """
    documents = []
    for file_name in os.listdir(DATA_DIR):
        file_path = os.path.join(DATA_DIR, file_name)

        # Same filter as main(): regular .py files only, no directories
        if not (file_name.endswith('.py') and os.path.isfile(file_path)):
            continue

        data = extract_information(file_path)

        if data:
            documents.append({"file_name": file_name, **data})
        else:
            # Report the failure directly; no log_error helper is defined in this notebook
            print(f"Error in file {file_name}: could not extract tags, description, or code.")

    # Nothing usable was found, so there is nothing to send to Weaviate
    if not documents:
        print(f"No documents extracted from {DATA_DIR}; nothing to import.")
        return

    # Populate Weaviate with extracted documents
    populate_weaviate(documents)

In [None]:
# Extract the examples from DATA_DIR and import them into Weaviate
process_files()