In [None]:
%pip install weaviate-client

### Extract information from code files

In [22]:
import os
import json
import yaml

# Define the path to the data directory
DATA_DIR = "../data"

# Define a function to extract tags, description, and code from a file
def extract_information(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    # Extract the docstring
    docstring_start = content.find('"""') + 3
    docstring_end = content.find('"""', docstring_start)
    docstring = content[docstring_start:docstring_end]

    # Parse the docstring as YAML
    metadata = yaml.safe_load(docstring)

    # Extract tags and description
    tags = metadata['tags']
    description = metadata['description']

    # Extract code
    code_start = content.find('"""', docstring_end) + len('"""')
    if code_start != -1:
        code = content[code_start:].strip()
    else:
        print(f"Error in file {file_path}: No code found after docstring.")
        return

    return {
        'file_name': os.path.basename(file_path),
        'tags': tags,
        'description': description,
        'code': code
    }

# Define the main function to iterate through the files and extract information
def main():
    for file_name in os.listdir(DATA_DIR):
        # Check if the file is a Python file and not a subdirectory
        if file_name.endswith('.py') and os.path.isfile(os.path.join(DATA_DIR, file_name)):
            file_path = os.path.join(DATA_DIR, file_name)
            file_data = extract_information(file_path)
            print(json.dumps(file_data, indent=2))

# Run the main function
main()

{
  "file_name": "example_4.py",
  "tags": [
    "langchain"
  ],
  "description": "Create a chain that does the following and streams the response:\n- Accept a string as input\n- Format messages from System and Human as a prompt\n- Pass messages to OpenAI\n- Parse the OpenAI response as a string\n",
  "code": "from langchain.chat_models import ChatOpenAI\nfrom langchain.prompts import ChatPromptTemplate\nfrom langchain.schema.messages import HumanMessage, SystemMessage\nfrom langchain.schema.output_parser import StrOutputParser\n\n# Generate system and human messages\nmessages = [\n    SystemMessage(content=\"You're a helpful assistant\"),\n    HumanMessage(content=\"What is the purpose of model regularization?\"),\n]\n\nprompt = ChatPromptTemplate.from_messages(messages)\nmodel = ChatOpenAI()\noutput_parser = StrOutputParser()\n\nchain = prompt | model | output_parser\n\n# Stream the chain\nfor chunk in chain.stream({}):\n    print(chunk, end=\"\", flush=True)"
}
{
  "file_name": "ex

### Set up the vector database

In [34]:
from weaviate import Client

# Initialize Weaviate client
client = Client("http://localhost:8080")

In [36]:
def create_schema():
    """
    Creates the schema for the 'code_example' class in Weaviate.
    """
    class_obj = {
        "class": "code_example",
        "vectorizer": "text2vec-openai",
        "moduleConfig": {
            "text2vec-openai": {},
            "generative-openai": {}
        },
        "properties": [
            {
                "name": "tags",
                "dataType": ["string[]"],
                "description": "Tags associated with the code"
            },
            {
                "name": "description",
                "dataType": ["text"],
                "description": "Description of the code"
            },
            {
                "name": "code",
                "dataType": ["text"],
                "description": "The actual code"
            }
        ]
    }

    client.schema.delete_all()  # Clear existing schema
    client.schema.create_class(class_obj)  # Create the new class

# Create the schema
create_schema()

In [39]:
def query_collection():
    response = (
        client.query
        .get("code_example", ["description", "code"])
        .with_limit(2)
        .do()
    )
    return response

# Query the collection (should return 0 results initially)
query_result = query_collection()
print(query_result)

{'data': {'Get': {'Code_example': []}}}


### Populate the weaviate database

In [28]:
def populate_weaviate(documents):
    """
    Populates the Weaviate database with the provided documents.
    Clears the database if it is already populated.
    """
    # Check if the database is already populated
    if client.data_object.get("code_example", limit=1)["total"] > 0:
        # Clear the database
        client.schema.delete_all()

    # Add each document to the Weaviate database
    for doc in documents:
        try:
            # Embed description using OpenAI
            description_embedding = client.batch.openai.embed([doc["description"]])[0]
            client.data_object.create(
                data_object=doc,
                class_name="code_example",
                vector=description_embedding,
                uuid=str(uuid.uuid4())
            )
        except Exception as e:
            print(f"Error adding document to Weaviate: {e}")

In [29]:
def process_files():
    """
    Modified function to process files and populate Weaviate database.
    """
    documents = []
    for file_name in os.listdir(DATA_DIR):
        if file_name.endswith('.py'):
            file_path = os.path.join(DATA_DIR, file_name)
            data = extract_information(file_path)
            
            if data:
                documents.append({"file_name": file_name, **data})
            else:
                log_error(file_name, "tags, description, or code")

    # Populate Weaviate with extracted documents
    populate_weaviate(documents)

In [30]:
# Run the main function
process_files()

ValueError: Not valid 'uuid' or 'uuid' can not be extracted from value