In [5]:
from dotenv import load_dotenv
import os

# Load variables from the local .env file into the process environment
load_dotenv()

# Fail fast with a readable message when a required OpenAI setting is missing.
# Assigning os.getenv(...) straight into os.environ raises an opaque TypeError
# when the variable is unset, because environment values must be strings.
for _required_var in ("OPENAI_API_KEY", "OPENAI_ORGANIZATION"):
    _value = os.getenv(_required_var)
    if not _value:
        raise EnvironmentError(
            f"{_required_var} is not set; add it to your .env file or environment."
        )
    os.environ[_required_var] = _value

### Set up the vector database

In [1]:
from weaviate import Client

# Connect to the Weaviate instance expected to be listening on localhost:8080
client = Client(url="http://localhost:8080")

In [23]:
def create_schema():
    """
    Creates the schema for the 'code_example' class in Weaviate.

    The class is vectorized with the 'text2vec-openai' module and has four
    properties: 'file_name', 'tags', 'description' and 'code'.

    WARNING: this removes the ENTIRE existing schema (every class, not only
    'code_example') before creating the class, via client.schema.delete_all().

    Returns:
        None
    """
    class_obj = {
        # Query results in this notebook are read back under 'Code_example'.
        "class": "code_example",
        "vectorizer": "text2vec-openai",
        "moduleConfig": {
            "text2vec-openai": {}
        },
        "properties": [
            {
                "name": "file_name",
                "dataType": ["string"],
                "description": "File name associated with the code"
            },
            {
                "name": "tags",
                "dataType": ["string[]"],
                "description": "Tags associated with the code"
            },
            {
                "name": "description",
                "dataType": ["text"],
                "description": "Description of the code",
                "moduleConfig": {
                    "text2vec-openai": {
                        # Must be a real boolean: the string 'true' is not a
                        # valid value for this flag.
                        "vectorizePropertyName": True
                    }
                },
            },
            {
                "name": "code",
                "dataType": ["text"],
                "description": "The actual code"
            }
        ]
    }

    client.schema.delete_all()  # Clear existing schema (all classes)
    client.schema.create_class(class_obj)  # Create the new class

# Create the schema (create_schema() first clears any existing schema)
create_schema()

### Populate the database with code examples

In [24]:
from weaviate.util import generate_uuid5 

def populate_weaviate(documents):
    """
    Imports the provided documents into the 'code_example' class in Weaviate.

    Existing data is NOT cleared by this function. Each object's UUID is
    derived from its file name via generate_uuid5.

    Args:
        documents: Sequence of mappings, each with the keys 'file_name',
            'tags', 'description' and 'code'.

    Returns:
        None
    """
    # An empty list would configure the batch with batch_size=0; there is
    # nothing to import in that case, so stop early.
    if not documents:
        print("No documents to import.")
        return

    # Configure batch so that all documents are sent together
    client.batch.configure(batch_size=len(documents))

    # Initialize a batch process
    with client.batch as batch:
        # Batch import data
        for i, doc in enumerate(documents):
            print(f"Importing document: {i+1}")
            try:
                properties = {
                    "file_name": doc["file_name"],
                    "tags": doc["tags"],
                    "description": doc["description"],
                    "code": doc["code"]
                }
                batch.add_data_object(
                    data_object=properties,
                    class_name="code_example",
                    uuid=generate_uuid5(properties["file_name"])
                )
            except Exception as e:
                # Best effort: report the failing document and keep importing the rest
                print(f"Error adding document to Weaviate: {e}")

In [25]:
import os
import json
import yaml

# Relative path of the directory that holds the example files to ingest
DATA_DIR = "../data"

# Define a function to extract tags, description, and code from a file
def extract_information(file_path):
    """
    Extracts tags, description, and code from an example file.

    The file is expected to contain a triple-double-quoted docstring whose
    body is YAML with 'tags' and 'description' keys. Everything after the
    closing delimiter is treated as the code.

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict with the keys 'file_name', 'tags', 'description' and 'code',
        or None (after printing the reason) when the docstring is missing or
        unterminated, its YAML is invalid or lacks a required key, or no code
        follows the docstring.
    """
    delimiter = '"""'

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Locate the docstring. The -1 checks must happen before any offset is
    # added, otherwise a missing delimiter silently becomes a valid index.
    opening = content.find(delimiter)
    if opening == -1:
        print(f"Error in file {file_path}: No docstring found.")
        return None
    docstring_start = opening + len(delimiter)

    docstring_end = content.find(delimiter, docstring_start)
    if docstring_end == -1:
        print(f"Error in file {file_path}: Docstring is not terminated.")
        return None

    # Parse the docstring as YAML
    try:
        metadata = yaml.safe_load(content[docstring_start:docstring_end])
    except yaml.YAMLError as e:
        print(f"Error in file {file_path}: Invalid YAML in docstring: {e}")
        return None

    # Plain-text or empty docstrings parse to a str/None rather than a mapping
    if not isinstance(metadata, dict):
        print(f"Error in file {file_path}: Docstring is not a YAML mapping.")
        return None
    missing = [key for key in ('tags', 'description') if key not in metadata]
    if missing:
        print(f"Error in file {file_path}: Missing metadata keys: {', '.join(missing)}.")
        return None

    # Extract code: everything after the closing docstring delimiter
    code = content[docstring_end + len(delimiter):].strip()
    if not code:
        print(f"Error in file {file_path}: No code found after docstring.")
        return None

    return {
        'file_name': os.path.basename(file_path),
        'tags': metadata['tags'],
        'description': metadata['description'],
        'code': code
    }

In [26]:
def process_files():
    """
    Extracts every '.py' example in DATA_DIR and populates the Weaviate database.

    Files are processed in sorted name order so repeated runs import them in
    the same sequence. A file for which extract_information() returns nothing
    is reported and skipped.

    Returns:
        None
    """
    documents = []
    for file_name in sorted(os.listdir(DATA_DIR)):
        if not file_name.endswith('.py'):
            continue

        data = extract_information(os.path.join(DATA_DIR, file_name))
        if data:
            # extract_information already records the base file name
            documents.append(data)
        else:
            print(f"Skipping {file_name}: could not extract tags, description, or code")

    # Populate Weaviate with extracted documents
    populate_weaviate(documents)

In [27]:
# Run the ingestion: extract the examples under DATA_DIR and load them into Weaviate
process_files()

Importing document: 1
Importing document: 2
Importing document: 3
Importing document: 4
Importing document: 5
Importing document: 6
Importing document: 7
Importing document: 8
Importing document: 9


### Query the database

In [2]:
def query_collection(query, limit=10):
    """
    Runs a near-text search against the 'code_example' class.

    Args:
        query: Text used as the single near-text concept.
        limit: Maximum number of results to request (defaults to 10).

    Returns:
        The response produced by the query's do() call. It requests the
        'description', 'code', 'file_name' and 'tags' properties plus the
        additional 'distance' field.
    """
    response = (
        client.query
        .get("code_example", ["description", "code", "file_name", "tags"])
        .with_near_text({
            "concepts": [query]
        })
        .with_limit(limit)
        .with_additional(["distance"])
        .do()
    )
    return response

In [5]:
def format_query_results(query_result):
    """
    Renders the code of every returned example as one string.

    Reads the result list at query_result['data']['Get']['Code_example'] and,
    for each entry, emits a '# Example:' header line followed by the entry's
    'code' value and a newline.
    """
    matches = query_result['data']['Get']['Code_example']
    return "".join(f"# Example:\n{match['code']}\n" for match in matches)

In [6]:
# Natural-language description of the desired chain; used below as the search text
new_request = """
- Accept a string named answer
- Format string as an object to pass to the prompt
- Create System and Human messages templates. The System message has formatted output instructions via Pydantic. The Human message uses answer as context. Output instructions format to a Pydantic schema for hmw_question with a question (description: up to 10 word "how might we" question) and a role (description: either marketing, technology, or design) 
- Pass messages to OpenAI
- Parse response using Pydantic
"""

# Query the collection
query_result = query_collection(new_request)

# Format the results; as the cell's last expression the string is displayed, not printed
format_query_results(query_result)

'# Example:\n# Create a chain that does the following and streams the response:\n# - Accept nothing as input\n# - Format messages from System and Human as a prompt\n# - Pass messages to OpenAI\n# - Parse the OpenAI response as a string\n# - Stream the response\n\nfrom langchain.chat_models import ChatOpenAI\nfrom langchain.prompts import ChatPromptTemplate\nfrom langchain.schema.messages import HumanMessage, SystemMessage\nfrom langchain.schema.output_parser import StrOutputParser\n\n# Generate system and human messages\nmessages = [\n    SystemMessage(content="You\'re a helpful assistant"),\n    HumanMessage(content="What is the purpose of model regularization?"),\n]\n\nprompt = ChatPromptTemplate.from_messages(messages)\nmodel = ChatOpenAI()\noutput_parser = StrOutputParser()\n\nchain = prompt | model | output_parser\n\n# Stream the chain\nfor chunk in chain.stream({}):\n    print(chunk, end="", flush=True)\n# Example:\n# Create a chain that does the following:\n# - Accept a string a