In [None]:
from dotenv import load_dotenv
from tqdm import tqdm
import sys

# Make the project's `app` directory and the repository root importable from this notebook.
# Each directory is appended at most once, so re-running the cell does not grow sys.path.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not find the
# project on another machine without editing them.
sys.path += [
    project_dir
    for project_dir in (
        "/Users/nehiljain/code/e2b-hackathon-oct/app",
        "/Users/nehiljain/code/e2b-hackathon-oct/",
    )
    if project_dir not in sys.path
]

# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API keys used by the scraping and LLM cells
# further down — confirm which variables are required.
load_dotenv()

In [None]:
# Blog posts to process; every URL in this list is loaded (from cache or by scraping)
# when the `blog_docs` cell iterates over it.
BLOG_URLS = ["https://e2b.dev/blog/python-code-interpreter-with-o1-and-gpt-4o"]

In [None]:
import os
import json
import hashlib
from langchain_community.document_loaders import FireCrawlLoader
from langchain_core.documents import Document

import tiktoken


def num_tokens_from_string(string: str, model_name: str) -> int:
    """
    Count how many tokens `string` occupies under the tokenizer tiktoken associates
    with `model_name`.

    Args:
    string (str): Text to tokenize.
    model_name (str): Model name handed to `tiktoken.encoding_for_model` to select the encoding.

    Returns:
    int: Length of the token sequence produced by encoding `string`.
    """
    # Look up the model's encoding, encode the text, and report the token sequence length.
    return len(tiktoken.encoding_for_model(model_name).encode(string))


def get_hashed_filename(url: str) -> str:
    """
    Derive a deterministic JSON filename from a URL.

    Args:
    url (str): The URL to turn into a filename.

    Returns:
    str: The hexadecimal MD5 digest of the encoded URL followed by ".json".
    """
    # MD5 serves purely as a stable name generator here, not as a security measure.
    digest = hashlib.md5()
    digest.update(url.encode())
    return f"{digest.hexdigest()}.json"


def load_docs_from_cache_or_scrape(
    url: str, cache_folder: str, model_name: str
) -> list:
    """
    Load the documents for `url` from the JSON cache, scraping with FireCrawlLoader on a miss.

    A cache file that exists but cannot be decoded as JSON (for example one left truncated
    by an interrupted run) is treated as a cache miss: the URL is scraped again and the file
    is rewritten. Fresh results are written to a temporary file and then moved into place,
    so a failure while writing never leaves a half-written cache entry behind.

    Every returned document gets a "token_count" entry in its metadata. The count is computed
    on each call and is not part of what gets written to the cache on a miss.

    Args:
    url (str): The URL to scrape.
    cache_folder (str): The folder to store cached JSON files (created if missing).
    model_name (str): The name of the OpenAI model to determine the encoding.

    Returns:
    list: The loaded documents with token count metadata.
    """
    # Ensure the cache folder exists
    os.makedirs(cache_folder, exist_ok=True)

    # One cache file per URL, named after the URL's hash
    cache_file_path = os.path.join(cache_folder, get_hashed_filename(url))

    docs = None
    if os.path.exists(cache_file_path):
        try:
            with open(cache_file_path, "r") as f:
                docs = [Document(**doc) for doc in json.load(f)]
        except json.JSONDecodeError:
            # Corrupt or truncated cache entry: fall through and scrape again rather than
            # failing on every call until someone deletes the file by hand.
            docs = None

    if docs is None:
        # Scrape the documents using FireCrawlLoader
        loader = FireCrawlLoader(url=url, mode="scrape")
        docs = loader.load()

        # Write to a sibling temp file first, then move it over the final name, so the
        # final path only ever holds a completely written file.
        tmp_file_path = cache_file_path + ".tmp"
        with open(tmp_file_path, "w") as f:
            json.dump([doc.dict() for doc in docs], f)
        os.replace(tmp_file_path, cache_file_path)

    # Add token count as metadata to each document
    for doc in docs:
        doc.metadata["token_count"] = num_tokens_from_string(
            doc.page_content, model_name
        )

    return docs


# NOTE(review): BLOG_URL is not referenced by any cell visible in this notebook (the
# loading cell iterates BLOG_URLS instead) — confirm whether it is still needed.
BLOG_URL = "https://cookbook.openai.com/examples/structured_outputs_intro"
# Folder, relative to the working directory, holding one cached JSON file per scraped URL.
CACHE_FOLDER = "./data/cache"
# Model name passed to the token counter to select the tiktoken encoding.
MODEL_NAME = "gpt-4o"

In [None]:
# Build one record per blog URL, pairing the URL with its documents (read from the cache
# when present, scraped otherwise). tqdm reports progress across the URLs.
blog_docs = [
    dict(
        url=blog_url,
        docs=load_docs_from_cache_or_scrape(blog_url, CACHE_FOLDER, MODEL_NAME),
    )
    for blog_url in tqdm(BLOG_URLS)
]

In [None]:
# Sanity check: for each blog, print its URL followed by the token count of its first
# document. Only index 0 is inspected; any further documents for a URL are not shown.
# Author's note: the counts seemed reasonable.
for blog in blog_docs:
    print(blog["url"])
    print(blog["docs"][0].metadata["token_count"])

In [None]:
import sys

# Make the project's `app` directory and the repository root importable before the
# `app.*` imports below. Directories already on sys.path are skipped, so this is a no-op
# when an earlier cell has already added them.
# NOTE(review): absolute, machine-specific paths — they must be edited to run elsewhere.
sys.path += [
    project_dir
    for project_dir in (
        "/Users/nehiljain/code/e2b-hackathon-oct/app",
        "/Users/nehiljain/code/e2b-hackathon-oct/",
    )
    if project_dir not in sys.path
]


from app.schemas import BlogCodeRecipe, IsBlogPostTechnical, CodeRecipeDescriptions
from app.llms import model4o
from app.prompts import (
    extract_is_blog_post_technical_prompt,
    extract_code_metadata_prompt,
    extract_all_code_recipes_prompt,
)

# Wrap the shared model so its responses are returned as the given schema objects
# (LangChain `with_structured_output`).
extract_tech_deets_model = model4o.with_structured_output(IsBlogPostTechnical)
extract_code_deets_model = model4o.with_structured_output(BlogCodeRecipe)
# Chain 1: prompt -> IsBlogPostTechnical.
first_pass_details_chain = (
    extract_is_blog_post_technical_prompt | extract_tech_deets_model
)

# Chain 2: prompt -> BlogCodeRecipe; invoked in a later cell, once per recipe description.
first_pass_code_chain = extract_code_metadata_prompt | extract_code_deets_model

# Run chain 1 on the text of the first document of the LAST blog in blog_docs.
# NOTE(review): metadata_details is not read by any cell visible in this notebook.
metadata_details = first_pass_details_chain.invoke(
    {"blog_post": blog_docs[-1]["docs"][0].page_content}
)


# Chain 3: prompt -> CodeRecipeDescriptions, run on the same blog post text as above.
extract_code_recipe_description_chain = (
    extract_all_code_recipes_prompt
    | model4o.with_structured_output(CodeRecipeDescriptions)
)
code_recipe_descriptions = extract_code_recipe_description_chain.invoke(
    {"blog_post": blog_docs[-1]["docs"][0].page_content}
)

In [None]:
# Display the extracted recipe descriptions as the cell output.
# NOTE(review): presumably a pydantic model, so model_dump() yields a plain dict — confirm.
code_recipe_descriptions.model_dump()

In [None]:
# Pair every extracted recipe description with the text of the blog post it came from.
# The descriptions were extracted from blog_docs[-1], so the same post is supplied here;
# indexing blog_docs[0] would hand the model a different post as soon as BLOG_URLS
# contains more than one entry.
code_recipe_extraction_inputs = [
    {
        "code_recipe_description": str(code_recipe_description),
        "blog_post": blog_docs[-1]["docs"][0].page_content,
    }
    for code_recipe_description in code_recipe_descriptions.recipes
]
# One chain call per recipe description, issued as a single batch.
code_details = first_pass_code_chain.batch(code_recipe_extraction_inputs)

In [None]:
# Print the full text of the first document of the last blog post for manual inspection.
print(blog_docs[-1]["docs"][0].page_content)

In [None]:
from app.e2b_runner import run_code_project
from app.schemas import BlogCodeRecipe, CodeFile


def _parse_env_content(env_content: str) -> dict:
    """Parse KEY=VALUE lines into a dict, skipping blank lines and `#` comment lines."""
    parsed = {}
    for line_number, raw_line in enumerate(env_content.splitlines(), start=1):
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if "=" not in stripped:
            # Report only the position: the line itself may contain a secret.
            raise ValueError(
                f"env_content line {line_number} is not in KEY=VALUE form"
            )
        # Split on the FIRST "=" only, so values may themselves contain "="
        # (base64 padding, URLs with query strings, ...).
        key, value = stripped.split("=", 1)
        parsed[key.strip()] = value.strip()
    return parsed


def update_env_file(blog_code_recipe: "BlogCodeRecipe", env_content: str) -> None:
    """
    Merge KEY=VALUE pairs from `env_content` into the recipe's `.env` file, in place.

    The first entry of `blog_code_recipe.code` whose `filepath` is ".env" has its
    `content` rewritten: lines whose key appears in `env_content` get the new value,
    all other lines (including comments and blank lines) are kept as they are, and keys
    that were not present are appended at the end. If the recipe has no ".env" entry,
    nothing is changed.

    Args:
    blog_code_recipe (BlogCodeRecipe): Recipe whose `code` entries expose `filepath` and `content`.
    env_content (str): Newline-separated KEY=VALUE pairs. Blank lines and lines starting
        with "#" are ignored; surrounding whitespace of keys and values is removed.

    Raises:
    ValueError: If a non-blank, non-comment line of `env_content` contains no "=".
    """
    env_overrides = _parse_env_content(env_content)

    for code_file in blog_code_recipe.code:
        if code_file.filepath != ".env":
            continue

        updated_env_lines = []
        existing_keys = set()
        for line in code_file.content.split("\n"):
            if line.strip() and "=" in line:
                key = line.split("=", 1)[0].strip()
                existing_keys.add(key)
                if key in env_overrides:
                    line = f"{key}={env_overrides[key]}"
            updated_env_lines.append(line)

        # Add new keys from env_content that don't exist in the current .env file
        updated_env_lines.extend(
            f"{key}={value}"
            for key, value in env_overrides.items()
            if key not in existing_keys
        )

        code_file.content = "\n".join(updated_env_lines)
        break  # only the first .env entry is updated


# KEY=VALUE lines to merge into the recipe's .env file before running it.
# Replace the placeholder below with real values: the placeholder line contains no "=",
# so update_env_file raises ValueError on it as written. Do not commit real secrets here.
env_content = """
... INPUT HERE
"""

# Try only the first extracted recipe.
blog_recipe_to_test = code_details[0]
# Mutates the recipe's ".env" entry in place (no return value).
update_env_file(blog_recipe_to_test, env_content)
print(f"Running code project: {blog_recipe_to_test.title}")
# NOTE(review): run_code_project comes from app.e2b_runner; presumably it executes the
# recipe's files in a sandbox — confirm there. Its result exposes exit_code/stdout/stderr.
result = run_code_project(blog_recipe_to_test)
print(f"Exit Code: {result.exit_code}")
print(f"Standard Output: {result.stdout}")
print(f"Standard Error: {result.stderr}")
print("\n" * 3)

In [None]:
# for blog_recipe_to_test in code_details:
#     print(f"Testing : {blog_recipe_to_test.title}")
#     update_env_file(blog_recipe_to_test, env_content)
#     print(f"Running code project: {blog_recipe_to_test.title}")
#     result = run_code_project(blog_recipe_to_test)
#     print(f"Exit Code: {result.exit_code}")
#     print(f"Standard Output: {result.stdout}")
#     print(f"Standard Error: {result.stderr}")
#     print("\n" * 3)