In [1]:
from dotenv import load_dotenv
from tqdm import tqdm
import sys
# Make the local project importable: first its `app` directory, then the
# repository root. Entries that are already on sys.path are left untouched.
for _project_dir in (
    '/Users/nehiljain/code/e2b-hackathon-oct/app',
    '/Users/nehiljain/code/e2b-hackathon-oct/',
):
    if _project_dir not in sys.path:
        sys.path.append(_project_dir)

load_dotenv()

True

In [2]:
# Pages that the notebook loads (from the local cache or by scraping) further
# down. Each entry is passed unchanged to load_docs_from_cache_or_scrape, and
# the cache filename is derived from the exact URL string, so editing a URL
# (even just its fragment) results in a fresh scrape.
BLOG_URLS = [
    "https://cookbook.openai.com/examples/structured_outputs_intro",
    "https://cookbook.openai.com/examples/third_party/web_search_with_google_api_bring_your_own_browser_tool",
    "https://fireworks.ai/blog/firellava-the-first-commercially-permissive-oss-llava-model",
    "https://python.useinstructor.com/examples/exact_citations/",
    "https://tomaugspurger.net/posts/modern-1-intro/",
    "https://www.restack.io/docs/prefect-knowledge-prefect-1-tutorial-guide",
    "https://e2b.dev/blog/guide-groq-js",
    "https://e2b.dev/docs/getting-started/api-key#use-api-key"
]

In [3]:
import os
import json
import hashlib
from langchain_community.document_loaders import FireCrawlLoader
from langchain_core.documents import Document

import tiktoken

def num_tokens_from_string(string: str, model_name: str) -> int:
    """
    Count how many tokens `string` occupies under the tokenizer that tiktoken
    associates with `model_name`.

    Text that happens to contain special-token markers (for example the
    literal "<|endoftext|>") is tokenized as ordinary text instead of raising,
    so arbitrary scraped content can always be counted.

    Args:
    string (str): The input text to be tokenized.
    model_name (str): The name of the OpenAI model to determine the encoding.

    Returns:
    int: The number of tokens in the input string.

    Raises:
    KeyError: If tiktoken cannot map `model_name` to an encoding.
    """
    # Get the encoding for the specified model
    encoding = tiktoken.encoding_for_model(model_name)
    # By default `encode` refuses input containing special-token markers and
    # raises ValueError. Web pages about LLMs can contain those markers
    # verbatim, so the check is disabled and they are counted as plain text.
    token_ids = encoding.encode(string, disallowed_special=())
    return len(token_ids)

def get_hashed_filename(url: str) -> str:
    """
    Derive a deterministic JSON filename from a URL.

    The name is the hexadecimal MD5 digest of the URL's encoded bytes followed
    by the ".json" extension. The digest is used only as a stable identifier.

    Args:
    url (str): The URL to be hashed.

    Returns:
    str: The hashed filename.
    """
    url_bytes = url.encode()
    digest_hex = hashlib.md5(url_bytes).hexdigest()
    return f"{digest_hex}.json"

def load_docs_from_cache_or_scrape(url: str, cache_folder: str, model_name: str) -> list:
    """
    Load documents from cache if available, otherwise scrape using FireCrawlLoader and save to cache.
    Adds token count as metadata to each document.

    The cache file lives in `cache_folder` and is named after the MD5 hash of
    `url` (see get_hashed_filename). The token count is computed on every call
    after loading; it is not part of what gets written to the cache.

    Args:
    url (str): The URL to scrape.
    cache_folder (str): The folder to store cached JSON files.
    model_name (str): The name of the OpenAI model to determine the encoding.

    Returns:
    list: The loaded documents with token count metadata.
    """
    # Ensure the cache folder exists
    os.makedirs(cache_folder, exist_ok=True)

    # Locate the cache entry for this URL
    cache_file_path = os.path.join(cache_folder, get_hashed_filename(url))

    if os.path.exists(cache_file_path):
        # Cache hit: rebuild Document objects from the stored dicts
        with open(cache_file_path, 'r', encoding='utf-8') as f:
            raw_file_data = json.load(f)
        docs = [Document(**doc) for doc in raw_file_data]
    else:
        # Cache miss: scrape the page using FireCrawlLoader
        loader = FireCrawlLoader(url=url, mode="scrape")
        docs = loader.load()

        # Serialize BEFORE touching the disk: if a document cannot be encoded
        # as JSON, nothing is written and the next run simply scrapes again
        # instead of tripping over a truncated cache file.
        payload = json.dumps([doc.dict() for doc in docs])

        # Write to a sibling temp file and rename it into place so that an
        # interrupted write never leaves a partial file at the final path.
        tmp_file_path = cache_file_path + ".tmp"
        with open(tmp_file_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        os.replace(tmp_file_path, cache_file_path)

    # Add token count as metadata to each document
    for doc in docs:
        doc.metadata['token_count'] = num_tokens_from_string(doc.page_content, model_name)

    return docs

# Single-URL constant; identical to the first entry of BLOG_URLS.
# NOTE(review): not referenced by any of the cells visible here - confirm
# whether it is still needed.
BLOG_URL = "https://cookbook.openai.com/examples/structured_outputs_intro"
# Directory (relative to the notebook's working directory) that holds the
# per-URL JSON cache files.
CACHE_FOLDER = "./data/cache"
# Model name handed to tiktoken to choose the encoding used for token counts.
MODEL_NAME = "gpt-4o"



In [4]:
# One record per URL in BLOG_URLS: the URL itself plus the documents obtained
# for it (read from the cache when present, scraped otherwise). tqdm renders
# a progress bar while the list is being built.
blog_docs = [
    dict(
        url=blog_url,
        docs=load_docs_from_cache_or_scrape(blog_url, CACHE_FOLDER, MODEL_NAME),
    )
    for blog_url in tqdm(BLOG_URLS)
]



100%|██████████| 8/8 [00:00<00:00, 38.75it/s]


In [5]:
# Sanity check: for every blog, print its URL followed by the token count
# stored on its first document.
for blog in blog_docs:
    first_doc = blog["docs"][0]
    print(blog["url"], first_doc.metadata['token_count'], sep="\n")

https://cookbook.openai.com/examples/structured_outputs_intro
4410
https://cookbook.openai.com/examples/third_party/web_search_with_google_api_bring_your_own_browser_tool
6217
https://fireworks.ai/blog/firellava-the-first-commercially-permissive-oss-llava-model
2097
https://python.useinstructor.com/examples/exact_citations/
1539
https://tomaugspurger.net/posts/modern-1-intro/
10053
https://www.restack.io/docs/prefect-knowledge-prefect-1-tutorial-guide
5253
https://e2b.dev/blog/guide-groq-js
4164
https://e2b.dev/docs/getting-started/api-key#use-api-key
243


In [6]:
import sys
# Make the local project importable: first its `app` directory, then the
# repository root. Entries that are already on sys.path are left untouched.
for _project_dir in (
    '/Users/nehiljain/code/e2b-hackathon-oct/app',
    '/Users/nehiljain/code/e2b-hackathon-oct/',
):
    if _project_dir not in sys.path:
        sys.path.append(_project_dir)


from app.schemas import BlogCodeRecipe, IsBlogPostTechnical, CodeRecipeDescriptions
from app.llms import model4o
from app.prompts import extract_is_blog_post_technical_prompt, extract_code_metadata_prompt, extract_all_code_recipes_prompt

# Bind the shared `model4o` client to the structured-output schemas.
extract_tech_deets_model = model4o.with_structured_output(IsBlogPostTechnical)
extract_code_deets_model = model4o.with_structured_output(BlogCodeRecipe)

# Each chain pipes a prompt into a schema-bound model.
first_pass_details_chain = extract_is_blog_post_technical_prompt | extract_tech_deets_model
first_pass_code_chain = extract_code_metadata_prompt | extract_code_deets_model
extract_code_recipe_description_chain = (
    extract_all_code_recipes_prompt
    | model4o.with_structured_output(CodeRecipeDescriptions)
)

# Run the two first-pass extractions on the text of the first document of the
# first blog in `blog_docs`.
first_blog_post_text = blog_docs[0]["docs"][0].page_content
metadata_details = first_pass_details_chain.invoke({"blog_post": first_blog_post_text})
code_recipe_descriptions = extract_code_recipe_description_chain.invoke({"blog_post": first_blog_post_text})


In [7]:
# Display the extracted recipe descriptions as plain Python data.
code_recipe_descriptions.model_dump()

{'recipes': [{'title': 'Set up OpenAI API with Python',
   'description': 'Install and import OpenAI with Python using `%pip install openai -U`, `import json`, `from textwrap import dedent`, and `from openai import OpenAI`. Initialize with `client = OpenAI()`.'},
  {'title': 'Build a Math Tutoring Tool with Structured Outputs',
   'description': "Create a math tutoring tool using OpenAI's API to provide step-by-step solutions in JSON schema format. Define prompt and response format, and parse the response to display each step of the solution."},
  {'title': 'Print Math Solution Steps Using IPython',
   'description': "Define a function to parse and display math solution steps using IPython's Math display, iterating through each step and displaying the final answer."},
  {'title': "Use SDK 'parse' Helper with Pydantic",
   'description': "Utilize the SDK 'parse' helper and Pydantic model to define response format for math solutions, allowing automatic parsing of API responses into struc

In [8]:
# Build one chain input per extracted recipe description. Every input carries
# the string form of the description together with the full text of the first
# blog post, so the chain can pull the matching code out of the post.
code_recipe_extraction_inputs = [
    dict(
        code_recipe_description=str(recipe_description),
        blog_post=blog_docs[0]["docs"][0].page_content,
    )
    for recipe_description in code_recipe_descriptions.recipes
]

# Run the code-extraction chain over all inputs as a single batch.
code_details = first_pass_code_chain.batch(code_recipe_extraction_inputs)

In [15]:
# Dump every extracted recipe: its title, then each file's path and content.
# A run of blank lines separates files from one another and recipes from one
# another.
SECTION_GAP = "\n" * 3
for code_detail in code_details:
    print(code_detail.title)
    for code_file in code_detail.code:
        print(code_file.filepath, code_file.content, SECTION_GAP, sep="\n")
    print(SECTION_GAP)
    

Set up OpenAI API with Python
main.py
import json
from textwrap import dedent
from openai import OpenAI

client = OpenAI()

MODEL = "gpt-4o-2024-08-06"

math_tutor_prompt = '''
    You are a helpful math tutor. You will be provided with a math problem,
    and your goal will be to output a step by step solution, along with a final answer.
    For each step, just provide the output as an equation use the explanation field to detail the reasoning.
'''

def get_math_solution(question):
    response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {
            "role": "system",
            "content": dedent(math_tutor_prompt)
        },
        {
            "role": "user",
            "content": question
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "math_reasoning",
            "schema": {
                "type": "object",
                "properties": {
                    "steps": {
         