In [4]:
import os
from dotenv import load_dotenv
import openai

# Let python-dotenv populate the environment first, then pass the OpenAI key
# to the client. The lookup yields None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [9]:
from llama_index.llms import OpenAI
from llama_index.response.pprint_utils import pprint_response
from llama_index.tools import QueryEngineTool

In [3]:
from guidance.models import OpenAI as GuidanceOpenAI
from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator
from llama_index.tools import ToolMetadata
from llama_index import QueryBundle

In [None]:
#dont run

# Metadata for the two tools offered to the question generator; each entry is
# built from a (name, description) pair.
tools = [
    ToolMetadata(name=tool_name, description=tool_description)
    for tool_name, tool_description in (
        ("lyft_10k", "Provides information about Lyft financials for year 2021"),
        ("uber_10k", "Provides information about Uber financials for year 2021"),
    )
]

# Build the guidance-wrapped OpenAI model first, then hand it to the
# sub-question generator (verbose output disabled).
guidance_llm = GuidanceOpenAI(model="text-davinci-003")
question_gen = GuidanceQuestionGenerator.from_defaults(
    guidance_llm=guidance_llm,
    verbose=False,
)


# sub_questions = question_gen.generate(
#     tools=tools,
#     query=QueryBundle("Compare and contrast apple and google financials for year 2023")
# )

In [6]:
from llama_index.query_engine import SubQuestionQueryEngine

In [None]:
#dont run

# Placeholders only: these strings stand in for real query engines. Get or
# create an index and build the actual engines from it before using them.
query_engine_1, query_engine_2 = "query_engine_1", "query_engine_2"

# Construct the tool list: every engine is wrapped in a QueryEngineTool whose
# metadata carries the tool's name and description.
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(name=tool_name, description=tool_description),
    )
    for engine, tool_name, tool_description in (
        (query_engine_1, "query_tool_1", "blah blah"),
        (query_engine_2, "query_tool_2", "blah blah blah"),
    )
]

# Create the sub-question query engine from the tool list and the question
# generator defined in an earlier cell.
sub_ques_eng = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools, question_gen=question_gen
)

## response = sub_ques_eng.query("mio-mao")

#### DataFrame Structured data extraction

In [7]:
from llama_index.program import (
    OpenAIPydanticProgram,
    GuidancePydanticProgram,
    DFRowsProgram,
    DFFullProgram,
    DataFrameRowsOnly,
    DataFrame
)

In [12]:
# Prompt sent to the model; {input_str} is filled in when the program is called.
extraction_prompt = (
    "Please extract the following query into structured data according"
    " to: {input_str}. Please extract both the set of column names and a"
    " set of rows"
)

# Pydantic program whose output class is DataFrame, backed by a
# temperature-0 GPT-4 model, with verbose logging enabled.
program = OpenAIPydanticProgram.from_defaults(
    prompt_template_str=extraction_prompt,
    llm=OpenAI(temperature=0, model="gpt-4-0613"),
    output_cls=DataFrame,
    verbose=True,
)

# Free-text description of four people; the whitespace inside the literal is
# part of the input and is kept exactly as written.
people_text = """My name is John and I am 25 years old. I live in 
        New York and I like to play basketball. His name is 
        Mike and he is 30 years old. He lives in San Francisco 
        and he likes to play baseball. Sarah is 20 years old 
        and she lives in Los Angeles. She likes to play tennis.
        Her name is Mary and she is 35 years old. 
        She lives in Chicago."""

# Run the extraction program on the text.
response_obj = program(input_str=people_text)


In [15]:
# Show the extracted result in tabular form (last expression is the cell output).
response_obj.to_df()

Unnamed: 0,Name,Age,City,Hobby
0,John,25,New York,Basketball
1,Mike,30,San Francisco,Baseball
2,Sarah,20,Los Angeles,Tennis
3,Mary,35,Chicago,Not specified


#### DataFrame Programs

In [19]:
import pandas as pd

# Empty frame that only declares the target columns; every column is created
# from an empty Series with dtype "str".
df = pd.DataFrame(
    {
        column: pd.Series(dtype="str")
        for column in ("name", "age", "city", "fav sport")
    }
)

# Rows-only extraction program: `df` supplies the column schema and
# OpenAIPydanticProgram is the program class used underneath.
df_rows_program = DFRowsProgram.from_defaults(
    df=df, pydantic_program_cls=OpenAIPydanticProgram
)

# Free-text description of four people; the whitespace inside the literal is
# part of the input and is kept exactly as written.
rows_input_text = """My name is John and I am 25 years old. I live in 
        New York and I like to play basketball. His name is 
        Mike and he is 30 years old. He lives in San Francisco 
        and he likes to play baseball. Sarah is 20 years old 
        and she lives in Los Angeles. She likes to play tennis.
        Her name is Mary and she is 35 years old. 
        She lives in Chicago."""

# Extract rows matching the columns declared on `df`.
result_obj = df_rows_program(input_str=rows_input_text)

In [29]:
# Show the extracted rows in tabular form (last expression is the cell output).
result_obj.to_df()

Unnamed: 0,0,1,2,3
0,John,25,New York,basketball
1,Mike,30,San Francisco,baseball
2,Sarah,20,Los Angeles,tennis
3,Mary,35,Chicago,


Check out Evaporate!

#### Get pydantic outputs from query engines

In [30]:
from typing import List
from pydantic import BaseModel

# Pydantic model passed as `output_cls` to the query engine further down in
# this notebook, so answers come back with these three fields.
# NOTE(review): the docstring and field names presumably feed the schema shown
# to the LLM — treat them as runtime text and change them deliberately.
class Biography(BaseModel):
    """Data model for a biography."""

    # Name, as a single string.
    name: str
    # List of strings, one per thing the subject is best known for.
    best_known_for: List[str]
    # Any additional information, as a single string.
    extra_info: str

In [37]:
#dont run
from llama_index import (
    load_index_from_storage,
    load_indices_from_storage,
    StorageContext
)

# Rebuild the storage context from the persisted directory, then load the
# index stored under the id "graph_of_thoughts".
storage_context = StorageContext.from_defaults(persist_dir="./research_paper_index")
index = load_index_from_storage(
    storage_context=storage_context,
    index_id="graph_of_thoughts",
)

# Query engine over the loaded index that uses the "tree_summarize" response
# mode and returns answers structured as Biography objects.
# The keyword must be spelled `response_mode`; a misspelled keyword is not
# applied as the response mode, so the requested mode would not take effect.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    output_cls=Biography,
)

# response = query_engine.query("Who is Paul Graham?")