## step 1: building the pipeline

In [1]:
from superpipe.steps import LLMStructuredStep, CustomStep, SERPEnrichmentStep
from superpipe.clients import init_openai
from superpipe import models
from pydantic import BaseModel, Field
import os

# OpenAI credential: read it from the environment and, when present, hand it to
# Superpipe's client initialiser. A missing key is only reported here, not fatal.
openai_api_key = os.environ.get("OPENAI_API_KEY")

if openai_api_key is None:
    print("OPENAI_API_KEY environment variable not set")
else:
    init_openai(openai_api_key)


# SERP credential: mandatory, so stop the cell right away when it is absent.
# NOTE(review): `api_key` is not passed to anything in this cell — presumably the
# SERP step reads the environment on its own; confirm.
api_key = os.environ.get("SERPAPI_API_KEY")
if api_key is None:
    raise ValueError("SERPAPI_API_KEY environment variable not set")



# Step 1: look the person up on the web with Superpipe's built-in SERP enrichment step.
# The step gets a unique "name"; later steps read this step's output from the row under that key.

def _wikipedia_search_query(row):
    """Build the search query for one input row: the person's name followed by "wikipedia"."""
    return "{} wikipedia".format(row['name'])


search_step = SERPEnrichmentStep(
    name="search",
    prompt=_wikipedia_search_query,
)


# Step 2: Use an LLM to extract the Wikipedia URL from the search results
# First, define a Pydantic model that specifies the structured output we want from the LLM
# The model is handed to the LLM step as its `out_schema`; its single field,
# `wikipedia_url`, is the key the page-fetching step later reads from the row.
# NOTE(review): no class docstring on purpose — pydantic would include it in the
# generated schema, which would alter what is sent to the model; confirm before adding one.

class ParseSearchResult(BaseModel):
    # The description text is attached to the field through pydantic's Field(...).
    wikipedia_url: str = Field(description="The URL of the Wikipedia page for the person")

# With the schema defined, configure the built-in LLMStructuredStep with a model and a prompt.
# The prompt callable receives the row, which holds the input fields plus earlier steps' outputs.

def _parse_search_prompt(row):
    """Ask the model to pick the person's Wikipedia URL out of the raw search results."""
    return (
        f"Extract the Wikipedia URL for {row['name']} from the following search results: "
        f"\n\n {row['search']}"
    )


parse_search_step = LLMStructuredStep(
    name="parse_search",
    model=models.gpt35,
    out_schema=ParseSearchResult,
    prompt=_parse_search_prompt,
)

In [2]:
# Test one: run the search step on its own for a single sample person.

search_output = search_step.run(dict(name="Jean-Paul Sartre"))
print(f"Search Output: {search_output}")


Search Output: {'name': 'Jean-Paul Sartre', 'search': '{"searchParameters":{"q":"Jean-Paul Sartre wikipedia","type":"search","engine":"google"},"organic":[{"title":"Jean-Paul Sartre - Wikipedia","link":"https://en.wikipedia.org/wiki/Jean-Paul_Sartre","snippet":"Sartre was one of the key figures in the philosophy of existentialism (and phenomenology). His work has influenced sociology, critical theory, post-colonial ...","sitelinks":[{"title":"Nausea (novel)","link":"https://en.wikipedia.org/wiki/Nausea_(novel)"},{"title":"The Wall (Sartre short story...","link":"https://en.wikipedia.org/wiki/The_Wall_(Sartre_short_story_collection)"},{"title":"Simone de Beauvoir","link":"https://en.wikipedia.org/wiki/Simone_de_Beauvoir"},{"title":"Agrégation","link":"https://en.wikipedia.org/wiki/Agr%C3%A9gation"}],"position":1},{"title":"Jean-Paul Sartre - Simple English Wikipedia, the free encyclopedia","link":"https://simple.wikipedia.org/wiki/Jean-Paul_Sartre","snippet":"Jean-Paul Charles Aymard Sa

In [3]:
# Test two: feed the search step's result row into the URL-parsing step.

parse_search_output = parse_search_step.run(search_output)
print(f"Parse Search Result Output: {parse_search_output}")

Parse Search Result Output: {'name': 'Jean-Paul Sartre', 'search': '{"searchParameters":{"q":"Jean-Paul Sartre wikipedia","type":"search","engine":"google"},"organic":[{"title":"Jean-Paul Sartre - Wikipedia","link":"https://en.wikipedia.org/wiki/Jean-Paul_Sartre","snippet":"Sartre was one of the key figures in the philosophy of existentialism (and phenomenology). His work has influenced sociology, critical theory, post-colonial ...","sitelinks":[{"title":"Nausea (novel)","link":"https://en.wikipedia.org/wiki/Nausea_(novel)"},{"title":"The Wall (Sartre short story...","link":"https://en.wikipedia.org/wiki/The_Wall_(Sartre_short_story_collection)"},{"title":"Simone de Beauvoir","link":"https://en.wikipedia.org/wiki/Simone_de_Beauvoir"},{"title":"Agrégation","link":"https://en.wikipedia.org/wiki/Agr%C3%A9gation"}],"position":1},{"title":"Jean-Paul Sartre - Simple English Wikipedia, the free encyclopedia","link":"https://simple.wikipedia.org/wiki/Jean-Paul_Sartre","snippet":"Jean-Paul Char

In [4]:
# Test three: feed the parsed row into the Wikipedia-fetching step.
# NOTE: `fetch_wikipedia_step` is defined in a LATER cell of this notebook, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.

fetch_wikipedia_output = fetch_wikipedia_step.run(parse_search_output)
print(f"Fetch Wikipedia Output: {fetch_wikipedia_output}")

NameError: name 'fetch_wikipedia_step' is not defined

In [None]:
from superpipe.pipeline import Pipeline
import requests
import html2text
import json

# Shared HTML -> Markdown converter, configured to leave hyperlinks out of the output.
h = html2text.HTML2Text()
h.ignore_links = True

# Upper bound (seconds) for the page download; without one, requests can wait indefinitely.
WIKIPEDIA_REQUEST_TIMEOUT = 30

# Step 3: we create a CustomStep that can execute any arbitrary function (transform)
# The function fetches the contents of the wikipedia url and converts them to markdown

def _fetch_wikipedia_markdown(row):
    """Download the page referenced by ``row['wikipedia_url']`` and return it as Markdown.

    Args:
        row: Subscriptable row expected to carry a ``wikipedia_url`` entry.

    Returns:
        The response body converted to Markdown text by the shared converter ``h``.

    Raises:
        KeyError: if the row has no ``wikipedia_url`` entry. The exception type is
            the same as a plain lookup would raise; only the message is more helpful.
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an error
            page is never converted and passed on as if it were the article.
        requests.RequestException: for other request failures, including timeouts.
    """
    try:
        url = row['wikipedia_url']
    except KeyError:
        # NOTE(review): the recorded run of this notebook failed exactly here, i.e. the
        # parse_search step did not add the field — presumably its LLM call failed; confirm.
        raise KeyError(
            "'wikipedia_url' is missing from the row; the parse_search step did not "
            "produce a URL, so there is nothing to fetch"
        ) from None

    response = requests.get(url, timeout=WIKIPEDIA_REQUEST_TIMEOUT)
    response.raise_for_status()
    return h.handle(response.text)


fetch_wikipedia_step = CustomStep(
  transform=_fetch_wikipedia_markdown,
  name="wikipedia"
)

# Step 4: we extract the date of birth, living/dead status and cause of death from the wikipedia contents
# This model is the `out_schema` of the extraction step. Its field names are the
# keys the evaluation function later compares against the labelled dataset.
# NOTE(review): no class docstring on purpose — pydantic would include it in the
# generated schema, which would alter what is sent to the model; confirm before adding one.

class ExtractedData(BaseModel):
    # Requested as a string in YYYY-MM-DD form; evaluation compares it with the label by equality.
    date_of_birth: str = Field(description="The date of birth of the person in the format YYYY-MM-DD")
    # Boolean living/dead status.
    alive: bool = Field(description="Whether the person is still alive")
    # The literal 'N/A' is the sentinel for living people; evaluation checks for exactly that string.
    cause_of_death: str = Field(description="The cause of death of the person. If the person is alive, return 'N/A'")

def _extract_prompt(row):
    """Build the extraction prompt: the instructions followed by the article text.

    The run of three spaces between "alive" and "and" reproduces the original
    prompt text exactly, so the model receives the same input as before.
    """
    instructions = (
        f"Extract the date of birth for {row['name']}, whether they're still alive "
        "  and if not, their cause of death from the following Wikipedia content: "
    )
    return f"{instructions}\n\n {row['wikipedia']}"


# Step 4 proper: GPT-4 reads the Markdown article and fills in ExtractedData.
extract_step = LLMStructuredStep(
  name="extract_data",
  model=models.gpt4,
  out_schema=ExtractedData,
  prompt=_extract_prompt,
)

# Finally, chain the four steps in execution order and run them on one sample person.

pipeline_steps = [
  search_step,           # web search for the person's Wikipedia page
  parse_search_step,     # LLM picks the Wikipedia URL out of the results
  fetch_wikipedia_step,  # download the page and convert it to Markdown
  extract_step,          # LLM extracts the structured facts
]
pipeline = Pipeline(pipeline_steps)

output = pipeline.run(dict(name="Jean-Paul Sartre"))
print(json.dumps(output, indent=2))


KeyError: 'wikipedia_url'

## step 2: evaluating the pipeline [TBD]

broken into the following parts:

1. **a dataset with labels** - in this case we need a list of famous people and the true date of birth, living status and cause of death of each person
2. **evaluation function** - a function that defines what "correct" is. We'll use simple comparison for date of birth and living status, and an LLM call to evaluate the correctness of cause of death

In [None]:
import pandas as pd

# Labelled evaluation set: one tuple per person, in the order given by LABEL_COLUMNS.
LABEL_COLUMNS = ["name", "dob_label", "alive_label", "cause_label"]

data = [
  ("Ruth Bader Ginsburg", "1933-03-15", False, "Pancreatic cancer"),
  ("Bill Gates", "1955-10-28", True, "N/A"),
  ("Steph Curry", "1988-03-14", True, "N/A"),
  ("Scott Belsky", "1980-04-18", True, "N/A"),
  ("Steve Jobs", "1955-02-24", False, "Pancreatic tumor/cancer"),
  ("Paris Hilton", "1981-02-17", True, "N/A"),
  ("Kurt Vonnegut", "1922-11-11", False, "Brain injuries"),
  ("Snoop Dogg", "1971-10-20", True, "N/A"),
  ("Kobe Bryant", "1978-08-23", False, "Helicopter crash"),
  ("Aaron Swartz", "1986-11-08", False, "Suicide"),
]

# Let pandas map each tuple position onto its column name.
df = pd.DataFrame(data, columns=LABEL_COLUMNS)

# Structured verdict returned by the LLM-based cause-of-death evaluator below.
# NOTE(review): no class docstring on purpose — pydantic would include it in the
# generated schema, which would alter what is sent to the model; confirm before adding one.
class EvalResult(BaseModel):
  # Single boolean verdict; the scoring function reads it from the evaluator's output as 'result'.
  result: bool = Field(description="Is the answer correct or not?")

def _cause_eval_prompt(row):
  """Ask the model whether the extracted cause of death matches the labelled one."""
  question = (
    f"This is the correct cause of death: {row['cause_label']}. "
    "Is this provided cause of death accurate? "
    "The phrasing might be slightly different. Use your judgement: "
  )
  return f"{question}\n{row['cause_of_death']}"


# LLM judge used only for the cause-of-death comparison, where wording may differ from the label.
cause_evaluator = LLMStructuredStep(
  name="cause_evaluator",
  model=models.gpt4,
  out_schema=EvalResult,
  prompt=_cause_eval_prompt,
)

def eval_fn(row):
  """Score one pipeline result row against its labels.

  Points: 0.25 for a matching date of birth, 0.25 for a matching living status,
  0.5 for a correct cause of death. When the label is "N/A" the extracted cause
  must be exactly "N/A"; otherwise the LLM-based `cause_evaluator` decides.

  Returns the sum of the earned points (plain 0 when nothing matched).
  """
  dob_ok = bool(row['date_of_birth'] == row['dob_label'])
  alive_ok = bool(row['alive'] == row['alive_label'])

  # The LLM judge is consulted only when the label names an actual cause of death.
  if row['cause_label'] == "N/A":
    cause_ok = bool(row['cause_of_death'] == "N/A")
  else:
    cause_ok = bool(cause_evaluator.run(row)['result'])

  weighted_checks = ((0.25, dob_ok), (0.25, alive_ok), (0.5, cause_ok))
  return sum(points for points, passed in weighted_checks if passed)

# Run the pipeline over every labelled row, score the results with eval_fn, then show the frame.
pipeline.run(df)
print(f"Score:  {pipeline.evaluate(eval_fn)}")
df

## step 3: optimizing the pipeline

this pipeline has an accuracy score of 100%, but perhaps there's room for improvement on cost and speed. First let's view the cost and latency of each step to figure out which one is the bottleneck.

In [None]:
# Report each step's total latency and combined (input + output) cost.
for step in pipeline.steps:
  print(f"Step {step.name}:")
  stats = step.statistics
  print(f"- Latency: {stats.total_latency}")
  total_cost = stats.input_cost + stats.output_cost
  print(f"- Cost: {total_cost}")

Clearly the final step (`extract_data`) is the one responsible for the bulk of the cost and latency. This makes sense, because we're feeding in the entire wikipedia article to GPT-4, one of the most expensive models.

Let's find out if we can get away with a cheaper/faster model. Most models cannot handle the number of tokens needed to ingest a whole Wikipedia article, so we'll turn to two that can and that are also cheaper than GPT-4: Claude 3 Sonnet and Claude 3 Haiku.

In [None]:
from superpipe.grid_search import GridSearch
from superpipe.models import claude3_haiku, claude3_sonnet
from superpipe.steps import LLMStructuredCompositeStep

# Claude does not support JSON mode or function calling out of the box, so the
# extraction goes through LLMStructuredCompositeStep, which uses GPT3.5 for the
# structured JSON extraction. The prompt is reused unchanged from `extract_step`.
new_extract_step = LLMStructuredCompositeStep(
  name="extract_data_new",
  model=claude3_haiku,
  out_schema=ExtractedData,
  prompt=extract_step.prompt,
)

# Same first three steps as before; only the final extraction step is swapped out.
# The evaluation function is attached so the pipeline can be scored during the search.
new_pipeline_steps = [
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  new_extract_step,
]
new_pipeline = Pipeline(new_pipeline_steps, evaluation_fn=eval_fn)

# Search space: vary only the model of the new extraction step, keyed by the step's name.
candidate_models = [claude3_haiku, claude3_sonnet]
param_grid = {new_extract_step.name: dict(model=candidate_models)}

grid_search = GridSearch(new_pipeline, param_grid)
grid_search.run(df)

Strangely, Claude 3 Haiku is both more accurate (100% vs. 45%) and cheaper and faster. This is surprising, but it is useful information that we wouldn't have found out unless we built and evaluated pipelines on _our specific data_ rather than benchmark data.

In [None]:
# Apply the winning grid-search configuration, re-run on the labelled set, and report.
best_params = grid_search.best_params
new_pipeline.update_params(best_params)
new_pipeline.run(df)
print(f"Score:  {new_pipeline.score}")

# Per-step total latency and combined (input + output) cost for the tuned pipeline.
for step in new_pipeline.steps:
  print(f"Step {step.name}:")
  stats = step.statistics
  print(f"- Latency: {stats.total_latency}")
  total_cost = stats.input_cost + stats.output_cost
  print(f"- Cost: {total_cost}")