## step 1: building the pipeline

In [2]:
from superpipe.steps import LLMStructuredStep, CustomStep, SERPEnrichmentStep
from superpipe.clients import init_openai
from superpipe import models
from pydantic import BaseModel, Field, HttpUrl
import os

# get the openai api] key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

# initialize the opeani client with the API key
if openai_api_key is not None:
    init_openai(openai_api_key)
else:
    print("OPENAI_API_KEY environment variable not set")

# get the api key from the environment variable, or raise an error if it doesn't exist
api_key = os.getenv("SERPAPI_API_KEY")
if api_key is None:
    raise ValueError("SERPAPI_API_KEY environment variable not set")

# step 1: use superpipe's built-in serp enrichment step to search for the person's wikipedia page
# include a unique "name" for the step that will be used to reference this step's output in future steps
search_step = SERPEnrichmentStep(
    prompt=lambda row: f"{row['name']} wikipedia",
    name="search"
)

# step 2: use an llm to extract the wikipedia url from the search results
# first, define a pydantic model that specifies the structured output we want from the llm
class ParseSearchResult(BaseModel):
    wikipedia_url: HttpUrl  # ensures that the extracted url is valid

# adjust the prompt to clearly ask for the wikipedia url extraction
parse_search_step = LLMStructuredStep(
    model=models.gpt4,
    prompt=lambda row: (
        f"Extract the Wikipedia URL for {row['name']} from the following search results: \n\n"
        f"{row['search']}\n\n"
        "Provide the URL in a clear and concise format."
    ),
    out_schema=ParseSearchResult,
    name="parse_search"
)

In [3]:
from superpipe.pipeline import Pipeline
import requests
import html2text
import json

h = html2text.HTML2Text()
h.ignore_links = True

# step 3: we create a customstep that can execute any arbitrary function (transform)
# the function fetches the contents of the wikipedia url and converts them to markdown
fetch_wikipedia_step = CustomStep(
  transform=lambda row: h.handle(requests.get(row['wikipedia_url']).text),
  name="wikipedia"  
)

# step 4: we extract the date of birth, living/dead status and cause of death from the wikipedia contents
class ExtractedData(BaseModel):
    date_of_birth: str = Field(description="The date of birth of the person in the format YYYY-MM-DD")
    alive: bool = Field(description="Whether the person is still alive")
    cause_of_death: str = Field(description="The cause of death of the person. If the person is alive, return 'N/A'")

extract_step = LLMStructuredStep(
  model=models.gpt4,
  prompt= lambda row: f"""Extract the date of birth for {row['name']}, whether they're still alive \
  and if not, their cause of death from the following Wikipedia content: \n\n {row['wikipedia']}""",
  out_schema=ExtractedData,
  name="extract_data"
)

# finally we define and run the pipeline
pipeline = Pipeline([
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  extract_step
])

## step 2: evaluating the pipeline

broken into the folliowing parts:

1. **a dataset with labels** - in this case we need a list of famous people and the true date of birth, living status and cause of death of each person
2. **evaluation function** - a function that defines what "correct" is. We'll use simple comparison for date of birth and living status, and an LLM call to evaluate the correctness of cause of death

In [3]:
import pandas as pd

data = [
 ('Ruth Bader Ginsburg', '1933-03-15', False, 'Pancreatic cancer'),
 ('Bill Gates', '1955-10-28', True, 'N/A'),
 ('Steph Curry', '1988-03-14', True, 'N/A'),
 ('Scott Belsky', '1980-04-18', True, 'N/A'),
 ('Steve Jobs', '1955-02-24', False, 'Respiratory Arrest Related to a Neuroendocrine Tumor'),
 ('Paris Hilton', '1981-02-17', True, 'N/A'),
 ('Kurt Vonnegut', '1922-11-11', False, 'Brain injuries'),
 ('Snoop Dogg', '1971-10-20', True, 'N/A'),
 ('Kobe Bryant', '1978-08-23', False, 'Helicopter crash'),
 ('Aaron Swartz', '1986-11-08', False, 'Suicide'),
 ('Albert Einstein', '1879-03-14', False, 'Aortic Aneurysm'),
 ('Martin Luther King Jr.', '1929-01-15', False, 'Assassination By Firearm'),
 ('Marilyn Monroe', '1926-06-01', False, 'Drug Overdose'),
 ('Leonardo da Vinci', '1452-04-15', False, 'Stroke'),
 ('William Shakespeare', '1564-04-26', False, 'Natural Causes'),
 ('Frida Kahlo', '1907-07-06', False, 'Pulmonary Embolism'),
 ('Vincent van Gogh', '1853-04-30', False, 'Suicide by Gunshot'),
 ('Isaac Newton', '1643-01-04', False, 'Natural Causes'),
 ('Pablo Picasso', '1881-10-25', False, 'Pulmonary Edema & Heart Attack'),
 ('Mahatma Gandhi', '1869-10-02', False, 'Assassination By Firearm'),
 ('Jane Austen', '1775-12-16', False, "Addison'S Disease"),
 ('Charles Darwin', '1809-02-12', False, 'Coronary Thrombosis'),
 ('Wolfgang Amadeus Mozart', '1756-01-27', False, 'Unknown'),
 ('Princess Diana', '1961-07-01', False, 'Car Crash'),
 ('Nelson Mandela', '1918-07-18', False, 'Respiratory Infection'),
 ('Bruce Lee', '1940-11-27', False, 'Cerebral Edema'),
 ('Sigmund Freud', '1856-05-06', False, 'Euthanasia'),
 ('Amelia Earhart', '1897-07-24', False, 'Presumed Dead After Disappearance over the Pacific Ocean.'),
 ('Malcolm X', '1925-05-19', False, 'Assassination'),
 ('Anne Frank', '1929-06-12', False, 'Typhus'),
 ('Galileo Galilei', '1564-02-15', False, 'Fever & Heart Palpitations'),
 ('Charlie Chaplin', '1889-04-16', False, 'Stroke'),
 ('Elvis Presley', '1935-01-08', False, 'Cardiac Arrest '),
 ('Michael Jackson', '1958-08-29', False, 'Propofol Overdose'),
 ('Nikola Tesla', '1856-07-10', False, 'Coronary Thrombosis'),
 ('Florence Nightingale', '1820-05-12', False, 'Natural Causes'),
 ('Edgar Allan Poe', '1809-01-19', False, 'Unknown'),
 ('Marie Curie', '1867-11-07', False, 'Aplastic Anemia'),
 ('Abraham Lincoln', '1809-02-12', False, 'Assassination'),
 ('George Washington', '1732-02-22', False, 'Epiglottitis'),
 ('Ada Lovelace', '1815-12-10', False, 'Uterine Cancer'),
 ('James Dean', '1931-02-08', False, 'Car Crash'),
 ('Tupac Shakur', '1971-06-16', False, 'Murder By Firearm'),
 ('Stepehen Hawking', '1942-01-08', False, 'Motor Nueron Disease/ALS'),
 ('Elon Musk', '1971-06-28', True, 'N/A'),
 ('John Doerr', '1951-06-29', True, 'N/A'),
 ('Harry Stebbings', '1996-06-22', True, 'N/A'),
 ('Cory Booker', '1969-04-27', True, 'N/A'),
 ('Noah Kahan', '1997-01-01', True, 'N/A'),
 ('Sam Altman', '1985-04-22', True, 'N/A')
]
df = pd.DataFrame([{"name": d[0], "dob_label": d[1], "alive_label": d[2], "cause_label": d[3]} for d in data])

class EvalResult(BaseModel):
  result: bool = Field(description="Is the answer correct or not?")

cause_evaluator = LLMStructuredStep(
  model=models.gpt4,
  prompt=lambda row: f"This is the correct cause of death: {row['cause_label']}. Is this provided cause of death accurate? The phrasing might be slightly different. Use your judgement: \n{row['cause_of_death']}",
  out_schema=EvalResult,
  name="cause_evaluator")

def eval_fn(row):
  score = 0
  if row['date_of_birth'] == row['dob_label']:
    score += 0.25
  if row['alive'] == row['alive_label']:
    score += 0.25
  if row['cause_label'] == "N/A":
    if row['cause_of_death'] == "N/A":
      score += 0.5
  elif cause_evaluator.run(row)['result']:
    score += 0.5  
  return score

pipeline.run(df)
print("Score: ", pipeline.evaluate(eval_fn))
df

Applying step search: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]
Applying step parse_search: 100%|██████████| 50/50 [01:49<00:00,  2.20s/it]
Applying step wikipedia: 100%|██████████| 50/50 [00:30<00:00,  1.67it/s]
Applying step extract_data: 100%|██████████| 50/50 [08:23<00:00, 10.07s/it]


Score:  0.935


Unnamed: 0,name,dob_label,alive_label,cause_label,search,__parse_search__,wikipedia_url,wikipedia,__extract_data__,date_of_birth,alive,cause_of_death,__eval_fn__
0,Ruth Bader Ginsburg,1933-03-15,False,Pancreatic cancer,"{""searchParameters"":{""q"":""Ruth Bader Ginsburg ...","{'input_tokens': 1911, 'output_tokens': 23, 's...",https://en.wikipedia.org/wiki/Ruth_Bader_Ginsburg,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 46341, 'output_tokens': 37, '...",1933-03-15,False,complications of metastatic pancreatic cancer,1.0
1,Bill Gates,1955-10-28,True,,"{""searchParameters"":{""q"":""Bill Gates wikipedia...","{'input_tokens': 1623, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Bill_Gates,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 46261, 'output_tokens': 32, '...",1955-10-28,True,,1.0
2,Steph Curry,1988-03-14,True,,"{""searchParameters"":{""q"":""Steph Curry wikipedi...","{'input_tokens': 1287, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Stephen_Curry,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 65024, 'output_tokens': 32, '...",1988-03-14,True,,1.0
3,Scott Belsky,1980-04-18,True,,"{""searchParameters"":{""q"":""Scott Belsky wikiped...","{'input_tokens': 1448, 'output_tokens': 21, 's...",https://en.wikipedia.org/wiki/Scott_Belsky,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 2188, 'output_tokens': 32, 's...",1980-04-18,True,,1.0
4,Steve Jobs,1955-02-24,False,Respiratory Arrest Related to a Neuroendocrine...,"{""searchParameters"":{""q"":""Steve Jobs wikipedia...","{'input_tokens': 1505, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Steve_Jobs,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 47025, 'output_tokens': 42, '...",1955-02-24,False,respiratory arrest related to a pancreatic neu...,1.0
5,Paris Hilton,1981-02-17,True,,"{""searchParameters"":{""q"":""Paris Hilton wikiped...","{'input_tokens': 1295, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Paris_Hilton,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 49218, 'output_tokens': 32, '...",1981-02-17,True,,1.0
6,Kurt Vonnegut,1922-11-11,False,Brain injuries,"{""searchParameters"":{""q"":""Kurt Vonnegut wikipe...","{'input_tokens': 1361, 'output_tokens': 22, 's...",https://en.wikipedia.org/wiki/Kurt_Vonnegut,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 29649, 'output_tokens': 45, '...",1922-11-11,False,"brain injuries incurred several weeks prior, f...",1.0
7,Snoop Dogg,1971-10-20,True,,"{""searchParameters"":{""q"":""Snoop Dogg wikipedia...","{'input_tokens': 1800, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Snoop_Dogg,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 41013, 'output_tokens': 32, '...",1971-10-20,True,,1.0
8,Kobe Bryant,1978-08-23,False,Helicopter crash,"{""searchParameters"":{""q"":""Kobe Bryant wikipedi...","{'input_tokens': 1400, 'output_tokens': 21, 's...",https://en.wikipedia.org/wiki/Kobe_Bryant,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 74099, 'output_tokens': 33, '...",1978-08-23,False,helicopter crash,1.0
9,Aaron Swartz,1986-11-08,False,Suicide,"{""searchParameters"":{""q"":""Aaron Swartz wikiped...","{'input_tokens': 1220, 'output_tokens': 21, 's...",https://en.wikipedia.org/wiki/Aaron_Swartz,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 37532, 'output_tokens': 34, '...",1986-11-08,False,Suicide by hanging,1.0


## step 3: optimizing the pipeline

this pipeline has an accuracy score of 86.5%, but perhaps there's room for improvement on cost and speed. First let's view the cost and latency of each step to figure out which one is the bottleneck.

In [4]:
for step in pipeline.steps:
  print(f"Step {step.name}:")
  print(f"- Latency: {step.statistics.total_latency}")
  print(f"- Cost: {step.statistics.input_cost + step.statistics.output_cost}")

Step search:
- Latency: 0.0
- Cost: 0.0
Step parse_search:
- Latency: 0.0
- Cost: 0.0
Step wikipedia:
- Latency: 0.0
- Cost: 0.0
Step extract_data:
- Latency: 0.0
- Cost: 0.0


Clearly the final step (`extract_data`) is the one responsible for the bulk of the cost and latency. This makes sense, because we're feeding in the entire wikipedia article to GPT-4, one of the most expensive models.

Let's find out if we can get away with a cheaper/faster model. Most models cannot handle the number of tokens needed to ingest a whole wikipedia article, so we'll turn to the two that can that are also cheaper than GPT4: Claude 3 Sonnet and Claude 3 Haiku.

In [5]:
from superpipe.grid_search import GridSearch
from superpipe.clients import init_anthropic
from superpipe.models import claude3_haiku, claude3_sonnet, claude3_opus
from superpipe.steps import LLMStructuredCompositeStep
import os

# get the claude API key from the environment variable
anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]

# Initialize the anthropic client with the API key
if anthropic_api_key is not None:
    init_anthropic(openai_api_key)
else:
    print("ANTHROPIC_API_KEY environment variable not set")

# we need to use LLMStructuredCompositeStep which uses GPT3.5 for structured JSON extraction
# because Claude does not support JSON mode or function calling out of the box
new_extract_step = LLMStructuredCompositeStep(
  model=models.claude3_haiku,
  prompt=extract_step.prompt,
  out_schema=ExtractedData,
  name="extract_data_new"
)

new_pipeline = Pipeline([
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  new_extract_step
], evaluation_fn=eval_fn)

param_grid = {
  new_extract_step.name:{
    "model": [claude3_haiku, claude3_sonnet, claude3_opus]}
}
grid_search = GridSearch(new_pipeline, param_grid)
grid_search.run(df)

Applying step search: 100%|██████████| 50/50 [00:44<00:00,  1.13it/s]
Applying step parse_search:  28%|██▊       | 14/50 [00:28<01:13,  2.03s/it]


KeyboardInterrupt: 

Strangely, Claude 3 Haiku is both more accurate (100% v/s 45%) as well as cheaper and faster. This is suprising, but useful information that we wouldn't have found out unless we built and evaluated pipelines on _our specific data_ rather than benchmark data.

In [None]:
best_params = grid_search.best_params
new_pipeline.update_params(best_params)
new_pipeline.run(df)
print("Score: ", new_pipeline.score)
for step in new_pipeline.steps:
  print(f"Step {step.name}:")
  print(f"- Latency: {step.statistics.total_latency}")
  print(f"- Cost: {step.statistics.input_cost + step.statistics.output_cost}")