## step 1: building the pipeline

In [2]:
from superpipe.steps import LLMStructuredStep, CustomStep, SERPEnrichmentStep
from superpipe.clients import init_openai
from superpipe import models
from pydantic import BaseModel, Field, HttpUrl
import os

# Get the OpenAI API key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI client with the API key
if openai_api_key is not None:
    init_openai(openai_api_key)
else:
    print("OPENAI_API_KEY environment variable not set")


# Get the api key from the environment variable, or raise an error if it doesn't exist
api_key = os.getenv("SERPAPI_API_KEY")
if api_key is None:
    raise ValueError("SERPAPI_API_KEY environment variable not set")



# Step 1: use Superpipe's built-in SERP enrichment step to search for the person's Wikipedia page
# Include a unique "name" for the step that will be used to reference this step's output in future steps

search_step = SERPEnrichmentStep(
    prompt=lambda row: f"{row['name']} wikipedia",
    name="search"
)


# Step 2: Use an LLM to extract the Wikipedia URL from the search results
# First, define a Pydantic model that specifies the structured output we want from the LLM


class ParseSearchResult(BaseModel):
    wikipedia_url: HttpUrl  # Ensures that the extracted URL is valid

# Adjust the prompt to clearly ask for the Wikipedia URL extraction
parse_search_step = LLMStructuredStep(
    model=models.gpt35,
    prompt=lambda row: (
        f"Extract the Wikipedia URL for {row['name']} from the following search results: \n\n"
        f"{row['search']}\n\n"
        "Provide the URL in a clear and concise format."
    ),
    out_schema=ParseSearchResult,
    name="parse_search"
)

In [3]:
from superpipe.pipeline import Pipeline
import requests
import html2text
import json

h = html2text.HTML2Text()
h.ignore_links = True

# Step 3: we create a CustomStep that can execute any arbitrary function (transform)
# The function fetches the contents of the wikipedia url and converts them to markdown

fetch_wikipedia_step = CustomStep(
  transform=lambda row: h.handle(requests.get(row['wikipedia_url']).text),
  name="wikipedia"
)

# Step 4: we extract the date of birth, living/dead status and cause of death from the wikipedia contents

class ExtractedData(BaseModel):
    date_of_birth: str = Field(description="The date of birth of the person in the format YYYY-MM-DD")
    alive: bool = Field(description="Whether the person is still alive")
    cause_of_death: str = Field(description="The cause of death of the person. If the person is alive, return 'N/A'")

extract_step = LLMStructuredStep(
  model=models.gpt4,
  prompt= lambda row: f"""Extract the date of birth for {row['name']}, whether they're still alive \
  and if not, their cause of death from the following Wikipedia content: \n\n {row['wikipedia']}""",
  out_schema=ExtractedData,
  name="extract_data"
)

# Finally we define and run the pipeline

pipeline = Pipeline([
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  extract_step
])

{
  "name": "Jean-Paul Sartre",
  "search": "{\"searchParameters\":{\"q\":\"Jean-Paul Sartre wikipedia\",\"type\":\"search\",\"engine\":\"google\"},\"organic\":[{\"title\":\"Jean-Paul Sartre - Wikipedia\",\"link\":\"https://en.wikipedia.org/wiki/Jean-Paul_Sartre\",\"snippet\":\"Sartre was one of the key figures in the philosophy of existentialism (and phenomenology). His work has influenced sociology, critical theory, post-colonial ...\",\"sitelinks\":[{\"title\":\"Nausea (novel)\",\"link\":\"https://en.wikipedia.org/wiki/Nausea_(novel)\"},{\"title\":\"The Wall (Sartre short story...\",\"link\":\"https://en.wikipedia.org/wiki/The_Wall_(Sartre_short_story_collection)\"},{\"title\":\"Simone de Beauvoir\",\"link\":\"https://en.wikipedia.org/wiki/Simone_de_Beauvoir\"},{\"title\":\"Agr\u00e9gation\",\"link\":\"https://en.wikipedia.org/wiki/Agr%C3%A9gation\"}],\"position\":1},{\"title\":\"Jean-Paul Sartre - Simple English Wikipedia, the free encyclopedia\",\"link\":\"https://simple.wikipedia

## step 2: evaluating the pipeline

broken into the folliowing parts:

1. **a dataset with labels** - in this case we need a list of famous people and the true date of birth, living status and cause of death of each person
2. **evaluation function** - a function that defines what "correct" is. We'll use simple comparison for date of birth and living status, and an LLM call to evaluate the correctness of cause of death

In [4]:
'''('Ruth Bader Ginsburg', '1933-03-15', False, 'Pancreatic cancer'),
 ('Bill Gates', '1955-10-28', True, 'N/A'),
 ('Steph Curry', '1988-03-14', True, 'N/A'),
 ('Scott Belsky', '1980-04-18', True, 'N/A'),
 ('Steve Jobs', '1955-02-24', False, 'Pancreatic tumor/cancer'),
 ('Paris Hilton', '1981-02-17', True, 'N/A'),
 ('Kurt Vonnegut', '1922-11-11', False, 'Brain injuries'),
 ('Snoop Dogg', '1971-10-20', True, 'N/A'),
 ('Kobe Bryant', '1978-08-23', False, 'Helicopter crash'),
 ('Aaron Swartz', '1986-11-08', False, 'Suicide'),
 ('Albert Einstein', '1879-03-14', False, 'Aortic Aneurysm'),
 ('Martin Luther King Jr.', '1929-01-15', False, 'Assassination By Firearm'),
 ('Marilyn Monroe', '1926-06-01', False, 'Drug Overdose'),
 ('Leonardo da Vinci', '1452-04-15', False, 'Stroke'),
 ('William Shakespeare', '1564-04-26', False, 'No cause'),
 ('Frida Kahlo', '1907-07-06', False, 'Pulmonary Embolism'),
 ('Vincent van Gogh', '1853-04-30', False, 'Suicide by Gunshot'),
 ('Isaac Newton', '1642-12-25', False, 'No cause'),
 ('Pablo Picasso', '1881-10-25', False, 'Pulmonary Edema & Heart Attack'),
 ('Mahatma Gandhi', '1869-10-02', False, 'Assassination By Firearm'),
 ('Jane Austen', '1775-12-16', False, "Addison'S Disease"),
 ('Charles Darwin', '1809-02-12', False, 'Angina Pectoris'),
 ('Wolfgang Amadeus Mozart', '1756-01-27', False, 'Severe Miliary Fever'),
 ('Princess Diana', '1961-07-01', False, 'Car Crash'),
 ('Nelson Mandela', '1918-07-18', False, 'Respiratory Infection'),
 ('Bruce Lee', '1940-11-27', False, 'Cerebral Edema'),
 ('Sigmund Freud', '1856-5-6', False, 'Cancer'),
 ('Amelia Earhart', '1897-7-24', False, 'Presumed Dead After Disappearance over the Pacific Ocean.'),
 ('Malcolm X', '1925-05-19', False, 'Assassination'),
 ('Anne Frank', '1929-06-12', False, 'Typhus'),
 ('Galileo Galilei', '1564-02-15', False, 'No cause'),
 ('Charlie Chaplin', '1889-04-16', False, 'Stroke'),
 ('Elvis Presley', '1935-01-08', False, 'Cardiac Arrest '),
 ('Michael Jackson', '1958-08-29', False, 'Cardiac Arrest '),
 ('Nikola Tesla', '1856-07-10', False, 'Coronary Thrombosis'),
 ('Florence Nightingale', '1820-05-12', False, 'No Cause'),
 ('Edgar Allan Poe', '1809-01-19', False, 'Unknown'),
 ('Marie Curie', '1867-11-07', False, 'Aplastic Anemia'),
 ('Abraham Lincoln', '1809-02-12', False, 'Assassination'),
 ('George Washington', '1732-02-22', False, 'Epiglottitis'),
 ('Ada Lovelace', '1815-12-10', False, 'Uterine Cancer'),
 ('James Dean', '1931-02-08', False, 'Car Crash'),
 ('Tupac Shakur', '1971-06-16', False, 'Murder By Firearm'),
 ('Stepehen Hawking', '1942-01-08', False, 'Natural Causes'),
 ('Elon Musk', '1971-06-28', True, 'N/A'),
 ('John Doerr', '1951-06-29', True, 'N/A'),
 ('Harry Stebbings', '1996-06-22', True, 'N/A'),
 ('Cory Booker', '1969-04-27', True, 'N/A'),
 ('Noah Kahan', '1997-01-01', True, 'N/A'),
 ('Sam Altman', '1985-04-22', True, 'N/A')]'''

import pandas as pd

data = [
('Ruth Bader Ginsburg', '1933-03-15', False, 'Pancreatic cancer'),
 ('Bill Gates', '1955-10-28', True, 'N/A'),
 ('Steph Curry', '1988-03-14', True, 'N/A'),
]
df = pd.DataFrame([{"name": d[0], "dob_label": d[1], "alive_label": d[2], "cause_label": d[3]} for d in data])

class EvalResult(BaseModel):
  result: bool = Field(description="Is the answer correct or not?")

cause_evaluator = LLMStructuredStep(
  model=models.gpt4,
  prompt=lambda row: f"This is the correct cause of death: {row['cause_label']}. Is this provided cause of death accurate? The phrasing might be slightly different. Use your judgement: \n{row['cause_of_death']}",
  out_schema=EvalResult,
  name="cause_evaluator")

def eval_fn(row):
  score = 0
  if row['date_of_birth'] == row['dob_label']:
    score += 0.25
  if row['alive'] == row['alive_label']:
    score += 0.25
  if row['cause_label'] == "N/A":
    if row['cause_of_death'] == "N/A":
      score += 0.5
  elif cause_evaluator.run(row)['result']:
    score += 0.5  
  return score

pipeline.run(df)
print("Score: ", pipeline.evaluate(eval_fn))
df

Applying step search:   0%|          | 0/3 [00:00<?, ?it/s]

Applying step search: 100%|██████████| 3/3 [00:03<00:00,  1.06s/it]
Applying step parse_search: 100%|██████████| 3/3 [00:02<00:00,  1.50it/s]
Applying step wikipedia: 100%|██████████| 3/3 [00:01<00:00,  2.25it/s]
Applying step extract_data: 100%|██████████| 3/3 [00:27<00:00,  9.05s/it]


Score:  1.0


Unnamed: 0,name,dob_label,alive_label,cause_label,search,__parse_search__,wikipedia_url,wikipedia,__extract_data__,date_of_birth,alive,cause_of_death,__eval_fn__
0,Ruth Bader Ginsburg,1933-03-15,False,Pancreatic cancer,"{""searchParameters"":{""q"":""Ruth Bader Ginsburg ...","{'input_tokens': 1901, 'output_tokens': 23, 's...",https://en.wikipedia.org/wiki/Ruth_Bader_Ginsburg,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 46341, 'output_tokens': 37, '...",1933-03-15,False,complications of metastatic pancreatic cancer,1.0
1,Bill Gates,1955-10-28,True,,"{""searchParameters"":{""q"":""Bill Gates wikipedia...","{'input_tokens': 1724, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Bill_Gates,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 46261, 'output_tokens': 32, '...",1955-10-28,True,,1.0
2,Steph Curry,1988-03-14,True,,"{""searchParameters"":{""q"":""Steph Curry wikipedi...","{'input_tokens': 1396, 'output_tokens': 20, 's...",https://en.wikipedia.org/wiki/Stephen_Curry,Jump to content\n\nMain menu\n\nMain menu\n\nm...,"{'input_tokens': 64818, 'output_tokens': 32, '...",1988-03-14,True,,1.0


## step 3: optimizing the pipeline

this pipeline has an accuracy score of 86.5%, but perhaps there's room for improvement on cost and speed. First let's view the cost and latency of each step to figure out which one is the bottleneck.

In [5]:
for step in pipeline.steps:
  print(f"Step {step.name}:")
  print(f"- Latency: {step.statistics.total_latency}")
  print(f"- Cost: {step.statistics.input_cost + step.statistics.output_cost}")

Step search:
- Latency: 4.191360712051392
- Cost: 0.0
Step parse_search:
- Latency: 4.446325675999105
- Cost: 0.0033809999999999995
Step wikipedia:
- Latency: 1.655771255493164
- Cost: 0.0
Step extract_data:
- Latency: 29.421274683001684
- Cost: 1.90233


Clearly the final step (`extract_data`) is the one responsible for the bulk of the cost and latency. This makes sense, because we're feeding in the entire wikipedia article to GPT-4, one of the most expensive models.

Let's find out if we can get away with a cheaper/faster model. Most models cannot handle the number of tokens needed to ingest a whole wikipedia article, so we'll turn to the two that can that are also cheaper than GPT4: Claude 3 Sonnet and Claude 3 Haiku.

In [7]:
from superpipe.grid_search import GridSearch
from superpipe.clients import init_anthropic
from superpipe.models import claude3_haiku, claude3_sonnet, claude3_opus
from superpipe.steps import LLMStructuredCompositeStep
import os

# get the claude API key from the environment variable
anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]

# Initialize the anthropic client with the API key
if anthropic_api_key is not None:
    init_anthropic(openai_api_key)
else:
    print("ANTHROPIC_API_KEY environment variable not set")

# we need to use LLMStructuredCompositeStep which uses GPT3.5 for structured JSON extraction
# because Claude does not support JSON mode or function calling out of the box
new_extract_step = LLMStructuredCompositeStep(
  model=models.claude3_haiku,
  prompt=extract_step.prompt,
  out_schema=ExtractedData,
  name="extract_data_new"
)

new_pipeline = Pipeline([
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  new_extract_step
], evaluation_fn=eval_fn)

param_grid = {
  new_extract_step.name:{
    "model": [claude3_haiku, claude3_sonnet, claude3_opus]}
}
grid_search = GridSearch(new_pipeline, param_grid)
grid_search.run(df)

Applying step search: 100%|██████████| 3/3 [00:02<00:00,  1.16it/s]
Applying step parse_search: 100%|██████████| 3/3 [00:05<00:00,  1.83s/it]
Applying step wikipedia: 100%|██████████| 3/3 [00:01<00:00,  2.38it/s]
Applying step extract_data_new: 100%|██████████| 3/3 [00:00<00:00,  4.26it/s]
Applying step search: 100%|██████████| 3/3 [00:02<00:00,  1.09it/s]
Applying step parse_search: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
Applying step wikipedia: 100%|██████████| 3/3 [00:01<00:00,  2.40it/s]
Applying step extract_data_new: 100%|██████████| 3/3 [00:00<00:00,  4.14it/s]
Applying step search: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]
Applying step parse_search: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]
Applying step wikipedia: 100%|██████████| 3/3 [00:01<00:00,  2.36it/s]
Applying step extract_data_new: 100%|██████████| 3/3 [00:00<00:00,  4.02it/s]
  styler = styler.applymap(


Unnamed: 0,extract_data_new__model,score,input_cost,output_cost,total_latency,input_tokens,output_tokens,num_success,num_failure,index
0,claude-3-haiku-20240307,1.0,0.002271,9.4e-05,9.336668,"defaultdict(, {'gpt-3.5-turbo-0125': 4542, 'claude-3-haiku-20240307': 0})","defaultdict(, {'gpt-3.5-turbo-0125': 63, 'claude-3-haiku-20240307': 0})",0,3,3791042982912507671
1,claude-3-sonnet-20240229,1.0,0.002474,9.4e-05,8.099901,"defaultdict(, {'gpt-3.5-turbo-0125': 4948, 'claude-3-sonnet-20240229': 0})","defaultdict(, {'gpt-3.5-turbo-0125': 63, 'claude-3-sonnet-20240229': 0})",0,3,3303757701480512865
2,claude-3-opus-20240229,1.0,0.002419,9.4e-05,5.857355,"defaultdict(, {'gpt-3.5-turbo-0125': 4839, 'claude-3-opus-20240229': 0})","defaultdict(, {'gpt-3.5-turbo-0125': 63, 'claude-3-opus-20240229': 0})",0,3,-8345258512936141949


Strangely, Claude 3 Haiku is both more accurate (100% v/s 45%) as well as cheaper and faster. This is suprising, but useful information that we wouldn't have found out unless we built and evaluated pipelines on _our specific data_ rather than benchmark data.

In [8]:
best_params = grid_search.best_params
new_pipeline.update_params(best_params)
new_pipeline.run(df)
print("Score: ", new_pipeline.score)
for step in new_pipeline.steps:
  print(f"Step {step.name}:")
  print(f"- Latency: {step.statistics.total_latency}")
  print(f"- Cost: {step.statistics.input_cost + step.statistics.output_cost}")

Applying step search: 100%|██████████| 3/3 [00:02<00:00,  1.20it/s]
Applying step parse_search: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Applying step wikipedia: 100%|██████████| 3/3 [00:01<00:00,  2.32it/s]
Applying step extract_data_new: 100%|██████████| 3/3 [00:00<00:00,  3.68it/s]


Score:  1.0
Step search:
- Latency: 2.5014584064483643
- Cost: 0.0
Step parse_search:
- Latency: 3.8747405909998633
- Cost: 0.002477
Step wikipedia:
- Latency: 1.2917284965515137
- Cost: 0.0
Step extract_data_new:
- Latency: 0.0
- Cost: 0.0
