In [1]:
# part 1: building the pipeline

from superpipe.steps import LLMStructuredStep, CustomStep, SERPEnrichmentStep
from superpipe import models
from pydantic import BaseModel, Field

# Step 1: use Superpipe's built-in SERP enrichment step to search for the person's Wikipedia page
# Include a unique "name" for the step that will be used to reference this step's output in future steps

def _wikipedia_search_query(row):
  """Build the search query for a row: the person's name followed by 'wikipedia'."""
  return f"{row['name']} wikipedia"

# The name "search" is the key the parse step's prompt reads via row['search'].
search_step = SERPEnrichmentStep(
  name="search",
  prompt=_wikipedia_search_query,
)

# Step 2: Use an LLM to extract the wikipedia URL from the search results
# First, define a Pydantic model that specifies the structured output we want from the LLM

# Structured output requested from the LLM when parsing the search results.
# (Documented with comments rather than a class docstring so the schema
# generated from this model is left exactly as it was.)
class ParseSearchResult(BaseModel):
    wikipedia_url: str = Field(
        description="The URL of the Wikipedia page for the person",
    )

# Then we use the built-in LLMStructuredStep and specify a model and a prompt
# The prompt is a function that has access to all the fields in the input as well as the outputs of previous steps

def _parse_search_prompt(row):
  """Build the prompt asking the LLM to pick the person's Wikipedia URL out of row['search']."""
  return f"Extract the Wikipedia URL for {row['name']} from the following search results: \n\n {row['search']}"

parse_search_step = LLMStructuredStep(
  name="parse_search",
  model=models.gpt35,
  out_schema=ParseSearchResult,
  prompt=_parse_search_prompt,
)

In [1]:
from superpipe.pipeline import Pipeline
import requests
import html2text
import json

# HTML -> Markdown converter used by the fetch step below.
# Requires the third-party `html2text` package to be installed in the kernel's environment.
h = html2text.HTML2Text()
h.ignore_links = True  # html2text option: do not emit link markup in the converted text

# Step 3: we create a CustomStep that can execute any arbitrary function (transform)
# The function fetches the contents of the Wikipedia URL and converts them to markdown

# Upper bound, in seconds, on the HTTP request. Without a timeout `requests.get`
# can block indefinitely on an unresponsive server and hang the whole pipeline.
WIKIPEDIA_FETCH_TIMEOUT_S = 30

def _fetch_wikipedia_markdown(row):
  """Download the page at row['wikipedia_url'] and return its contents as markdown.

  Args:
    row: mapping holding the pipeline fields; must contain 'wikipedia_url'
      (presumably the field produced by the parse-search step).

  Returns:
    The page HTML converted to markdown text by the shared `h` converter.

  Raises:
    requests.exceptions.Timeout: if the server does not respond in time.
    requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
  """
  response = requests.get(row['wikipedia_url'], timeout=WIKIPEDIA_FETCH_TIMEOUT_S)
  # Fail loudly on error responses instead of converting an error page to
  # markdown and handing it to the (paid) extraction step as if it were the article.
  response.raise_for_status()
  return h.handle(response.text)

fetch_wikipedia_step = CustomStep(
  transform=_fetch_wikipedia_markdown,
  name="wikipedia"
)

# Step 4: we extract the date of birth, living/dead status and cause of death from the wikipedia contents

# Structured output requested from the LLM for the final extraction step.
# Field order and descriptions are part of the schema and are kept as-is.
class ExtractedData(BaseModel):
  date_of_birth: str = Field(
    description="The date of birth of the person in the format YYYY-MM-DD",
  )
  alive: bool = Field(
    description="Whether the person is still alive",
  )
  cause_of_death: str = Field(
    description="The cause of death of the person. If the person is alive, return 'N/A'",
  )

def _extraction_prompt(row):
  """Build the prompt asking for date of birth, alive status and cause of death from row['wikipedia']."""
  # NOTE: the backslash joins the two source lines into one prompt line, but the
  # two leading spaces of the continuation line are part of the string literal.
  return f"""Extract the date of birth for {row['name']}, whether they're still alive \
  and if not, their cause of death from the following Wikipedia content: \n\n {row['wikipedia']}"""

extract_step = LLMStructuredStep(
  name="extract_data",
  model=models.gpt4,
  out_schema=ExtractedData,
  prompt=_extraction_prompt,
)

# Finally we define and run the pipeline

# Steps are listed in dependency order: each one reads fields produced by earlier steps
# (search -> parse_search -> wikipedia -> extract_data).
pipeline_steps = [
  search_step,
  parse_search_step,
  fetch_wikipedia_step,
  extract_step,
]
pipeline = Pipeline(pipeline_steps)

# Run the pipeline on a single input and pretty-print the result as JSON.
person = {"name": "Jean-Paul Sartre"}
output = pipeline.run(person)
print(json.dumps(output, indent=2))

ModuleNotFoundError: No module named 'html2text'