In [None]:
from os import getenv
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests as rq
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI
from lxml import html

from models.dinosaur import Dinosaur

# Load variables from a local .env file into the process environment before
# anything reads them (presumably this is where GOOGLE_API_KEY comes from — confirm).
load_dotenv()

# Scrape target on nhm.ac.uk: `url` is the listing page, `xpath` selects the
# anchor elements on it, and `domain` is the base that `load` combines with
# each scraped href.
NHM = dict(
    url="https://www.nhm.ac.uk/discover/dino-directory/name/name-az-all/gallery.html",
    xpath="//*[@id='content']/div[2]/div[2]/ul/li/a",
    domain="https://www.nhm.ac.uk/",
)

# THOUGHTCO = {
#     "url": "https://www.thoughtco.com/dinosaurs-a-to-z-1093748",
#     "xpath": "//*[@id='mntl-sc-page_1-0']/p"
# }

# Scrape target on en.wikipedia.org: `url` is the listing page, `xpath` selects
# the anchor elements in the lists that follow the first content table, and
# `domain` is the base that `load` combines with each scraped href.
WIKIPEDIA = dict(
    url="https://en.wikipedia.org/wiki/list_of_dinosaur_genera",
    xpath="//*[@id='mw-content-text']/div[1]/table/following::ul/li/a",
    domain="https://en.wikipedia.org/",
)

def extract(url:str, xpath:str, timeout:float=30.0, **kwargs) -> list:
    """Fetch a page and return ``(link text, href)`` pairs for the matched anchors.

    Args:
        url: Page to download.
        xpath: XPath expression selecting the anchor elements of interest.
        timeout: Seconds to wait for the server before giving up.
        **kwargs: Ignored. Lets callers splat a whole source config dict
            (which also carries e.g. ``"domain"``) straight into this function.

    Returns:
        A list of ``(text, href)`` tuples in document order, where ``text`` is the
        element's lower-cased text content and ``href`` is the raw attribute value
        (not resolved against any base URL). Matched elements without an ``href``
        attribute are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout`` seconds.
    """
    response = rq.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it and returning [].
    response.raise_for_status()

    anchors = html.fromstring(response.content).xpath(xpath)
    return [
        (anchor.text_content().lower(), anchor.get("href"))
        for anchor in anchors
        if anchor.get("href") is not None
    ]

# Gemini client used by the chain. The API key is read from the environment
# and temperature is pinned to 0.
LLM = GoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=getenv("GOOGLE_API_KEY"),
    temperature=0,
)

In [None]:
# Parser that converts the model's reply into a `Dinosaur` object; it also
# supplies the format instructions that fill the prompt's {FORMAT} slot.
parser = PydanticOutputParser(pydantic_object=Dinosaur)

# Two-slot prompt: {URL} is the dinosaur page, {FORMAT} the output instructions.
prompt = PromptTemplate(
    input_variables=["URL", "FORMAT"],
    template="Get all the data related to this dinosaur: {URL}\n{FORMAT}",
)

# Pipeline: render prompt -> call the LLM -> parse the reply.
chain = prompt | LLM | parser

In [None]:
def load(where:dict, how_many:int=10) -> None:
    """Scrape dinosaur links from a source, query the LLM chain for each, and save to Excel.

    Args:
        where: Source config with ``"url"``, ``"xpath"`` and ``"domain"`` keys
            (e.g. ``NHM`` or ``WIKIPEDIA``). ``url``/``xpath`` drive the scrape and
            ``domain`` is the base each scraped href is resolved against.
        how_many: Maximum number of scraped links to process, taken from the
            start of the list.

    Side effects:
        Invokes the chain once per link and writes the flattened results to
        ``data/dinos.xlsx``, creating the ``data`` directory if needed.
    """
    # Scrape the source that was actually requested, so hrefs and domain match.
    dinolist = extract(**where)[:how_many]

    # The format instructions do not depend on the URL; compute them once.
    format_instructions = parser.get_format_instructions()

    def invoke(url:str):
        return chain.invoke({"URL": url, "FORMAT": format_instructions})

    # urljoin avoids the "//" that plain concatenation produces when the domain
    # ends with "/" and the href starts with "/"; absolute hrefs pass through.
    records = [
        invoke(urljoin(where["domain"], href)).model_dump()
        for _, href in dinolist
    ]
    dinodf = pd.json_normalize(records)

    output_path = Path("data/dinos.xlsx")
    # to_excel does not create missing parent directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    dinodf.to_excel(output_path)