In [1]:
pip install langchain langchain-anthropic langchain-community firecrawl-py

Collecting langchain
  Downloading langchain-0.3.2-py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 323 kB/s eta 0:00:01     |████████████▊                   | 399 kB 323 kB/s eta 0:00:02
[?25hCollecting langchain-anthropic
  Downloading langchain_anthropic-0.2.3-py3-none-any.whl (21 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.1-py3-none-any.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 1.2 MB/s eta 0:00:01
[?25hCollecting firecrawl-py
  Downloading firecrawl_py-1.2.4-py3-none-any.whl (15 kB)
Collecting SQLAlchemy<3,>=1.4
  Downloading SQLAlchemy-2.0.35-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 73 kB/s eta 0:00:011
[?25hCollecting tenacity!=8.4.0,<9.0.0,>=8.1.0
  Using cached tenacity-8.5.0-py3-none-any.whl (28 kB)
Collecting PyYAML>=5.3
  Using cached PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl (172 kB)
Collecting numpy<2,>=1
  Using cached numpy-1.26.4-cp39-cp

In [None]:
# Credentials are never stored in the notebook. Each key is taken from the
# process environment, or prompted for (without echo) when it is missing, and
# then exported to os.environ because the later cells read the keys from there.
# NOTE(security): any key that was ever committed as a literal in this
# notebook must be treated as leaked and revoked with the provider.
import os
from getpass import getpass


def _require_secret(name: str) -> str:
    """Return the secret stored under ``name`` and make sure it is exported.

    Args:
        name: Environment variable that holds (or should hold) the secret.

    Returns:
        The non-empty secret value.

    Raises:
        RuntimeError: If the variable is unset and nothing was entered at
            the prompt.
    """
    value = os.environ.get(name) or getpass(f"{name}: ")
    if not value:
        raise RuntimeError(f"{name} is required but was not provided")
    # Export so that os.environ[name] lookups elsewhere succeed.
    os.environ[name] = value
    return value


ANTHROPIC_API_KEY = _require_secret("ANTHROPIC_API_KEY")
FIRECRAWL_API_KEY = _require_secret("FIRECRAWL_API_KEY")

In [None]:
import os
from typing import List, Dict, Any
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import FireCrawlLoader


# Chat model shared by the cells below. The Anthropic key is looked up in
# the process environment when this cell runs, so an unset
# ANTHROPIC_API_KEY variable raises KeyError here.
llm_generation_settings = {
    "model": "claude-3-5-sonnet-20240620",
    "temperature": 0.2,
    "max_tokens": 1024,
}
llm = ChatAnthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    **llm_generation_settings,
)

In [None]:
def get_planet_description(url: str):
    """Scrape one page through FireCrawl and return the first loaded document.

    Args:
        url: Address of the page to scrape.

    Returns:
        The first element of the list produced by ``FireCrawlLoader.load()``.

    Raises:
        KeyError: If ``FIRECRAWL_API_KEY`` is not set in the environment.
        IndexError: If the loader returns no documents.
    """
    scraper = FireCrawlLoader(
        url=url,
        mode="scrape",
        api_key=os.environ["FIRECRAWL_API_KEY"],
    )
    documents = scraper.load()
    return documents[0]

In [None]:
from enum import Enum
from langchain_core.pydantic_v1 import BaseModel, Field, HttpUrl
from typing import List, Optional

# Spectral classification of a star. The str mixin makes every member
# compare equal to its label (e.g. StarType.G == "G-type").
class StarType(str, Enum):
    O = "O-type"  # hot, blue
    B = "B-type"  # blue
    A = "A-type"  # white-blue
    F = "F-type"  # yellow-white
    G = "G-type"  # yellow (the Sun is one)
    K = "K-type"  # orange
    M = "M-type"  # red dwarfs: cool and faint

# Planet classes an exoplanet can be assigned. The str mixin makes every
# member compare equal to its label.
class ExoplanetType(str, Enum):
    GAS_GIANT = "Gas Giant"
    NEPTUNE_LIKE = "Neptune-like"
    SUPER_EARTH = "Super-Earth"
    TERRESTRIAL = "Terrestrial"


# One publication reference: a validated URL plus a short summary.
class Publication(BaseModel):
    link: HttpUrl = Field(
        description="URL link to the publication",
    )
    description: str = Field(
        description="Brief description of the publication",
    )


# Full record for one exoplanet: its own physical and orbital parameters,
# its classification, its host star, optional image assets and optional
# publications. Every field without an explicit default is required.
class ExoplanetModel(BaseModel):
    # --- Planet: identity, physical and orbital parameters ---
    name: str = Field(
        description="Name of the exoplanet",
    )
    description: str = Field(
        description="A brief description of the exoplanet",
    )
    mass: float = Field(
        description="Mass of the exoplanet in Earth masses (M⊕)",
    )
    radius: float = Field(
        description="Radius of the exoplanet in Earth radii (R⊕)",
    )
    orbital_period: float = Field(
        description="Orbital period in Earth days",
    )
    semi_major_axis: float = Field(
        description="Semi-major axis of the orbit in AU (astronomical units)",
    )
    eccentricity: float = Field(
        description="Orbital eccentricity (0 = circular orbit)",
    )
    temperature: float = Field(
        description="Estimated surface temperature of the exoplanet in Kelvin",
    )
    gravity: float = Field(
        description="Gravity index on the exoplanet relative to Earth's gravity",
    )
    density: float = Field(
        description="Density of the exoplanet in g/cm³",
    )
    habitability: bool = Field(
        description="Whether the planet is considered habitable (True/False)",
    )
    surface_conditions: str = Field(
        description="Description of surface conditions (e.g., rocky, gaseous, water presence)",
    )
    age: float = Field(
        description="Age of the exoplanet in billion years (Gyr)",
    )
    distance_from_earth: float = Field(
        description="Distance from Earth in light years (ly)",
    )
    travel_time: float = Field(
        description="Estimated travel time to the planet using current spacecraft in years",
    )
    discovered_method: str = Field(
        description="Method used to discover the exoplanet (e.g., Transit, Radial Velocity)",
    )

    # --- Planet classification ---
    exoplanet_type: ExoplanetType = Field(
        description="Type of the exoplanet (Gas Giant, Neptune-like, Super-Earth, Terrestrial)",
    )

    # --- Host star ---
    star_name: str = Field(
        description="Name of the host star",
    )
    star_type: StarType = Field(
        description="Enum value representing the star's type (spectral class)",
    )
    star_mass: float = Field(
        description="Mass of the star in solar masses (Msun)",
    )
    star_radius: float = Field(
        description="Radius of the star in solar radii (Rsun)",
    )
    star_temperature: float = Field(
        description="Temperature of the star in Kelvin",
    )
    star_age: float = Field(
        description="Age of the star in billion years (Gyr)",
    )

    # --- Visual assets (all optional, default to None) ---
    planet_texture: Optional[str] = Field(
        default=None,
        description="Image file path of the visual texture of the planet",
    )
    star_texture: Optional[str] = Field(
        default=None,
        description="Image file path of the visual texture of the host star",
    )
    surface_photos: Optional[List[str]] = Field(
        default=None,
        description="List of image file paths for surface photos",
    )
    locals_portrait: Optional[str] = Field(
        default=None,
        description="Image file path of the locals' portrait",
    )
    flora_photos: Optional[List[str]] = Field(
        default=None,
        description="List of image file paths for possible flora",
    )
    camp_photo: Optional[str] = Field(
        default=None,
        description="Image file path for a possible camp photo",
    )
    background: Optional[str] = Field(
        default=None,
        description="Image file path for the background of the planet",
    )

    # --- Publications about the exoplanet (optional) ---
    publications: Optional[List[Publication]] = Field(
        default=None,
        description="List of publications with links and descriptions",
    )


In [None]:
# TODO: Add detailed prompt
# Template text with two placeholders: {planet_information} and
# {format_instructions}. Written line by line so the leading and trailing
# newlines of the template are explicit.
prompt_template_string = (
    "\n"
    "Parse the following content and generate a description of the planet.\n"
    "\n"
    "{planet_information}\n"
    "\n"
    "{format_instructions}\n"
)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Parser bound to ExoplanetModel; it also supplies the format instructions
# that are embedded in the prompt.
planet_parser = PydanticOutputParser(pydantic_object=ExoplanetModel)

# The format instructions are fixed, so they are bound once as a partial
# variable; callers only have to provide `planet_information`.
_format_instructions = planet_parser.get_format_instructions()
prompt = PromptTemplate.from_template(
    template=prompt_template_string,
    partial_variables={"format_instructions": _format_instructions},
)

def _model_to_plain(obj):
    """Fallback for ``json.dump``: export a model-like object as plain data.

    Objects exposing ``model_dump()`` or ``dict()`` are converted by calling
    that method; anything else is rejected the same way the json module
    would reject it.

    Raises:
        TypeError: If ``obj`` offers neither export method.
    """
    for method_name in ("model_dump", "dict"):
        export = getattr(obj, method_name, None)
        if callable(export):
            return export()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def to_json_file(data, filename):
    """Serialize ``data`` as JSON into ``filename``, overwriting the file.

    Plain JSON-compatible data is written exactly as ``json.dump`` would
    write it. Model objects (anything with ``model_dump()`` or ``dict()``),
    alone or nested inside lists and dicts, are exported to plain data first
    instead of failing with ``TypeError``.

    Args:
        data: The value to serialize.
        filename: Path of the file to write.

    Raises:
        TypeError: If ``data`` contains an object that is neither
            JSON-serializable nor exportable as described above.
    """
    import json

    with open(filename, "w", encoding="utf-8") as f:
        # `default` is only consulted for objects json cannot encode itself.
        json.dump(data, f, default=_model_to_plain)

def parse_planet_resource(urls: List[str]):
    """Scrape every URL describing one planet and parse them into one record.

    Each URL is fetched with ``get_planet_description``; the text of the
    resulting documents is joined and sent through ``prompt | llm |
    planet_parser``.

    Args:
        urls: Pages that all describe the same planet. Must not be empty.

    Returns:
        Whatever ``planet_parser`` produces for the model reply.

    Raises:
        ValueError: If ``urls`` is empty, since there is nothing to parse.
    """
    if not urls:
        raise ValueError("parse_planet_resource needs at least one URL")

    documents = [get_planet_description(url) for url in urls]
    # Pass only the page text: formatting the document objects themselves
    # would put their repr (metadata and escaped newlines) into the prompt.
    planet_information = "\n\n".join(doc.page_content for doc in documents)

    chain = prompt | llm | planet_parser
    return chain.invoke({"planet_information": planet_information})


In [None]:
# TODO: Add urls grouped by planet - each planet is a list of urls
resources = [
    [],
    []
]
# Groups that are still empty placeholders are skipped, so no scraping or
# model call is made for a planet that has no source pages yet.
planets = [
    parse_planet_resource(planet_urls)
    for planet_urls in resources
    if planet_urls
]
to_json_file(planets, "planets.json")