## Mapping Scottish place name origins using LLMs

In [None]:
import os
import pandas as pd
import folium
import seaborn as sns
from tqdm import tqdm 
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableLambda


### Run LLM query

In [None]:
# Load environment variables (via python-dotenv) and pick up the OpenAI key
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Enable the tqdm progress bar for pandas (`progress_apply`)
tqdm.pandas()

# Initialize the chat model used for every place-name query
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=openai_api_key,
)

# Pydantic model describing the structured answer expected from the LLM.
# NOTE: documented with `#` comments on purpose; a class docstring would be
# included in the model's JSON schema and therefore change the prompt text.
class GaelicCheckResponse(BaseModel):
    # Origin label chosen by the model from the fixed list in the description
    origin: str = Field(
        ...,
        description=(
            "Whether the place name has one of the following origins: "
            "Scottish Gaelic, Norse, Brittonic, Scots, English or Roman. "
            "Use only this list. If you are not sure say 'Not sure'."
        ),
    )
    # Free-text justification for the chosen origin
    reason: str = Field(
        ...,
        description=(
            "Short explanation of your reason for selecting "
            "the place name origin."
        ),
    )

# Define Output Parser: turns the LLM reply into a GaelicCheckResponse
parser = PydanticOutputParser(pydantic_object=GaelicCheckResponse)

# Define the Prompt
# `place_name` and `historic_county` are filled in on every call; the schema
# instructions come from the parser and are bound once as a partial variable.
# The context line is labelled "Historic county" so that it matches the value
# actually passed in (a county, not a country).
prompt = PromptTemplate(
    template="""
    You are an expert in toponymy and origin of Scottish place names.
    Please determine the origin of the following Scottish place names.
    I will provide the place name and the historic county (for additional context).
    
    Place name: {place_name}
    Historic county: {historic_county}
    
    Respond in JSON format following this schema:
    {format_instructions}

    """,
    input_variables=["place_name", "historic_county"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Define the chain: prompt -> LLM -> parser -> plain dict
origin_chain = prompt.pipe(
    llm,  # Send the formatted prompt to the chat model
    parser,  # Parse the reply into a GaelicCheckResponse
    RunnableLambda(lambda parsed: parsed.model_dump()),  # Pydantic model -> dict
)

def process_place_name(place_name, historic_county):
    """Ask the LLM chain for the origin of a single place name.

    Args:
        place_name: Name of the place to classify.
        historic_county: Historic county of the place, sent as extra context.

    Returns:
        A tuple ``(origin, reason)`` read from the chain's result. If any
        exception is raised while calling the chain or reading its result,
        ``(None, "Error: <exception text>")`` is returned instead, so one
        failing row does not abort a batch run.
    """
    payload = {'place_name': place_name, 'historic_county': historic_county}
    try:
        answer = origin_chain.invoke(payload)  # Call the LLM chain
        origin, reason = answer["origin"], answer["reason"]
    except Exception as err:
        # Best-effort: report the failure in the `reason` slot
        return None, f"Error: {err}"
    return origin, reason

# Load the place-name CSV and keep a single row per `placeid`
df = pd.read_csv('../place_data/IPN_GB_2024.csv', encoding='latin-1')
df = df.drop_duplicates(subset=['placeid'])

# Keep rows for Scotland with description code 'LOC'. `.copy()` makes the
# filtered result an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[(df.ctry23nm == 'Scotland') & (df.descnm == 'LOC')].copy()
df['placesort'] = df['placesort'].str.title()

# Random sample of 10 rows; every row costs one LLM call
df = df.sample(10)

# Query the LLM row by row (with a progress bar) and store both returned
# values in new `origin` / `reason` columns
df[['origin', 'reason']] = df[['placesort', 'cty23nm']].progress_apply(
    lambda x: pd.Series(process_place_name(x['placesort'], x['cty23nm'])),
    axis=1,
)

# Save data
df.to_csv('../output/IPN_GB_2024_with_origin_test.csv', index=False)
df[['origin', 'reason']]

### Check output

In [None]:
# Reload the saved results from disk and show the two LLM-generated columns
df_out = pd.read_csv('../output/IPN_GB_2024_with_origin_test.csv')
df_out.loc[:, ['origin', 'reason']]

In [None]:
# Create a Folium map centred on the mean latitude / longitude of the places
map_center = [df_out[column].mean() for column in ("lat", "long")]
m = folium.Map(location=map_center, zoom_start=5)

# Marker colour for every origin label the LLM is asked to choose from.
# Built once at module level instead of on every call.
_ORIGIN_COLORS = {
    "Scottish Gaelic": "green",
    "Norse": "red",
    "Scots": "blue",
    "English": "purple",
    "Brittonic": "black",
    "Roman": "orange",
}


# Function to assign colors
def get_marker_color(language):
    """Return the marker colour name for a place-name origin.

    Args:
        language: Origin label such as ``"Norse"``. Surrounding whitespace in
            string labels is ignored. Any other hashable value (``None``, NaN
            from a CSV round trip, ``"Not sure"``, ...) is accepted.

    Returns:
        The colour name mapped to the origin, or ``"gray"`` when the label is
        not one of the known origins.
    """
    if isinstance(language, str):
        # LLM output may carry stray whitespace around the label
        language = language.strip()
    return _ORIGIN_COLORS.get(language, "gray")  # Default to gray if unknown

# Add one marker per row: coloured by origin, with the place name and the
# LLM's reasoning in the popup
for _, place in df_out.iterrows():
    coords = [place["lat"], place["long"]]
    popup_text = f"{place['place23nm']} - {place['reason']}"
    marker_icon = folium.Icon(color=get_marker_color(place["origin"]))
    folium.Marker(location=coords, popup=popup_text, icon=marker_icon).add_to(m)

# Write the map to an HTML file, then show it as the cell output
m.save("../data/topollm_map.html")
m


In [None]:
# Percentage of places assigned to each origin
sns.histplot(
    data=df_out,
    x='origin',
    stat='percent',
)

# Test

In [None]:
# One-off call to the chain with a single hand-written input
sample_query = {'place_name': 'Cardiff', 'historic_county': 'Cardiff'}
origin_chain.invoke(sample_query)