In [None]:
!pip install openai pandas langchain-openai langchain-community


Collecting langchain-openai
  Downloading langchain_openai-0.3.17-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3

In [None]:
import pandas as pd
import openai

# Location of the gold-standard CSV (adjust this path if necessary).
file_path = '/content/wuerttemberg_fauna_gold.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Quick look at the first rows and at the available column names.
print("Data Preview:", data.head(), data.columns, sep="\n")



Data Preview:
     amt  year                                              chunk   type  \
0  Aalen  1854  Von Fledermäusen sind jedenfalls die langohrig...  fauna   
1  Aalen  1854  Von Fledermäusen sind jedenfalls die langohrig...  fauna   
2  Aalen  1854  Von Fledermäusen sind jedenfalls die langohrig...  fauna   
3  Aalen  1854  Von Fledermäusen sind jedenfalls die langohrig...  fauna   
4  Aalen  1854  Von Fledermäusen sind jedenfalls die langohrig...  fauna   

             german_name scientific_name  present  
0  Langohrige Fledermaus             NaN     True  
1              Speckmaus             NaN     True  
2        Augenfledermaus             NaN     True  
3              Flußadler             NaN     True  
4             Gabelweihe             NaN     True  
Index(['amt', 'year', 'chunk', 'type', 'german_name', 'scientific_name',
       'present'],
      dtype='object')


In [None]:
# Collapse the table to one row per text chunk. Every other column is reduced
# to a comma-separated string of its distinct values; `present` is summed.
def _join_distinct_as_text(values):
    """Join the distinct values of a group, converting each one to `str` first."""
    return ', '.join(str(value) for value in values.unique())


def _join_distinct(values):
    """Join the distinct values of a group as they are (values must be strings)."""
    return ', '.join(values.unique())


def _join_distinct_non_null(values):
    """Join the distinct values of a group after discarding missing entries."""
    return ', '.join(values.dropna().unique())


column_reducers = {
    'amt': _join_distinct_as_text,
    'year': _join_distinct_as_text,
    'type': _join_distinct,
    'german_name': _join_distinct_non_null,
    'scientific_name': _join_distinct_non_null,
    'present': 'sum',
}
aggregated_data = data.groupby('chunk').agg(column_reducers).reset_index()

# Inspect the second aggregated row: first as a whole, then field by field so
# that long values are printed without truncation.
sample_row = aggregated_data.loc[1]
print(sample_row)
for column in ('chunk', 'german_name', 'scientific_name', 'amt', 'year', 'type'):
    print(sample_row[column])

print(len(aggregated_data))



chunk              3) Klettervögel: der Kukuk (Cuculus canorus), ...
amt                                                     Freudenstadt
year                                                            1858
type                                                           fauna
german_name        Kukuk, Schwarzspecht, dreizehiger Specht, Auer...
scientific_name    Cuculus canorus, Picus Martius, P. tridactylus...
present                                                            9
Name: 1, dtype: object
3) Klettervögel: der Kukuk (Cuculus canorus), der Schwarzspecht (Picus Martius), der dreizehige Specht (P. tridactylus). 4) Hühnerartige Vögel: der Auerhahn (Tetrao urogallus) kommt auf stillen, einsamen Höhen vor, wird übrigens in neuerer Zeit immer mehr gelichtet, das Haselhuhn (T. bonasia) hält sich mehr in den tiefer gelegenen Gegenden des Bezirks auf, das Feldhuhn (Perdix cinerea), die Wachtel (P. coturnix), die Holztaube (Columba oënas), die Ringeltaube (C. palumbus).
Kukuk, Schwarzs

In [None]:
# OPENAI detection

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
import json


import os

# One passage of source text per aggregated row. Stored under its own name so
# that `data` (the raw DataFrame the aggregation cell reads) is not overwritten
# and the earlier cells stay re-runnable.
chunks = list(aggregated_data['chunk'])

# Adjust to your API key: set OPENAI_API_KEY in the environment, or replace the
# "XXX" placeholder below.
llm = ChatOpenAI(
    model_name="gpt-4o",
    max_tokens=200,
    api_key=os.environ.get("OPENAI_API_KEY", "XXX"),
)

# Not used in this cell; kept so any other cell that expects it still finds it.
explanations = {}


def ask_llm_per_chunk(llm, chunks, build_prompt):
    """Send one prompt per chunk to the chat model and collect the replies.

    Args:
        llm: chat model exposing `invoke(messages)`.
        chunks: iterable of text passages.
        build_prompt: callable mapping a single chunk to the prompt string.

    Returns:
        List with the reply text for each chunk, in the order of `chunks`.
    """
    answers = []
    for chunk in chunks:
        messages = [HumanMessage(content=build_prompt(chunk))]
        # `invoke` is the supported entry point; calling the model object
        # directly (`llm(messages)`) is deprecated in LangChain.
        answers.append(llm.invoke(messages).content)
    return answers


def prompt_common_names_context(chunk):
    """Prompt for common names, with the expert/region framing."""
    return (
        f"You are a biodiversity expert. Identify all common names for species in this old German (Württemberg) text: '{chunk}'. "
        f"Ignore all scientific or Latin names. Please think deeply and provide JUST a comma-separated list, no numbers, "
        f"of all named common name biodiversity species. Ignore all scientific (Latin) names in the text. ADD NOTHING, provide just the list."
    )


def prompt_scientific_names_context(chunk):
    """Prompt for scientific names, with the expert/region framing."""
    return (
        f"You are a biodiversity expert. Identify all scientific names (Latin) in this old German (Württemberg) text: '{chunk}'. "
        f"Ignore all common names. Do not translate common names to scientific names. "
        f"It is possible that the text does not contain scientific names; then return: None. "
        f"Please think deeply and provide JUST a comma-separated list, no numbers. "
        f"Ignore all common names in the text. ADD NOTHING, provide just the list."
    )


def prompt_common_names_no_context(chunk):
    """Prompt for common names, without the expert/region framing."""
    return (
        f"Identify all common names for species in this text: '{chunk}'. "
        f"Ignore all scientific or Latin names. Please think deeply and provide JUST a comma-separated list, no numbers, "
        f"of all named common name species. Ignore all scientific (Latin) names in the text. ADD NOTHING, provide just the list."
    )


def prompt_scientific_names_no_context(chunk):
    """Prompt for scientific names, without the expert/region framing."""
    return (
        f"Identify all scientific names (Latin) in this text: '{chunk}'. "
        f"Ignore all common names. Do not translate common names to scientific names. "
        f"It is possible that the text does not contain scientific names; then return: None. "
        f"Please think deeply and provide JUST a comma-separated list, no numbers, "
        f"of all scientific (Latin) names in the text. ADD NOTHING, provide just the list."
    )


# Four passes over the same chunks: {common, scientific} x {context, no context}.
species_names_context = ask_llm_per_chunk(llm, chunks, prompt_common_names_context)
species_names_context_scientific = ask_llm_per_chunk(llm, chunks, prompt_scientific_names_context)
species_names_no_context = ask_llm_per_chunk(llm, chunks, prompt_common_names_no_context)
species_names_no_context_scientific = ask_llm_per_chunk(llm, chunks, prompt_scientific_names_no_context)

# Gold standard and model answers side by side, one row per chunk.
# Column labels are kept exactly as before so existing readers of Results.csv
# keep working.
df_res = pd.DataFrame({
    'Original chunks': chunks,
    'Gold Standard': list(aggregated_data['german_name']),
    'Gold Standard Scientific': list(aggregated_data['scientific_name']),
    'LLM_Preddiction_scientific_Context': species_names_context_scientific,
    'LLM_Preddiction_scientificn_No_Context': species_names_no_context_scientific,
    'LLM_Preddiction_Context': species_names_context,
    'LLM_Preddiction_No_Context': species_names_no_context,
})

df_res.to_csv('Results.csv', index=False)


['(im Kreßbach). Über die Fischverhältnisse der Bühler theilte der Fischwasser- und Jagdpächter Anton Heinz in Bühlerthann folgendes mit. Die Bühler wäre ein ausgezeichnetes reines Fischwasser, wenn nicht die Fischottern in ungewöhnlicher Zahl an derselben hausen und einen ordentlichen Fischstand schlechterdings nicht aufkommen lassen würden. Vor 2 Jahren habe er dieselben Winters bei Schnee und Mondschein auf dem Anstand abzuschießen versucht, aber nur mit großer Mühe einige erlegt. Ein besseres Resultat sei hierauf im letzten Winter mit aus Schlesien bezogenen Fischotterfallen erzielt worden, in welchen er 29 Stück gefangen habe.', '3) Klettervögel: der Kukuk (Cuculus canorus), der Schwarzspecht (Picus Martius), der dreizehige Specht (P. tridactylus). 4) Hühnerartige Vögel: der Auerhahn (Tetrao urogallus) kommt auf stillen, einsamen Höhen vor, wird übrigens in neuerer Zeit immer mehr gelichtet, das Haselhuhn (T. bonasia) hält sich mehr in den tiefer gelegenen Gegenden des Bezirks auf

NameError: name 'column_names' is not defined

In [None]:
# OPENAI matching

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
import json


import os

# Adjust to your API key: set OPENAI_API_KEY in the environment, or replace the
# "XXX" placeholder below.
llm = ChatOpenAI(
    model_name="gpt-4o",
    max_tokens=200,
    api_key=os.environ.get("OPENAI_API_KEY", "XXX"),
)

# Historic common names to look up, grouped into the categories named by the
# variables below.
historical_spelling = ["Kukuk","Punktirte Wasserläufer","Thurmfalke", "Kriekente" ]

new_words = ["Wilde Ente", "dreizehige Specht", "Wilde Schwein" ]

general_to_specific = ["Holzwespen", "Gemeine Möve", "Edelwild", "Borkenkäfer" ]

specific_to_general = ["Schwarzamsel", "Rothbraune Waldmaus"]

renaming_dialects = ["Feldhuhn", "Holztaube", "Gehaubte Steißfuß", "Groppe", "Todtenkopf", "Große Sumpfschnepfe"]

name_groups = {
    "historical_spelling": historical_spelling,
    "new_words": new_words,
    "general_to_specific": general_to_specific,
    "specific_to_general": specific_to_general,
    "renaming_dialects": renaming_dialects,
}

# Ask the model about every name of every group; replies are stored per group
# in the same order as the names.
matching_results = {}
for group, names in name_groups.items():
    answers = []
    for name in names:
        # The prompt must interpolate the current `name`, not a text chunk.
        prompt = f"""You are a biodiversity expert. If necessary, provide the modern common names for this historic common name: '{name}' found in an old German (Württemberg) text, provide short explanations."""
        messages = [HumanMessage(content=prompt)]
        # `invoke` is the supported entry point; `llm(messages)` is deprecated.
        response = llm.invoke(messages)
        answers.append(response.content)
    matching_results[group] = answers

historical_spelling_res = matching_results["historical_spelling"]
new_words_res = matching_results["new_words"]
general_to_specific_res = matching_results["general_to_specific"]
specific_to_general_res = matching_results["specific_to_general"]
renaming_dialects_res = matching_results["renaming_dialects"]

print(historical_spelling_res)
print(new_words_res)
print(general_to_specific_res)
print(specific_to_general_res)
print(renaming_dialects_res)





In [None]:
#Other models matching using BLABLADOR - https://helmholtz.cloud/services/?serviceID=d7d5c597-a2f6-4bd1-b71e-4d6499d98570
import os
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Blablador exposes an OpenAI-compatible endpoint. The endpoint and key are
# passed straight to the client rather than written into os.environ, so other
# ChatOpenAI instances in this notebook keep talking to their own endpoint.
# Adjust to your API key: set BLABLADOR_API_KEY in the environment, or replace
# the "XXX" placeholder below.
BLABLADOR_API_BASE = "https://api.helmholtz-blablador.fz-juelich.de/v1"
BLABLADOR_API_KEY = os.environ.get("BLABLADOR_API_KEY", "XXX")

# Initialize the LLM
#alias-llama3-huge is an alias for the Llama3.1 405b
#alias-fast - This alias is for model with a high throughput. As of December 2024, it's Ministral-8B-Instruct-2410.
#alias-reasoning - As of May 2024, this is pointing to the same Qwen3 30B A3B,
llm = ChatOpenAI(
    model_name="alias-fast",
    max_tokens=200,
    base_url=BLABLADOR_API_BASE,
    api_key=BLABLADOR_API_KEY,
)

# Historic common names to look up, grouped into the categories named by the
# variables below.
historical_spelling = ["Kukuk","Punktirte Wasserläufer","Thurmfalke", "Kriekente" ]

new_words = ["Wilde Ente", "dreizehige Specht", "Wilde Schwein" ]

general_to_specific = ["Holzwespen", "Gemeine Möve", "Edelwild", "Borkenkäfer" ]

specific_to_general = ["Schwarzamsel", "Rothbraune Waldmaus"]

renaming_dialects = ["Feldhuhn", "Holztaube", "Gehaubte Steißfuß", "Groppe", "Todtenkopf", "Große Sumpfschnepfe"]

name_groups = {
    "historical_spelling": historical_spelling,
    "new_words": new_words,
    "general_to_specific": general_to_specific,
    "specific_to_general": specific_to_general,
    "renaming_dialects": renaming_dialects,
}

# Ask the model about every name of every group; replies are stored per group
# in the same order as the names.
matching_results = {}
for group, names in name_groups.items():
    answers = []
    for name in names:
        # The prompt must interpolate the current `name`, not a text chunk.
        prompt = f"""You are a biodiversity expert. If necessary, provide the modern common names for this historic common name: '{name}' found in an old German (Württemberg) text, provide short explanations."""
        messages = [HumanMessage(content=prompt)]
        # `invoke` is the supported entry point; `llm(messages)` is deprecated.
        response = llm.invoke(messages)
        answers.append(response.content)
    matching_results[group] = answers

historical_spelling_res = matching_results["historical_spelling"]
new_words_res = matching_results["new_words"]
general_to_specific_res = matching_results["general_to_specific"]
specific_to_general_res = matching_results["specific_to_general"]
renaming_dialects_res = matching_results["renaming_dialects"]

print(historical_spelling_res)
print(new_words_res)
print(general_to_specific_res)
print(specific_to_general_res)
print(renaming_dialects_res)





ModuleNotFoundError: No module named 'langchain_openai'