<a href="https://colab.research.google.com/github/dcher95/universe7/blob/herbarium/herbarium/simplesoln-geoloc_chatgpt%2Bosmapi%2Bvector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install folium



In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

GEOLocate Batch Processing Tool: Used just for localities.

In [2]:
## Simple Solution

# Use ChatGPT to disambiguate environmental descriptors, objects / named entities and {distance, direction} between objects
# Convert {distance + direction} into a vector using geopandas
# Use NER / Object Detection on the object in the satellite image.
# Add a bunch of vectors stemming off the object.
# Compare multiple vectors to find an area
# Use SDM or Sat2Cap to refine it. Maybe if it can dissociate the location.

In [3]:
!wget https://api.gbif.org/v1/occurrence/download/request/0116573-240626123714530.zip -O /content/0116573-240626123714530.zip --quiet

In [4]:
import zipfile
import pandas as pd
import folium
from geopy.geocoders import Nominatim

In [5]:
with zipfile.ZipFile("/content/0116573-240626123714530.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/MO-herbarium")  # Specify the extraction path

In [24]:
# images of plants in MO
images = pd.read_csv("/content/MO-herbarium/multimedia.txt", sep="\t")

# location information & occurrence info
occ = pd.read_csv("/content/MO-herbarium/occurrence.txt", sep="\t")
occ = occ[[
    'gbifID', 'occurrenceID', # Joining Keys
    'speciesKey', 'species', # species
    'year', 'month', 'day', # date
    'habitat', 'locality', # descriptive text. I think this is most of the route / habitat info we would want. Looks like already OCR processed?
    'countryCode', 'stateProvince', 'county', 'municipality', # administrative
    'georeferenceSources','hasCoordinate', 'hasGeospatialIssues' ,'decimalLatitude', 'decimalLongitude']] # geospatial
    # 'level0Gid', 'level0Name', 'level1Gid', 'level1Name', 'level2Gid', 'level2Name', 'level3Gid', 'level3Name',]] # Has NAs. Above is more reliable.

# lot of occurrences don't have pictures. Can we still use those for the species map?
merged = images.merge(occ, on = ['gbifID'])

merged[['identifier', 'county', 'municipality', 'decimalLatitude', 'decimalLongitude', 'habitat', 'locality']].head()

  occ = pd.read_csv("/content/MO-herbarium/occurrence.txt", sep="\t")


Unnamed: 0,identifier,county,municipality,decimalLatitude,decimalLongitude,habitat,locality
0,http://sweetgum.nybg.org/images3/4081/461/0266...,Lewis Co.,,40.00345,-91.51426,,"Quartz, northeast bay, 50 meters southeast of ..."
1,http://sweetgum.nybg.org/images3/4007/585/0428...,Carter Co.,,36.95,-90.9917,moist limestone slope bluffs,"Ozark national Scenic Riverways, Big Spring ar..."
2,http://sweetgum.nybg.org/images3/4007/584/0428...,Taney Co.,,36.68,-92.88,dry oak woods with cherty soils over Cotter do...,"Mark Twain National Forest, Hercules Glades Wi..."
3,http://sweetgum.nybg.org/images3/4001/895/0430...,Ozark Co.,,36.78,-93.37,wooded upland,"Ella Alford property, a large wooded tract alo..."
4,http://sweetgum.nybg.org/images3/3997/615/0444...,Texas Co.,,,,,Houston.


In [7]:
display(merged.loc[2,'habitat'])
display(merged.loc[2,'locality'])

display(merged.loc[3,'habitat'])
display(merged.loc[3,'locality'])

display(merged.loc[1,'habitat'])
display(merged.loc[1,'locality'])

'dry oak woods with cherty soils over Cotter dolomite'

'Mark Twain National Forest, Hercules Glades Wilderness Area, from Hercules Tower off MO125 to Pole Hollow'

'wooded upland'

'Ella Alford property, a large wooded tract along south side of Bryant Creek, northeast of Brixey'

'moist limestone slope bluffs'

'Ozark national Scenic Riverways, Big Spring area, along trail from Big Spring Lodge to Chub Hollow along Current River'

In [8]:
print('No habitat info:', merged['habitat'].isna().sum() / merged.shape[0])
print('No locality info:', merged['locality'].isna().sum() / merged.shape[0])

coord_df = merged[merged['hasCoordinate'] == True]
no_coord_df = merged[merged['hasCoordinate'] == False]
print('No coordinate:', no_coord_df.shape[0] / merged.shape[0])
print(f'Given no coordinate, has habitat:', no_coord_df[~no_coord_df['habitat'].isna()].shape[0] / no_coord_df.shape[0])
print(f'Given no coordinate, has locality:', no_coord_df[~no_coord_df['locality'].isna()].shape[0] / no_coord_df.shape[0])
print(f'Given no coordinate, has habitat & locality:', no_coord_df[(~no_coord_df['habitat'].isna() & ~no_coord_df['locality'].isna())].shape[0] / no_coord_df.shape[0])

No habitat info: 0.8458226826315082
No locality info: 0.22521189290999596
No coordinate: 0.48567200322884435
Given no coordinate, has habitat: 0.10470914127423822
Given no coordinate, has locality: 0.9119113573407203
Given no coordinate, has habitat & locality: 0.1041551246537396


In [23]:
# Use ChatGPT to disambiguate the direction into {object, distance, direction}
coord_has_locality_df = coord_df[~coord_df['locality'].isna()]
coord_has_locality_df[['identifier', 'county', 'municipality', 'decimalLatitude', 'decimalLongitude', 'habitat', 'locality']].sort_values('decimalLatitude')

Unnamed: 0,identifier,county,municipality,decimalLatitude,decimalLongitude,habitat,locality
6743,http://sweetgum.nybg.org/images3/1778/634/0237...,Dunklin Co.,,36.105969,-90.165758,,.
4831,http://sweetgum.nybg.org/images3/1939/184/0251...,Jefferson Co.,,36.125000,-90.675000,,"edge of Big River, ca .1 Mi NE of its juncton ..."
4548,http://sweetgum.nybg.org/images3/907/151/17617...,Dunklin Co.,,36.236176,-90.055649,,Kennett.
512,http://sweetgum.nybg.org/images3/1122/417/0196...,,,36.236176,-90.055649,,Kennett
1680,http://sweetgum.nybg.org/images3/1944/544/0250...,Dunklin Co.,,36.236176,-90.055649,,Kennett
...,...,...,...,...,...,...,...
5710,http://sweetgum.nybg.org/images3/1741/895/0248...,,,40.521151,-91.635439,,Dumas
4358,http://sweetgum.nybg.org/images3/1853/174/0019...,Clark Co.,,40.521151,-91.635439,Rich shaded banks,Dumas
3235,http://sweetgum.nybg.org/images3/611/875/01472...,Clark Co.,,40.521151,-91.635439,Open woods.,Dumas
6283,http://sweetgum.nybg.org/images3/1763/982/0250...,,,40.521151,-91.635439,,Dumas


### Example 1: Just NER, no relation

In [36]:
lat, lon = merged.loc[2,'decimalLatitude'], merged.loc[2,'decimalLongitude']

In [75]:
prompt = "Dissociate the following text into a dictionary with three keys, Location, Environment and Relations. Location contains the named places or geographical entities. Environment are the attributes or descriptors related to the geography or vegetation of the area. Relations are the spatial relationships between different entities, using phrases that has the named entity or object, and their directional and distance cues as seperate values, as well as relations between objects. This dictionary should be structured so that it can be fed into geometric calculations to develop vectors."
input = "Taney, Co., MO. dry oak woods with cherty soils over Cotter dolomite. Mark Twain National Forest, Hercules Glades Wilderness Area, from Hercules Tower off MO125 to Pole Hollow"

display(prompt)
display(input)

'Dissociate the following text into a dictionary with three keys, Location, Environment and Relations. Location contains the named places or geographical entities. Environment are the attributes or descriptors related to the geography or vegetation of the area. Relations are the spatial relationships between different entities, using phrases that has the named entity or object, and their directional and distance cues as seperate values, as well as relations between objects. This dictionary should be structured so that it can be fed into geometric calculations to develop vectors.'

'Taney, Co., MO. dry oak woods with cherty soils over Cotter dolomite. Mark Twain National Forest, Hercules Glades Wilderness Area, from Hercules Tower off MO125 to Pole Hollow'

In [71]:
output = {
    "Location": [
        "Taney County, MO",
        "Mark Twain National Forest",
        "Hercules Glades Wilderness Area",
        "Hercules Tower",
        "Pole Hollow"
    ],
    "Environment": [
        "dry oak woods",
        "cherty soils",
        "Cotter dolomite"
    ],
    "Relations": [
        {
            "entity": "Hercules Tower",
            "direction": "from",
            "target": "Pole Hollow",
            "distance": "unknown"
        },
        {
            "entity": "Hercules Glades Wilderness Area",
            "relation": "is within",
            "target": "Mark Twain National Forest"
        }
    ]
}

Example geocoding using named entity

In [72]:
# Initialize the Nominatim geocoder
geolocator = Nominatim(user_agent="geoapi_herbarium")

locations = {}
for location in output['Location']:
    # Geocode the location
    geocoded = geolocator.geocode(f"{location}, MO")
    locations[location] = geocoded
    print(geocoded)

Taney County, Missouri, United States
Mark Twain National Forest, Missouri, United States
Hercules Glades Wilderness Area, Taney County, Missouri, United States
Hercules Lookout Tower, tower trail, Taney County, Missouri, United States
Cedar Pole Hollow, Junction Ferry, Shannon County, Missouri, United States


In [74]:
# Create a base map
m = folium.Map(location=[37.0, -91.0], zoom_start=7)  # Adjust center and zoom level

# Add bounding boxes to the map
for location, geocoded in locations.items():
  if geocoded is not None:
    south_lat, north_lat, west_lon, east_lon = [float(coord) for coord in geocoded.raw['boundingbox']]
    folium.Rectangle(
        bounds=[[south_lat, west_lon], [north_lat, east_lon]],
        color='blue',
        fill=True,
        fill_opacity=0.2,
        popup= location
    ).add_to(m)

folium.Marker(
    location=(lat, lon),
    popup='Actual Coordinate',  # Optional popup text
    icon=folium.Icon(color='red')  # Customize marker color
).add_to(m)

# Save the map to an HTML file or display in a Jupyter notebook
m.save('bounding_boxes_map_ex1.html')
# If using a Jupyter notebook, simply display m
m

Example 2

In [89]:
ex2 = coord_has_locality_df.reset_index(drop=True).iloc[2464,][['identifier', 'county', 'stateProvince', 'municipality', 'decimalLatitude', 'decimalLongitude', 'habitat', 'locality']]

In [95]:
input_parts = [value for value in [ex2['county'], ex2['stateProvince'], ex2['habitat'], ex2['locality']] if pd.notna(value)]
input = '. '.join(input_parts) + '.'

In [119]:
lat, lon = ex2['decimalLatitude'], ex2['decimalLongitude']
prompt = "Dissociate the following text into a dictionary with three keys, Location, Environment and Relations. Location contains the named places or geographical entities. Environment are the attributes or descriptors related to the geography or vegetation of the area. Relations are the spatial relationships between different entities, using phrases that has the named entity or object. The direction and distance should be explicitly listed in the dictionary. Distance and direction should be interpretable as vectors with singular values, if possible."
display(prompt)
display(input)

'Dissociate the following text into a dictionary with three keys, Location, Environment and Relations. Location contains the named places or geographical entities. Environment are the attributes or descriptors related to the geography or vegetation of the area. Relations are the spatial relationships between different entities, using phrases that has the named entity or object. The direction and distance should be explicitly listed in the dictionary. Distance and direction should be interpretable as vectors with singular values, if possible.'

'Jefferson Co.. Missouri. 6.5 mi (10.5 km) W of DeSoto; Among woods on ridgetop, cherty substrate, above bluff near Big River, ca. 0.1 mi (0.16 km) NNE of its confluence with Parker Creek..'

In [147]:
output = {
    "Location": [
        "Jefferson County, Missouri",
        "De Soto", #DeSoto in text. diff place. manually changed
        "Big River",
        "Parker Creek"
    ],
    "Environment": [
        "woods on ridgetop",
        "cherty substrate",
        "bluff near Big River"
    ],
    "Relations": [
        {
            "entity": "De Soto",
            "direction": "W",
            "distance_miles": 6.5,
            "distance_km": 10.5
        },
        {
            "entity": "Big River",
            "direction": "NNE",
            "distance_miles": 0.1,
            "distance_km": 0.16,
            "relation": "above bluff near"
        },
        {
            "entity": "Big River",
            "relation": "is near",
            "target": "confluence with Parker Creek"
        }
    ]
}



In [121]:
# Initialize the Nominatim geocoder
geolocator = Nominatim(user_agent="geoapi_herbarium")

locations = {}
for location in output['Location']:
    # Geocode the location
    geocoded = geolocator.geocode(f"{location}, MO")
    locations[location] = geocoded
    print(geocoded)

Jefferson County, Missouri, United States
De Soto, Valle Township, Jefferson County, Missouri, 63024, United States
Big River, Missouri, United States
Parker Creek, Valle Township, Jefferson County, Missouri, United States


In [122]:
# Create a base map
m = folium.Map(location=[37.0, -91.0], zoom_start=7)  # Adjust center and zoom level

# Add bounding boxes to the map
for location, geocoded in locations.items():
  if geocoded is not None:
      south_lat, north_lat, west_lon, east_lon = [float(coord) for coord in geocoded.raw['boundingbox']]
      folium.Rectangle(
          bounds=[[south_lat, west_lon], [north_lat, east_lon]],
          color='blue',
          fill=True,
          fill_opacity=0.2,
          popup= location
      ).add_to(m)

folium.Marker(
    location=(lat, lon),
    popup='Actual Coordinate',  # Optional popup text
    icon=folium.Icon(color='red')  # Customize marker color
).add_to(m)

# Save the map to an HTML file or display in a Jupyter notebook
m.save('bounding_boxes_map_ex2.html')
# If using a Jupyter notebook, simply display m
m


In [129]:
for entity in output['Relations']:
  if 'direction' in entity:
    print(entity['distance_miles'])



6.5
0.1


In [175]:
params = output['Relations'][0]
params

{'entity': 'De Soto',
 'direction': 'W',
 'distance_miles': 6.5,
 'distance_km': 10.5}

In [176]:
import math

def calculate_new_bounding_box(bounding_box, direction, distance_miles):
    # Convert miles to degrees (approximation)
    distance_degrees = distance_miles * 0.01449275362

    # Original bounding box coordinates
    south_lat = float(bounding_box[0])
    north_lat = float(bounding_box[1])
    west_lon = float(bounding_box[2])
    east_lon = float(bounding_box[3])

    # Initialize offsets
    lat_offset = 0
    lon_offset = 0

    # Calculate offsets based on direction
    if direction == 'N':
        lat_offset = distance_degrees
    elif direction == 'S':
        lat_offset = -distance_degrees
    elif direction == 'E':
        lon_offset = distance_degrees
    elif direction == 'W':
        lon_offset = -distance_degrees
    elif direction == 'NE':
        lat_offset = distance_degrees * math.cos(math.radians(45))
        lon_offset = distance_degrees * math.sin(math.radians(45))
    elif direction == 'SE':
        lat_offset = -distance_degrees * math.cos(math.radians(45))
        lon_offset = distance_degrees * math.sin(math.radians(45))
    elif direction == 'SW':
        lat_offset = -distance_degrees * math.cos(math.radians(45))
        lon_offset = -distance_degrees * math.sin(math.radians(45))
    elif direction == 'NW':
        lat_offset = distance_degrees * math.cos(math.radians(45))
        lon_offset = -distance_degrees * math.sin(math.radians(45))
    else:
        raise ValueError(f"Invalid direction: {direction}")

    # Calculate new bounding box coordinates
    new_south_lat = south_lat + lat_offset
    new_north_lat = north_lat + lat_offset
    new_west_lon = west_lon + lon_offset
    new_east_lon = east_lon + lon_offset

    return [new_south_lat, new_north_lat, new_west_lon, new_east_lon]


In [179]:
bounding_box = locations[params['entity']].raw['boundingbox']

# Calculate new bounding box
new_bounding_box = calculate_new_bounding_box(bounding_box, params['direction'], params['distance_miles'])

# Create a map centered around the original bounding box
m = folium.Map(location=[(float(bounding_box[0]) + float(bounding_box[1])) / 2, (float(bounding_box[2]) + float(bounding_box[3])) / 2], zoom_start=12)

# Add the original bounding box
folium.Rectangle(
    bounds=[[float(bounding_box[0]), float(bounding_box[2])], [float(bounding_box[1]), float(bounding_box[3])]],
    color='blue',
    fill=True,
    fill_opacity=0.2,
    popup='Original Bounding Box'
).add_to(m)

# Add the new bounding box based on direction
folium.Rectangle(
    bounds=[[new_bounding_box[0], new_bounding_box[2]], [new_bounding_box[1], new_bounding_box[3]]],
    color='green',
    fill=True,
    fill_opacity=0.2,
    popup='New Bounding Box'
).add_to(m)

folium.Marker(
    location=(lat, lon),
    popup='Actual Coordinate',  # Optional popup text
    icon=folium.Icon(color='red')  # Customize marker color
).add_to(m)

# Save the map to an HTML file
m

In [173]:
# It's a bit complicated to do NNE, so I'll just change to NE
display(output['Relations'][1])
params2 = {'entity': 'Big River',
 'direction': 'NE',
 'distance_miles': 0.1,
 'distance_km': 0.16,
 'relation': 'above bluff near'}

# It's actually a little North of Parker Creek -- but that's not picked up!

{'entity': 'Big River',
 'direction': 'NNE',
 'distance_miles': 0.1,
 'distance_km': 0.16,
 'relation': 'above bluff near'}

In [178]:
bounding_box = locations['Parker Creek'].raw['boundingbox']

# Calculate new bounding box
new_bounding_box2 = calculate_new_bounding_box(bounding_box, params2['direction'], params2['distance_miles'])

# Create a map centered around the original bounding box
m = folium.Map(location=[(float(bounding_box[0]) + float(bounding_box[1])) / 2, (float(bounding_box[2]) + float(bounding_box[3])) / 2], zoom_start=12)

# Add the original bounding box
folium.Rectangle(
    bounds=[[float(bounding_box[0]), float(bounding_box[2])], [float(bounding_box[1]), float(bounding_box[3])]],
    color='blue',
    fill=True,
    fill_opacity=0.2,
    popup='Original Bounding Box'
).add_to(m)

# Add the new bounding box based on direction
folium.Rectangle(
    bounds=[[new_bounding_box2[0], new_bounding_box2[2]], [new_bounding_box2[1], new_bounding_box2[3]]],
    color='green',
    fill=True,
    fill_opacity=0.2,
    popup='New Bounding Box'
).add_to(m)

folium.Marker(
    location=(lat, lon),
    popup='Actual Coordinate',  # Optional popup text
    icon=folium.Icon(color='red')  # Customize marker color
).add_to(m)

# Save the map to an HTML file
m

In [182]:
m = folium.Map(location=[(float(bounding_box[0]) + float(bounding_box[1])) / 2, (float(bounding_box[2]) + float(bounding_box[3])) / 2], zoom_start=12)

# Add the new bounding box based on direction
folium.Rectangle(
    bounds=[[new_bounding_box2[0], new_bounding_box2[2]], [new_bounding_box2[1], new_bounding_box2[3]]],
    color='green',
    fill=True,
    fill_opacity=0.2,
    popup='New Bounding Box2'
).add_to(m)

# Add the new bounding box based on direction
folium.Rectangle(
    bounds=[[new_bounding_box[0], new_bounding_box[2]], [new_bounding_box[1], new_bounding_box[3]]],
    color='green',
    fill=True,
    fill_opacity=0.2,
    popup='New Bounding Box'
).add_to(m)

folium.Marker(
    location=(lat, lon),
    popup='Actual Coordinate',  # Optional popup text
    icon=folium.Icon(color='red')  # Customize marker color
).add_to(m)

m