This notebook shows how to use LLM-Geo. Uncomment one study case (Case 1 and Case 3 are recommended) to get the spatial analysis results without any human intervention.

Please check the [webpage of LLM-Geo](https://github.com/gladcolor/LLM-Geo) for more details!


#  Install package

Make sure you are using the latest version of `openai` and `geopandas`.

In [1]:
# ! pip install pyvis
# ! pip install networkx
# ! pip install openai
# ! conda update  --channel conda-forge geopandas  


# Import package

In [2]:
import os
import sys
import requests
import networkx as nx
import pandas as pd
import geopandas as gpd
from pyvis.network import Network
from openai import OpenAI
from IPython.display import display, HTML, Code
from IPython.display import clear_output

# Define Solution class
Please run the following cell to import the `Solution` class and the helper modules.

In [3]:
%load_ext autoreload
%autoreload 2

import LLM_Geo_Constants as constants
import helper

from LLM_Geo_kernel import Solution

# Add the absolute path of the local "Modules" folder to sys.path before importing data_eye
# (presumably so that modules inside that folder can import each other by bare name — TODO confirm).
sys.path.append(os.path.abspath("Modules"))    
import Modules.data_eye as data_eye

# Demonstration Cases

## Input task and data description

In [4]:
# Passed as `review=` to solution.get_LLM_responses_for_operations() and
# solution.get_LLM_assembly_response() later in this notebook.
isReview = True

In [5]:

# Case 1: population living near hazardous wastes

# task_name ='Resident_at_risk_counting'
# TASK = r"""1) Find out Census tracts that contain hazardous waste facilities, then compute and print out the population living in those tracts. The study area is North Carolina (NC), US.
# 2) Generate a population choropleth map for all tract polygons in NC, rendering the color by tract population; and then highlight the borders of tracts that have hazardous waste facilities. Please draw all polygons, not only the highlighted ones. The map size is 15*10 inches.
# """

# data location with column information
# DATA_LOCATIONS = ["NC hazardous waste facility ESRI shape file: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/HW_Sites_EPSG4326.zip.",
#                   "NC tract boundary shapefile: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/tract_37_EPSG4326.zip. The tract ID column is 'GEOID', data types is integer.",
#                   "NC tract population CSV file: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/NC_tract_population.csv. The population is stored in 'TotalPopulation' column. The tract ID column is 'GEOID', data types is integer."
#                  ]

# data location without column information
# DATA_LOCATIONS = ["NC hazardous waste facility ESRI shape file: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/HW_Sites_EPSG4326.zip.",
#                   "NC tract boundary shapefile: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/tract_37_EPSG4326.zip.",
#                   "NC tract population CSV file: https://github.com/gladcolor/LLM-Geo/raw/master/overlay_analysis/NC_tract_population.csv."
#                  ]
 

# # Case 3: COVID-19 death rate in US
# task_name ='COVID_death_rate'
# TASK = r'''1) Draw a choropleth map to show the death rate (death/case) of COVID-19 among the contiguous US counties. Use the accumulated COVID-19 data of 2020.12.31 to compute the death rate. Use scheme ='quantiles' when plotting the map.  Set map projection to 'Conus Albers'. Set map size to 15*10 inches.  
# 2) Draw a scatter plot to show the correlation and trend line of the death rate with the senior resident rate, including the r-square and p-value. Set data point transparency to 50%, regression line as red. Set figure size to 15*10 inches.  
# '''

# data location with column information
# DATA_LOCATIONS = [
#                   r"COVID-19 data case in 2020 (county-level): https://github.com/nytimes/covid-19-data/raw/master/us-counties-2020.csv. This data is for daily accumulated COVID cases and deaths for each county in the US. There are 6 columns: date (format: 2021-02-01), county, state, fips, cases, deaths. ",   
#                   r"Contiguous US county boundary (ESRI shapefile): https://github.com/gladcolor/spatial_data/raw/master/contiguous_counties.zip. The county FIPS column is 'GEOID'; map projection is EPSG:4269",
#                   r"Census data (ACS2020): https://raw.githubusercontent.com/gladcolor/spatial_data/master/Demography/ACS2020_5year_county.csv. The needed columns are: 'FIPS', 'Total Population', 'Total Population: 65 to 74 Years', 'Total Population: 75 to 84 Years', 'Total Population: 85 Years and Over'. Drop rows with NaN cells after loading the used columns.",
#                  ]

# data location without column information
# DATA_LOCATIONS = [
#                   r"COVID-19 data case in 2020 (county-level): https://github.com/nytimes/covid-19-data/raw/master/us-counties-2020.csv. This data is for daily accumulated COVID cases and deaths for each county in the US. ",   
#                   r"Contiguous US county boundary (ESRI shapefile): https://github.com/gladcolor/spatial_data/raw/master/contiguous_counties.zip.",
#                   r"Census data (ACS2020): https://raw.githubusercontent.com/gladcolor/spatial_data/master/Demography/ACS2020_5year_county.csv. "
#                  ]

# # Case 4: Hospital_accessibility
# task_name ='Hospital_accessibility'
# TASK = r'''
# For each zipcode area in South Carolina (SC), calculate the distance from the centroid of the zipcode area to its nearest hospital, and then create a choropleth distance map of zipcode area polygons (unit: km), also show the hospital.
# '''

## data location with column information
# DATA_LOCATIONS = [
# r"SC zipcode boundary shapefile: https://github.com/GIBDUSC/test/raw/master/sc_zip_boundary.zip, the map projection is WGS1984.",
# r"SC hospitals:  https://github.com/gladcolor/spatial_data/raw/master/South_Carolina/SC_hospitals_with_emergency_room_cleaned.csv, location columns: longitude in 'Longitude' column, latitude in 'Latitude' column.",          
# ]

# data location without column information
# DATA_LOCATIONS = [
# r"SC zipcode boundary shapefile: https://github.com/GIBDUSC/test/raw/master/sc_zip_boundary.zip.",
# r"SC hospitals:  https://github.com/gladcolor/spatial_data/raw/master/South_Carolina/SC_hospitals_with_emergency_room_cleaned.csv.",          
# ]

# Case 5: school walkability
# task_name is used below to build save_dir and is passed to Solution.
task_name = 'School walkability'

# Natural-language task description; it is passed to data_eye and to Solution below.
TASK = r'''You need to compute the walkability scores for all schools in the Columbia city. The steps are:
1) extract the road network near a school within 1 km buffer zone.
2) extract the sidewalks within 20 meters to the extracted road network in the step 1.
3) the school walkability scores is the ratio of the extracted sidewalk length to the extracted road network length.
4) Please draw a map for each school, using the school name and the walkability score as the map title, while showing the extracted sidewalks on a OpenStreetMap basemap.
'''

# data location without column information
# (the data-overview cell below passes this list to data_eye, which returns it with overviews added)
DATA_LOCATIONS = [
    r"Columbia school points: https://github.com/gladcolor/spatial_data/raw/refs/heads/master/South_Carolina/Columbia_schools.gpkg.",
    r"Columbia road network (polyline): https://github.com/gladcolor/spatial_data/raw/refs/heads/master/South_Carolina/Columbia_road.gpkg.",
    r"Columbia sidewalk network (polyline): https://github.com/gladcolor/spatial_data/raw/refs/heads/master/South_Carolina/Columbia_sidewalks.gpkg.",
]


# # Case 2: France_mobility_changes_2020  (NOTE: invalidated due to API shutdown)
# task_name ='France_mobility_changes_2020'
# TASK = r'''
# 1) Show the 2020 human mobility monthly change rates of each administrative regions in a France choropleth map. Each month is a sub-map in a map matrix, 12 months in total. All monthly maps need to use the same colorbar range (color scheme: coolwarm). The base of the change rate is January 2020. 
# 2) Draw a line chart to show the monthly change rate trends of all administrative regions. Each region is a line (the region name is the legend), the x-axis is 2020 months.
# '''

# DATA_LOCATIONS = ["ESRI shapefile for France administrative regions:" + \
#                   "https://github.com/gladcolor/LLM-Geo/raw/master/REST_API/France.zip. " + \
#                   "The 'GID_1' column is the administrative region code, 'NAME_1' column is the administrative region name.",
#                   "REST API url with parameters for daily human mobility data access:" + \
#                   "http://gis.cas.sc.edu/GeoAnalytics/REST?operation=get_daily_movement_for_all_places&source=twitter&scale=world_first_level_admin&begin=01/01/2020&end=12/31/2020." + \
#                   "The response is in CSV format. There are three columns in the response: " + \
#                   "place,date (format:2020-01-07), and intra_movement. 'place' column is the administractive region code of every country;" + \
#                   "codes for France administrative regions start with 'FRA'. Use the total intra_movement of the month as the montly mobility.",
#                  ]


# Per-task folder under the current working directory; passed to Solution as save_dir.
save_dir = os.path.join(os.getcwd(), task_name)
os.makedirs(save_dir, exist_ok=True)

# Model name passed to data_eye and Solution in the cells below.
# The previous `model = r'gpt-4o'` line was a dead store (overwritten on the very next line);
# edit this single assignment to switch models.
model = r'gpt-4o-2024-08-06'


## Get data overview (column names, data types, and map projection)

In [6]:
# Hand the task and the data locations to data_eye; the call returns two values,
# which are stored as attributes_json and as the new DATA_LOCATIONS list.
attributes_json, DATA_LOCATIONS = data_eye.add_data_overview_to_data_location(
    task=TASK,
    data_location_list=DATA_LOCATIONS,
    model=model,
)

# Header line followed by the updated list, each on its own line.
print("DATA_LOCATIONS with data overviews:", DATA_LOCATIONS, sep="\n")

DATA_LOCATIONS with data overviews:
["Columbia school points: https://github.com/gladcolor/spatial_data/raw/refs/heads/master/South_Carolina/Columbia_schools.gpkg.Columbia school points: https://github.com/gladcolor/spatial_data/raw/refs/heads/master/South_Carolina/Columbia_schools.gpkg. Data overview: {'column names and data types': '[amenity: object, ele: object, gnis:feature_id: object, name: object, id: int64, type: object, wikidata: object, operator: object, addr:city: object, addr:country: object, addr:housenumber: object, addr:postcode: object, addr:state: object, addr:street: object, contact:facebook: object, contact:twitter: object, contact:youtube: object, fax: object, grades: object, isced:level: object, operator:type: object, operator:wikidata: object, operator:wikipedia: object, phone: object, website: object, building: object]', 'Coordinate reference system': 'EPSG:4326'}", "Columbia road network (polyline): https://github.com/gladcolor/spatial_data/raw/refs/heads/master/

In [7]:
# Create the Solution object from the task settings defined in the cells above.
solution = Solution(
    task=TASK,
    task_name=task_name,
    save_dir=save_dir,
    data_locations=DATA_LOCATIONS,
    model=model,
)

# Print the header (followed by a blank line) and then the graph prompt held by the solution.
print("Prompt to get solution graph:\n", solution.graph_prompt, sep="\n")

Prompt to get solution graph:

Your role: A professional Geo-information scientist and programmer good at Python. You have worked on Geographic information science more than 20 years, and know every detail and pitfall when processing spatial data and coding. You know well how to set up workflows for spatial analysis tasks. You have significant experence on graph theory, application, and implementation. You are also experienced on generating map using Matplotlib and GeoPandas.
 

Your task: Generate a graph (data structure) only, whose nodes are (1) a series of consecutive steps and (2) data to solve this question:  
 You need to compute the walkability scores for all schools in the Colubmia city. The steps are:
1) extract the road network near a school within 1 km buffer zone.
2) extract the sidewalks within 20 meters to the extracted road network in the step 1.
3) the school walkability scores is the ratio of the extracted sidewalk length to the extracted road network length.
4) Pleas

## Get graph code from GPT API

In [8]:
# Fetch the LLM response for the graph, keep it both in a notebook variable and on
# the solution object, then call save_solution().
solution.graph_response = response_for_graph = solution.get_LLM_response_for_graph()
solution.save_solution()

# Clear this cell's earlier output (deferred until new output arrives) and show
# the graph code with Python syntax highlighting.
clear_output(wait=True)
display(Code(data=solution.code_for_graph, language='python'))

## Execute code to generate the solution graph

In [9]:
# NOTE(security): exec() runs the graph code obtained from the LLM inside this notebook's
# namespace with the notebook's privileges; read the code displayed by the previous cell
# before running this one.
exec(solution.code_for_graph)
solution_graph = solution.load_graph_file()

# Show the graph: read the GraphML file referenced by the solution and render it via helper.show_graph.
G = nx.read_graphml(solution.graph_file)  
nt = helper.show_graph(G)
html_name = os.path.join(os.getcwd(), solution.task_name + '.html')  
# The HTML file should be in the same directory (here: the current working directory). See:
# https://stackoverflow.com/questions/65564916/error-displaying-pyvis-html-inside-jupyter-lab-cell
nt.show(name=html_name)
# html_name

D:\OneDrive_PSU\OneDrive - The Pennsylvania State University\Research_doc\LLM_Geo\Python_code\School walkability.html


## Generate prompts and code for operations (functions)

In [10]:
# Get the LLM responses for the operations (review controlled by isReview), then call save_solution().
operations = solution.get_LLM_responses_for_operations(review=isReview)
solution.save_solution()

# Join the 'operation_code' entry of every operation with newlines, in list order.
all_operation_code_str = '\n'.join(op['operation_code'] for op in operations)

# Clear this cell's earlier output and show the combined operation code.
clear_output(wait=True)
display(Code(data=all_operation_code_str, language='python'))

## Generate prompts and code for assembly program

In [11]:
# Get the LLM response for the assembly program (review controlled by isReview), keep it
# both in a notebook variable and on the solution object, then call save_solution().
solution.assembly_LLM_response = assembly_LLM_response = solution.get_LLM_assembly_response(review=isReview)
solution.save_solution()

# Clear this cell's earlier output and show the assembly code.
clear_output(wait=True)
display(Code(data=solution.code_for_assembly, language='python'))

## Execute assembly code

In [12]:
# Complete program: the operation code followed by the assembly code, separated by a newline.
all_code = '\n'.join((all_operation_code_str, solution.code_for_assembly))

# display(Code(all_code, language='python'))

# Run the complete program with try_cnt=10; the value returned by
# execute_complete_program replaces all_code.
all_code = solution.execute_complete_program(code=all_code, try_cnt=10)



-------------- Running code (trial # 1/10) --------------









Error_info_str: 
Traceback (most recent call last):
  File "Complete program", line 325, in <module>
    exec('assembely_solution()')
  File "<string>", line 1, in <module>
  File "Complete program", line 234, in assembely_solution
    school_walkability_scores = compute_walkability_scores(extracted_road_network_gdf, extracted_sidewalks_gdf)
  File "Complete program", line 291, in compute_walkability_scores
    school_walkability_scores = schools_gdf.set_index('id')['name'].to_dict()
  File "d:\ProgramData\anaconda3\envs\tgi_gpt\Lib\site-packages\geopandas\geodataframe.py", line 1750, in __getitem__
    result = super().__getitem__(key)
  File "d:\ProgramData\anaconda3\envs\tgi_gpt\Lib\site-packages\pandas\core\frame.py", line 4102, in __getitem__
    indexer = self.columns.get_loc(key)
  File "d:\ProgramData\anaconda3\envs\tgi_gpt\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'name'

Sending error information to LLM fo


  verbose=True,

  self.code_for_graph = ""

  self.source_nodes = None
  self.assembly_prompt = ""




--------------- Done ---------------


