# POI Download

## POI from Overture

In [12]:
import duckdb
import geopandas
import pandas as pd

# Connect to DuckDB
con = duckdb.connect()

# Install and load necessary extensions
con.install_extension('httpfs')
con.install_extension('spatial')
con.load_extension('httpfs')
con.load_extension('spatial')

# Define the URL and bbox
url = 'https://data.source.coop/cholmes/overture/places-geoparquet-country/FR.parquet'
minx, miny, maxx, maxy = 2.321001, 48.837379, 2.373657, 48.870248

# Define the bbox polygon WKT
bbox_wkt = f'POLYGON(({minx} {miny}, {maxx} {miny}, {maxx} {maxy}, {minx} {maxy}, {minx} {miny}))'


In [13]:
schema_query = f"DESCRIBE SELECT * FROM read_parquet('{url}');"
schema = con.execute(schema_query).fetchdf()  # Or .fetchall() for a list of tuples
print(schema)

    column_name                                        column_type null   key  \
0            id                                            VARCHAR  YES  None   
1    updatetime                                            VARCHAR  YES  None   
2       version                                            INTEGER  YES  None   
3         names  STRUCT(key_value STRUCT("key" VARCHAR, "value"...  YES  None   
4    categories  STRUCT(alternate STRUCT(list VARCHAR[]), main ...  YES  None   
5    confidence                                             DOUBLE  YES  None   
6      websites                             STRUCT(list VARCHAR[])  YES  None   
7       socials                             STRUCT(list VARCHAR[])  YES  None   
8        emails                                            INTEGER  YES  None   
9        phones                             STRUCT(list VARCHAR[])  YES  None   
10        brand  STRUCT("names" STRUCT(key_value STRUCT("key" V...  YES  None   
11    addresses  STRUCT(list

In [14]:
# Query to filter and create GeoJSON
query = f"""
SELECT json_object(
    'type', 'FeatureCollection',
    'features', json_group_array(
        json_object(
            'type', 'Feature',
            'geometry', ST_AsGeoJSON(geometry)::json,
            'properties', json_object(
                'id', id,
                'confidence', confidence,
                'names', to_json(names),
                'categories', to_json(categories),
                'brand', to_json(brand),
                'addresses', to_json(addresses)
            )
        )
    )
) AS geojson
FROM (
    SELECT *
    FROM read_parquet('{url}')
    WHERE bbox.minx <= {maxx} AND bbox.maxx >= {minx} AND bbox.miny <= {maxy} AND bbox.maxy >= {miny}
    AND ST_Intersects(geometry, ST_GeomFromText('{bbox_wkt}'))
)
"""

# Execute the query and fetch the GeoJSON string
geojson_str = con.execute(query).fetchone()[0]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [15]:
# Export to local file
with open('places.geojson', 'w') as f:
    f.write(geojson_str)

print("GeoJSON exported to places.geojson")

GeoJSON exported to places.geojson


In [16]:
import geopandas as gpd
# Load the GeoJSON into a GeoDataFrame
gdf = gpd.read_file('places.geojson')
gdf.head(100)

Unnamed: 0,id,confidence,names,categories,brand,addresses,geometry
0,tmp_947C1852EE79AA7A4044012FEEFA6FDF,0.271406,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""professional_ser...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.34284 48.87018)
1,tmp_3C3FD29687593D9EB9EB6556051D41A0,0.255010,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": null }","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.33537 48.86789)
2,tmp_5C6A47005E5375F8A38EBDD94FD90452,0.857398,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""campus_building"" ]...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.33583 48.86865)
3,tmp_98DD07E263F8CFE51D05131D84D94DC1,0.520115,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""event_planning"", ""...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.33562 48.8668)
4,tmp_68E5A0310FB78F1CA845BEC28C8F4978,0.509612,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""coffee_shop"", ""res...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.32922 48.86484)
...,...,...,...,...,...,...,...
95,tmp_C117879421269D414244895C7D51BE29,0.934367,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""beauty_salon"" }","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.35444 48.86808)
96,tmp_BFA3CB17921FDD1126F78FA5C32F9659,0.664790,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""graphic_designer"",...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.33497 48.86537)
97,tmp_5EC45714EEB4636C90B38B8F305355B0,0.384468,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""accountant"", ""bar""...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.36795 48.86733)
98,tmp_46FA07AA2BA384B5ED4967793233E816,0.935415,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""shopping"", ""fashio...","{ ""names"": { ""key_value"": [ { ""key"": ""brand_na...","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (2.32229 48.86882)


### Extract main categories from overture POIs

 Extract values from key "main" to get a new column of places categories for further classification 

In [29]:
import ast

def get_main(cat):
    # If value is missing
    if cat is None:
        return None
    
    # Convert string → dict
    if isinstance(cat, str):
        try:
            cat = ast.literal_eval(cat)
        except:
            return None
    
    # Extract "main"
    if isinstance(cat, dict):
        return cat.get("main")
    
    return None

gdf["main_category"] = gdf["categories"].apply(get_main)


In [30]:
gdf[["categories", "main_category"]].head(100)


Unnamed: 0,categories,main_category
0,"{ ""alternate"": null, ""main"": ""professional_ser...",
1,"{ ""alternate"": null, ""main"": null }",
2,"{ ""alternate"": { ""list"": [ ""campus_building"" ]...",art_school
3,"{ ""alternate"": { ""list"": [ ""event_planning"", ""...",theatre
4,"{ ""alternate"": { ""list"": [ ""coffee_shop"", ""res...",cafe
...,...,...
95,"{ ""alternate"": null, ""main"": ""beauty_salon"" }",
96,"{ ""alternate"": { ""list"": [ ""graphic_designer"",...",printing_services
97,"{ ""alternate"": { ""list"": [ ""accountant"", ""bar""...",financial_service
98,"{ ""alternate"": { ""list"": [ ""shopping"", ""fashio...",clothing_store


In [31]:
unique_count = gdf["main_category"].nunique()
unique_count

608

Create a csv to produce a classification unsing LLM

In [25]:
unique = pd.DataFrame(gdf["main_category"].dropna().unique(), columns=["category"])
unique.to_csv("unique_categories.csv", index=False)


In [32]:
import geopandas
import pandas as pd
CATEGORY_MAP = {
    
    # ------------------ FOOD & DRINK ------------------
    'bar': 'Food & Drink', 'korean_restaurant': 'Food & Drink', 'fast_food_restaurant': 'Food & Drink', 
    'indian_restaurant': 'Food & Drink', 'sandwich_shop': 'Food & Drink', 'hawaiian_restaurant': 'Food & Drink', 
    'japanese_restaurant': 'Food & Drink', 'bakery': 'Food & Drink', 'karaoke': 'Food & Drink', 
    'italian_restaurant': 'Food & Drink', 'cafe': 'Food & Drink', 'wine_bar': 'Food & Drink', 
    'french_restaurant': 'Food & Drink', 'cheese_shop': 'Food & Drink', 'pizza_restaurant': 'Food & Drink', 
    'mediterranean_restaurant': 'Food & Drink', 'chinese_restaurant': 'Food & Drink', 'whiskey_bar': 'Food & Drink', 
    'sake_bar': 'Food & Drink', 'dim_sum_restaurant': 'Food & Drink', 'arabian_restaurant': 'Food & Drink', 
    'ethiopian_restaurant': 'Food & Drink', 'greek_restaurant': 'Food & Drink', 'barbecue_restaurant': 'Food & Drink', 
    'fish_and_chips_restaurant': 'Food & Drink', 'beer_bar': 'Food & Drink', 'candy_store': 'Food & Drink', 
    'live_and_raw_food_restaurant': 'Food & Drink', 'food_delivery_service': 'Food & Drink', 'portuguese_restaurant': 'Food & Drink', 
    'kurdish_restaurant': 'Food & Drink', 'cuban_restaurant': 'Food & Drink', 'halal_restaurant': 'Food & Drink', 
    'hotel_bar': 'Food & Drink', 'afghan_restaurant': 'Food & Drink', 'restaurant': 'Food & Drink', 
    'butcher_shop': 'Food & Drink', 'grocery_store': 'Food & Drink', 'tea_room': 'Food & Drink',
    'smoothie_juice_bar': 'Food & Drink', 'chocolatier': 'Food & Drink', 'ice_cream_and_frozen_yogurt_store': 'Food & Drink',
    'liquor_store': 'Food & Drink', 'breakfast_and_brunch_restaurant': 'Food & Drink', 'pub': 'Food & Drink',
    'vegetarian_restaurant': 'Food & Drink', 'asian_restaurant': 'Food & Drink', 'thai_restaurant': 'Food & Drink',
    'vietnamese_restaurant': 'Food & Drink',

    # ------------------ COMMERCIAL/RETAIL ------------------
    'bookstore': 'Commercial/Retail', 'clothing_store': 'Commercial/Retail', 'antique_store': 'Commercial/Retail', 
    'flowers_and_gifts_shop': 'Commercial/Retail', 'jewelry_store': 'Commercial/Retail', 'fashion_accessories_store': 'Commercial/Retail', 
    'furniture_store': 'Commercial/Retail', 'wholesale_store': 'Commercial/Retail', 'merchandising_service': 'Commercial/Retail', 
    "women's_clothing_store": 'Commercial/Retail', 'home_improvement_store': 'Commercial/Retail', 'key_and_locksmith': 'Commercial/Retail', 
    'fashion': 'Commercial/Retail', 'supermarket': 'Commercial/Retail', 'shoe_store': 'Commercial/Retail', 
    'arts_and_crafts': 'Commercial/Retail', 'comic_books_store': 'Commercial/Retail', 'cosmetic_and_beauty_supplies': 'Commercial/Retail', 
    'shopping': 'Commercial/Retail', 'mattress_store': 'Commercial/Retail', 'b2b_apparel': 'Commercial/Retail', 
    'audio_visual_equipment_store': 'Commercial/Retail', 'tobacco_shop': 'Commercial/Retail', 'hat_shop': 'Commercial/Retail', 
    'firework_retailer': 'Commercial/Retail', 'costume_store': 'Commercial/Retail', 'currency_exchange': 'Commercial/Retail', 
    'souvenir_shop': 'Commercial/Retail', 'swimwear_store': 'Commercial/Retail', 'computer_store': 'Commercial/Retail', 
    'night_market': 'Commercial/Retail', 'home_and_garden': 'Commercial/Retail', 'flea_market': 'Commercial/Retail', 
    'party_supply': 'Commercial/Retail', 'gun_and_ammo': 'Commercial/Retail', 'vape_shop': 'Commercial/Retail',
    'toy_and_game_store': 'Commercial/Retail', 'musical_instrument_store': 'Commercial/Retail', 'electronic_parts_supplier': 'Commercial/Retail',
    'office_equipment': 'Commercial/Retail', 'department_store': 'Commercial/Retail', 'boutique': 'Commercial/Retail',
    'thrift_store': 'Commercial/Retail', 'convenience_store': 'Commercial/Retail', "men's_clothing_store": 'Commercial/Retail',
    'hardware_store': 'Commercial/Retail', 'home_goods_store': 'Commercial/Retail', 'luggage_store': 'Commercial/Retail',
    'retail': 'Commercial/Retail',

    # ------------------ PROFESSIONAL/SERVICES ------------------
    'professional_services': 'Professional/Services', 'advertising_agency': 'Professional/Services', 'architectural_designer': 'Professional/Services', 
    'construction_services': 'Professional/Services', 'appraisal_services': 'Professional/Services', 'janitorial_services': 'Professional/Services', 
    'image_consultant': 'Professional/Services', 'writing_service': 'Professional/Services', 'private_investigation': 'Professional/Services', 
    'brokers': 'Professional/Services', 'notary_public': 'Professional/Services', 'ip_and_internet_law': 'Professional/Services', 
    'medical_law': 'Professional/Services', 'genealogists': 'Professional/Services', 'lawyer': 'Professional/Services', 
    'accounting_firm': 'Professional/Services', 'management_consultant': 'Professional/Services', 'tax_services': 'Professional/Services', 
    'public_relations_firm': 'Professional/Services', 'contractor': 'Professional/Services', 'graphic_designer': 'Professional/Services',
    'event_planning': 'Professional/Services', 'commercial_industrial': 'Professional/Services',

    # ------------------ RECREATION & ARTS ------------------
    'dance_club': 'Recreation & Arts', 'sports_club_and_league': 'Recreation & Arts', 'yoga_studio': 'Recreation & Arts', 
    'art_gallery': 'Recreation & Arts', 'arts_and_entertainment': 'Recreation & Arts', 'arts_and_crafts': 'Recreation & Arts', 
    'active_life': 'Recreation & Arts', 'amusement_park': 'Recreation & Arts', 'atv_rentals_and_tours': 'Recreation & Arts', 
    'martial_arts_club': 'Recreation & Arts', 'opera_and_ballet': 'Recreation & Arts', 'bus_tours': 'Recreation & Arts', 
    'zoo': 'Recreation & Arts', 'sports_bar': 'Recreation & Arts', 'arcade': 'Recreation & Arts', 
    'ice_skating_rink': 'Recreation & Arts', 'gymnastics_center': 'Recreation & Arts', 'tennis_court': 'Recreation & Arts', 
    'fair': 'Recreation & Arts', 'playground': 'Recreation & Arts', 'science_museum': 'Recreation & Arts', 
    'theatre': 'Recreation & Arts', 'bowling_alley': 'Recreation & Arts', 'fitness_center': 'Recreation & Arts',
    'swimming_pool': 'Recreation & Arts', 'botanic_garden': 'Recreation & Arts', 'park': 'Recreation & Arts',
    'library': 'Recreation & Arts', 'museum': 'Recreation & Arts', 'hobby_shop': 'Recreation & Arts',
    'art_museum': 'Recreation & Arts', 'cinema': 'Recreation & Arts', 'music_venue': 'Recreation & Arts',
    'tours': 'Recreation & Arts', 'travel_services': 'Recreation & Arts',

    # ------------------ HEALTH & MEDICAL ------------------
    'hospital': 'Health & Medical', 'health_and_medical': 'Health & Medical', 'dermatologist': 'Health & Medical', 
    'cosmetic_dentist': 'Health & Medical', 'plastic_surgeon': 'Health & Medical', 'prenatal_perinatal_care': 'Health & Medical', 
    'home_health_care': 'Health & Medical', 'neurologist': 'Health & Medical', 'orthopedist': 'Health & Medical', 
    'nurse_practitioner': 'Health & Medical', 'fertility': 'Health & Medical', 'pharmacy': 'Health & Medical',
    'dental_clinic': 'Health & Medical', 'optometrist': 'Health & Medical', 'veterinarian': 'Health & Medical',
    'psychologist': 'Health & Medical', 'chiropractor': 'Health & Medical', 'physical_therapist': 'Health & Medical',
    'audiologist': 'Health & Medical', 'physical_therapy': 'Health & Medical',

    # ------------------ EDUCATION ------------------
    'college_university': 'Education', 'school': 'Education', 'education': 'Education', 'dance_school': 'Education', 
    'cooking_school': 'Education', 'computer_coaching': 'Education', 'nursing_school': 'Education', 
    'medical_school': 'Education', 'private_school': 'Education', 'educational_services': 'Education', 
    'middle_school': 'Education', 'language_school': 'Education', 'driving_school': 'Education',
    'preschool': 'Education', 'high_school': 'Education', 'vocational_school': 'Education',

    # ------------------ ACCOMMODATION/RESIDENTIAL & REAL ESTATE ------------------
    'hotel': 'Accommodation/Residential', 'bed_and_breakfast': 'Accommodation/Residential', 'beach_resort': 'Accommodation/Residential', 
    'homeless_shelter': 'Accommodation/Residential', 'apartment_building': 'Accommodation/Residential', 'hostel': 'Accommodation/Residential',
    'rv_park': 'Accommodation/Residential', 'real_estate_property': 'Accommodation/Residential', 'real_estate_agent': 'Accommodation/Residential',
    'real_estate': 'Accommodation/Residential', 'holiday_rental_home': 'Accommodation/Residential',

    # ------------------ ADMINISTRATIVE/PUBLIC & RELIGIOUS ------------------
    'community_services_non_profits': 'Administrative/Public', 'public_service_and_government': 'Administrative/Public', 
    'charity_organization': 'Administrative/Public', 'community_center': 'Administrative/Public', 
    'local_and_state_government_offices': 'Administrative/Public', 'law_enforcement': 'Administrative/Public', 
    'disability_services_and_support_organization': 'Administrative/Public', 'food_banks': 'Administrative/Public',
    'fire_station': 'Administrative/Public', 'court': 'Administrative/Public', 'military_base': 'Administrative/Public',
    'post_office': 'Administrative/Public', 'embassy': 'Administrative/Public', 'prison_correctional_facility': 'Administrative/Public',
    'religious_organization': 'Administrative/Public', 'pentecostal_church': 'Administrative/Public',
    'labor_union': 'Administrative/Public',

    # ------------------ UTILITY/INFRASTRUCTURE ------------------
    'public_utility_company': 'Utility/Infrastructure',
    'water_sewer_and_utility_maintenance': 'Utility/Infrastructure',
    'telecommunications_company': 'Utility/Infrastructure',
    'electric_power_facility': 'Utility/Infrastructure',
    'internet_service_provider': 'Utility/Infrastructure',
    'home_security': 'Utility/Infrastructure',

    # ------------------ AUTOMOTIVE & TRANSPORT ------------------
    'airline': 'Automotive & Transport', 'airport': 'Automotive & Transport', 'car_dealer': 'Automotive & Transport', 
    'motorcycle_dealer': 'Automotive & Transport', 'auto_glass_service': 'Automotive & Transport', 
    'motorcycle_manufacturer': 'Automotive & Transport', 'auto_customization': 'Automotive & Transport', 
    'taxi_service': 'Automotive & Transport', 'railroad_freight': 'Automotive & Transport', 
    'bike_repair_maintenance': 'Automotive & Transport', 'car_repair_and_maintenance': 'Automotive & Transport',
    'parking_garage': 'Automotive & Transport', 'gas_station': 'Automotive & Transport',
    'car_wash': 'Automotive & Transport', 'car_rental': 'Automotive & Transport',
    'train_station': 'Automotive & Transport', 'parking': 'Automotive & Transport',

    # ------------------ FINANCIAL ------------------
    'bank_credit_union': 'Financial', 'investing': 'Financial', 'financial_service': 'Financial',
    'financial_services': 'Financial', 'mortgage_broker': 'Financial', 'insurance_agency': 'Financial',

    # ------------------ BEAUTY & PERSONAL CARE ------------------
    'beauty_salon': 'Beauty & Personal Care', 'barber': 'Beauty & Personal Care', 'beauty_and_spa': 'Beauty & Personal Care',
    'hair_salon': 'Beauty & Personal Care', 'massage_studio': 'Beauty & Personal Care', 'tattoo_parlor': 'Beauty & Personal Care',

    # ------------------ MEDIA & PUBLISHING ------------------
    'topic_publisher': 'Media/Publishing', 'mass_media': 'Media/Publishing', 'print_media': 'Media/Publishing',
    'media_agency': 'Media/Publishing', 'media_news_company': 'Media/Publishing', 'broadcasting_media_production': 'Media/Publishing',
    'media_critic': 'Media/Publishing', 'game_publisher': 'Media/Publishing',

    # ------------------ INDUSTRIAL/MANUFACTURING/TRADE ------------------
    'metal_supplier': 'Industrial/Trade', 'tobacco_company': 'Industrial/Trade', 'farm': 'Industrial/Trade', 
    'windows_installation': 'Industrial/Trade', 'manufacturing_facility': 'Industrial/Trade', 'trade_school': 'Industrial/Trade',
    'wholesale_trade': 'Industrial/Trade', 'industrial_company': 'Industrial/Trade', 'industrial_equipment': 'Industrial/Trade',
    
    # ------------------ OTHER/LANDMARK/MISCELLANEOUS ------------------
    'landmark_and_historical_building': 'Other/Landmark', 'archaeological_services': 'Other/Landmark', 
    'record_label': 'Other/Arts', 'sculpture_statue': 'Other/Landmark', 
    'astrologer': 'Other/Personal', 'dog_trainer': 'Other/Personal', 
    'funeral_services_and_cemeteries': 'Other/Services', 'sewing_and_alterations': 'Other/Services',
    'game_store': 'Other/Entertainment', 'computer_hardware_company': 'Other/Technology',
    'eyewear_and_optician': 'Other/Retail', 'russian_restaurant': 'Food & Drink', # Moved this back to Food & Drink
}


# 1. Apply the mapping to create the new column
# The .map() function looks up the value in 'main_category' and replaces it 
# with the corresponding value from the CATEGORY_MAP.
gdf['simplified_category'] = gdf['main_category'].map(CATEGORY_MAP)

# 2. Handle categories that were NOT in the map
# Any category not found in the map will result in NaN. We can fill these 
# with a generic 'Other' category.
gdf['simplified_category'] = gdf['simplified_category'].fillna('Other')

# 3. Verification: Print the count of points per new category
print("--- Counts of Simplified Categories ---")
print(gdf['simplified_category'].value_counts())

# 4. Show the original and new category columns for inspection
print("\n--- Verification (Categories vs. Simplified) ---")
print(gdf[['main_category', 'simplified_category']].head(20))

--- Counts of Simplified Categories ---
simplified_category
Other                        10849
Food & Drink                  4473
Commercial/Retail             4032
Recreation & Arts             2413
Professional/Services         1301
Administrative/Public          930
Accommodation/Residential      854
Education                      834
Beauty & Personal Care         801
Other/Landmark                 411
Health & Medical               409
Financial                      324
Media/Publishing               290
Other/Retail                   145
Automotive & Transport         145
Utility/Infrastructure          58
Other/Technology                36
Industrial/Trade                21
Other/Services                  18
Other/Arts                      12
Other/Personal                   2
Name: count, dtype: int64

--- Verification (Categories vs. Simplified) ---
             main_category        simplified_category
0                     None                      Other
1                    

In [33]:
# Define the output file name
output_geojson_file = "places_categorized.geojson"

gdf.to_file(output_geojson_file, driver="GeoJSON")