# POI Download

## POI from Overture

In [2]:
import duckdb
import geopandas
import pandas as pd

# Single DuckDB connection shared by every query cell below.
con = duckdb.connect()

# Extensions the queries rely on: httpfs (remote file access) and spatial (ST_* functions).
# All installs run first, then all loads, in the listed order.
REQUIRED_EXTENSIONS = ('httpfs', 'spatial')
for extension_name in REQUIRED_EXTENSIONS:
    con.install_extension(extension_name)
for extension_name in REQUIRED_EXTENSIONS:
    con.load_extension(extension_name)

# Remote parquet to query (the "NG" country file) and the area of interest.
url = 'https://data.source.coop/cholmes/overture/places-geoparquet-country/NG.parquet'
minx = 11.288944
miny = 10.978943
maxx = 12.804761
maxy = 12.235339

# Closed ring of the bbox corners (first corner repeated last), rendered as polygon WKT.
bbox_ring = [(minx, miny), (maxx, miny), (maxx, maxy), (minx, maxy), (minx, miny)]
bbox_wkt = 'POLYGON((' + ', '.join(f'{x} {y}' for x, y in bbox_ring) + '))'


In [3]:
# DESCRIBE returns one row per column of the remote parquet (name, type, nullability, ...).
schema_query = "DESCRIBE SELECT * FROM read_parquet('{}');".format(url)
schema_result = con.execute(schema_query)
schema = schema_result.fetchdf()  # DataFrame; .fetchall() would return a list of tuples
print(schema)

    column_name                                        column_type null   key  \
0            id                                            VARCHAR  YES  None   
1    updatetime                                            VARCHAR  YES  None   
2       version                                            INTEGER  YES  None   
3         names  STRUCT(key_value STRUCT("key" VARCHAR, "value"...  YES  None   
4    categories  STRUCT(alternate STRUCT(list VARCHAR[]), main ...  YES  None   
5    confidence                                             DOUBLE  YES  None   
6      websites                             STRUCT(list VARCHAR[])  YES  None   
7       socials                             STRUCT(list VARCHAR[])  YES  None   
8        emails                                            INTEGER  YES  None   
9        phones                             STRUCT(list VARCHAR[])  YES  None   
10        brand  STRUCT("names" STRUCT(key_value STRUCT("key" V...  YES  None   
11    addresses  STRUCT(list

In [4]:
# Build one GeoJSON FeatureCollection entirely inside DuckDB.
#
# Inner SELECT: keep the rows whose stored `bbox` struct overlaps the query box
# (minx/miny/maxx/maxy), then confirm with ST_Intersects against the bbox polygon WKT.
# Outer SELECT: json_group_array() collects one Feature per remaining row into the
# 'features' array. The geometry is emitted with ST_AsGeoJSON() and the nested
# columns (names, categories, brand, addresses) are serialised with to_json().
query = f"""
SELECT json_object(
    'type', 'FeatureCollection',
    'features', json_group_array(
        json_object(
            'type', 'Feature',
            'geometry', ST_AsGeoJSON(geometry)::json,
            'properties', json_object(
                'id', id,
                'confidence', confidence,
                'names', to_json(names),
                'categories', to_json(categories),
                'brand', to_json(brand),
                'addresses', to_json(addresses)
            )
        )
    )
) AS geojson
FROM (
    SELECT *
    FROM read_parquet('{url}')
    WHERE bbox.minx <= {maxx} AND bbox.maxx >= {minx} AND bbox.miny <= {maxy} AND bbox.maxy >= {miny}
    AND ST_Intersects(geometry, ST_GeomFromText('{bbox_wkt}'))
)
"""

# Execute the query and fetch the GeoJSON string.
# The query selects a single `geojson` column; fetchone()[0] takes that value from the first row.
geojson_str = con.execute(query).fetchone()[0]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
# Export to local file.
# Write UTF-8 explicitly: without `encoding`, open() uses the platform's locale
# codec (e.g. cp1252 on Windows), which can fail on or mangle non-ASCII place
# names. GeoJSON is expected to be UTF-8.
with open('places.geojson', 'w', encoding='utf-8') as f:
    f.write(geojson_str)

print("GeoJSON exported to places.geojson")

GeoJSON exported to places.geojson


In [6]:
import geopandas as gpd
# Read the exported file back as a GeoDataFrame and preview its first 100 rows.
places_geojson_path = 'places.geojson'
gdf = gpd.read_file(places_geojson_path)
gdf.head(n=100)

Unnamed: 0,id,confidence,names,categories,brand,addresses,geometry
0,tmp_1D49E5776173C4A2820F48EFA446F4CF,0.538119,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""fast_food_restaura...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.33459 11.09228)
1,tmp_5C54AF8897FE97E183E7FA9D6930F57F,0.371037,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": null }","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.33774 11.09367)
2,tmp_F5F398DA2902A459212E36209AB1BB82,0.491798,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""community_servic...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.33629 11.09491)
3,tmp_AF15A4EA087E04F3C2A863C41039DA74,0.291113,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""mass_media"" }","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.33372 11.0949)
4,tmp_734C5FB2575D92D592978C6A36142673,0.750255,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""hospital"" }","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.3289 11.0928)
...,...,...,...,...,...,...,...
95,tmp_15C7C8C70AA8E1452F27AFA46D61771A,0.478061,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""accommodation"" ] }...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.97093 11.75169)
96,tmp_3305507FE99DE709F32077F5B4F9B1FF,0.518916,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""accommodation"" ] }...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.95487 11.72497)
97,tmp_72358FEB8E2624A7483223752443B9A0,0.653790,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": null, ""main"": ""central_governme...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.96256 11.74441)
98,tmp_281FEFAB23B9B80B1C1A1E07567BD802,0.571705,"{ ""key_value"": [ { ""key"": ""common"", ""value"": {...","{ ""alternate"": { ""list"": [ ""youth_organization...","{ ""names"": null, ""wikidata"": null }","{ ""list"": [ { ""key_value"": [ { ""key"": ""localit...",POINT (11.97389 11.74982)


### Extract main categories from overture POIs

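Extract the value of the `"main"` key from each `categories` entry into a new `main_category` column (falling back to the first alternate category when `"main"` is null), for use in the classification below.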

In [25]:
import json
import ast
import pandas as pd

# -----------------------------
# 1) Robust extraction of main_category from gdf["categories"]
# -----------------------------
def extract_main_category(cat):
    """Return the primary category name from one ``categories`` cell.

    Parameters
    ----------
    cat : dict, str, None or float
        Either an already-decoded mapping, or its text form (JSON, or a
        Python-literal ``repr``). ``None`` and float NaN are treated as missing.

    Returns
    -------
    object or None
        The value stored under ``"main"`` when it is not null; otherwise the
        first element of ``alternate["list"]`` when that list exists and is
        non-empty; otherwise ``None``. ``None`` is also returned when the
        input is missing, cannot be parsed, or does not decode to a mapping.
    """
    # Missing
    if cat is None or (isinstance(cat, float) and pd.isna(cat)):
        return None

    if isinstance(cat, dict):
        parsed = cat
    else:
        text = str(cat).strip()

        # Try JSON first (handles null/true/false properly)
        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):
            # Fallback: python-literal syntax (single quotes, None/True/False)
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None

    # Valid JSON is not necessarily an object: "null" decodes to None, "[...]"
    # to a list, "3" to an int. None of those have .get(), so treat them as
    # "no category" instead of raising AttributeError.
    if not isinstance(parsed, dict):
        return None

    main = parsed.get("main")
    if main is not None:
        return main

    # Fallback if main is null: take first item in alternate.list (if present)
    alt = parsed.get("alternate")
    if isinstance(alt, dict):
        alt_list = alt.get("list")
        if isinstance(alt_list, list) and alt_list:
            return alt_list[0]

    return None


# Run the extractor over every row's `categories` value and store the result.
extracted_main = gdf["categories"].apply(extract_main_category)
gdf["main_category"] = extracted_main

# Sanity check: how many rows ended up without any category.
n_missing_main = gdf["main_category"].isna().sum()
print("Missing main_category:", n_missing_main)


Missing main_category: 47


In [26]:
# Side-by-side view of the raw categories text and the extracted value.
preview_columns = ["categories", "main_category"]
gdf[preview_columns].head(100)


Unnamed: 0,categories,main_category
0,"{ ""alternate"": { ""list"": [ ""fast_food_restaura...",senior_citizen_services
1,"{ ""alternate"": null, ""main"": null }",
2,"{ ""alternate"": null, ""main"": ""community_servic...",community_services_non_profits
3,"{ ""alternate"": null, ""main"": ""mass_media"" }",mass_media
4,"{ ""alternate"": null, ""main"": ""hospital"" }",hospital
...,...,...
95,"{ ""alternate"": { ""list"": [ ""accommodation"" ] }...",sports_club_and_league
96,"{ ""alternate"": { ""list"": [ ""accommodation"" ] }...",hotel
97,"{ ""alternate"": null, ""main"": ""central_governme...",central_government_office
98,"{ ""alternate"": { ""list"": [ ""youth_organization...",theatre


In [27]:
# Number of distinct main categories (missing values are not counted).
unique_count = gdf["main_category"].nunique(dropna=True)
unique_count

91

Create a CSV of the unique categories, to be classified with an LLM.

In [28]:
# One row per distinct non-null main category, in order of first appearance.
distinct_categories = gdf["main_category"].dropna().unique()
unique = pd.DataFrame({"category": distinct_categories})
unique.to_csv("unique_categories.csv", index=False)


In [32]:
import geopandas
import pandas as pd
# Lookup from a raw `main_category` value to a simplified parent category.
# Each key must appear only once: a Python dict literal silently keeps the LAST
# value given for a repeated key.
CATEGORY_MAP = {

    # ------------------ FOOD & DRINK ------------------
    'bar': 'Food & Drink', 'korean_restaurant': 'Food & Drink', 'fast_food_restaurant': 'Food & Drink',
    'indian_restaurant': 'Food & Drink', 'sandwich_shop': 'Food & Drink', 'hawaiian_restaurant': 'Food & Drink',
    'japanese_restaurant': 'Food & Drink', 'bakery': 'Food & Drink', 'karaoke': 'Food & Drink',
    'italian_restaurant': 'Food & Drink', 'cafe': 'Food & Drink', 'wine_bar': 'Food & Drink',
    'french_restaurant': 'Food & Drink', 'cheese_shop': 'Food & Drink', 'pizza_restaurant': 'Food & Drink',
    'mediterranean_restaurant': 'Food & Drink', 'chinese_restaurant': 'Food & Drink', 'whiskey_bar': 'Food & Drink',
    'sake_bar': 'Food & Drink', 'dim_sum_restaurant': 'Food & Drink', 'arabian_restaurant': 'Food & Drink',
    'ethiopian_restaurant': 'Food & Drink', 'greek_restaurant': 'Food & Drink', 'barbecue_restaurant': 'Food & Drink',
    'fish_and_chips_restaurant': 'Food & Drink', 'beer_bar': 'Food & Drink', 'candy_store': 'Food & Drink',
    'live_and_raw_food_restaurant': 'Food & Drink', 'food_delivery_service': 'Food & Drink', 'portuguese_restaurant': 'Food & Drink',
    'kurdish_restaurant': 'Food & Drink', 'cuban_restaurant': 'Food & Drink', 'halal_restaurant': 'Food & Drink',
    'hotel_bar': 'Food & Drink', 'afghan_restaurant': 'Food & Drink', 'restaurant': 'Food & Drink',
    'butcher_shop': 'Food & Drink', 'grocery_store': 'Food & Drink', 'tea_room': 'Food & Drink',
    'smoothie_juice_bar': 'Food & Drink', 'chocolatier': 'Food & Drink', 'ice_cream_and_frozen_yogurt_store': 'Food & Drink',
    'liquor_store': 'Food & Drink', 'breakfast_and_brunch_restaurant': 'Food & Drink', 'pub': 'Food & Drink',
    'vegetarian_restaurant': 'Food & Drink', 'asian_restaurant': 'Food & Drink', 'thai_restaurant': 'Food & Drink',
    'vietnamese_restaurant': 'Food & Drink',
    'russian_restaurant': 'Food & Drink',  # previously listed under the miscellaneous section

    # ------------------ COMMERCIAL/RETAIL ------------------
    'bookstore': 'Commercial/Retail', 'clothing_store': 'Commercial/Retail', 'antique_store': 'Commercial/Retail',
    'flowers_and_gifts_shop': 'Commercial/Retail', 'jewelry_store': 'Commercial/Retail', 'fashion_accessories_store': 'Commercial/Retail',
    'furniture_store': 'Commercial/Retail', 'wholesale_store': 'Commercial/Retail', 'merchandising_service': 'Commercial/Retail',
    "women's_clothing_store": 'Commercial/Retail', 'home_improvement_store': 'Commercial/Retail', 'key_and_locksmith': 'Commercial/Retail',
    'fashion': 'Commercial/Retail', 'supermarket': 'Commercial/Retail', 'shoe_store': 'Commercial/Retail',
    'comic_books_store': 'Commercial/Retail', 'cosmetic_and_beauty_supplies': 'Commercial/Retail',
    'shopping': 'Commercial/Retail', 'mattress_store': 'Commercial/Retail', 'b2b_apparel': 'Commercial/Retail',
    'audio_visual_equipment_store': 'Commercial/Retail', 'tobacco_shop': 'Commercial/Retail', 'hat_shop': 'Commercial/Retail',
    'firework_retailer': 'Commercial/Retail', 'costume_store': 'Commercial/Retail', 'currency_exchange': 'Commercial/Retail',
    'souvenir_shop': 'Commercial/Retail', 'swimwear_store': 'Commercial/Retail', 'computer_store': 'Commercial/Retail',
    'night_market': 'Commercial/Retail', 'home_and_garden': 'Commercial/Retail', 'flea_market': 'Commercial/Retail',
    'party_supply': 'Commercial/Retail', 'gun_and_ammo': 'Commercial/Retail', 'vape_shop': 'Commercial/Retail',
    'toy_and_game_store': 'Commercial/Retail', 'musical_instrument_store': 'Commercial/Retail', 'electronic_parts_supplier': 'Commercial/Retail',
    'office_equipment': 'Commercial/Retail', 'department_store': 'Commercial/Retail', 'boutique': 'Commercial/Retail',
    'thrift_store': 'Commercial/Retail', 'convenience_store': 'Commercial/Retail', "men's_clothing_store": 'Commercial/Retail',
    'hardware_store': 'Commercial/Retail', 'home_goods_store': 'Commercial/Retail', 'luggage_store': 'Commercial/Retail',
    'retail': 'Commercial/Retail',

    # ------------------ PROFESSIONAL/SERVICES ------------------
    'professional_services': 'Professional/Services', 'advertising_agency': 'Professional/Services', 'architectural_designer': 'Professional/Services',
    'construction_services': 'Professional/Services', 'appraisal_services': 'Professional/Services', 'janitorial_services': 'Professional/Services',
    'image_consultant': 'Professional/Services', 'writing_service': 'Professional/Services', 'private_investigation': 'Professional/Services',
    'brokers': 'Professional/Services', 'notary_public': 'Professional/Services', 'ip_and_internet_law': 'Professional/Services',
    'medical_law': 'Professional/Services', 'genealogists': 'Professional/Services', 'lawyer': 'Professional/Services',
    'accounting_firm': 'Professional/Services', 'management_consultant': 'Professional/Services', 'tax_services': 'Professional/Services',
    'public_relations_firm': 'Professional/Services', 'contractor': 'Professional/Services', 'graphic_designer': 'Professional/Services',
    'event_planning': 'Professional/Services', 'commercial_industrial': 'Professional/Services',

    # ------------------ RECREATION & ARTS ------------------
    # NOTE(review): 'arts_and_crafts' used to be listed under COMMERCIAL/RETAIL as well.
    # That earlier entry was dead (this later one overrode it), so it was removed and the
    # effective value is unchanged. Confirm 'Recreation & Arts' is the intended parent.
    'dance_club': 'Recreation & Arts', 'sports_club_and_league': 'Recreation & Arts', 'yoga_studio': 'Recreation & Arts',
    'art_gallery': 'Recreation & Arts', 'arts_and_entertainment': 'Recreation & Arts', 'arts_and_crafts': 'Recreation & Arts',
    'active_life': 'Recreation & Arts', 'amusement_park': 'Recreation & Arts', 'atv_rentals_and_tours': 'Recreation & Arts',
    'martial_arts_club': 'Recreation & Arts', 'opera_and_ballet': 'Recreation & Arts', 'bus_tours': 'Recreation & Arts',
    'zoo': 'Recreation & Arts', 'sports_bar': 'Recreation & Arts', 'arcade': 'Recreation & Arts',
    'ice_skating_rink': 'Recreation & Arts', 'gymnastics_center': 'Recreation & Arts', 'tennis_court': 'Recreation & Arts',
    'fair': 'Recreation & Arts', 'playground': 'Recreation & Arts', 'science_museum': 'Recreation & Arts',
    'theatre': 'Recreation & Arts', 'bowling_alley': 'Recreation & Arts', 'fitness_center': 'Recreation & Arts',
    'swimming_pool': 'Recreation & Arts', 'botanic_garden': 'Recreation & Arts', 'park': 'Recreation & Arts',
    'library': 'Recreation & Arts', 'museum': 'Recreation & Arts', 'hobby_shop': 'Recreation & Arts',
    'art_museum': 'Recreation & Arts', 'cinema': 'Recreation & Arts', 'music_venue': 'Recreation & Arts',
    'tours': 'Recreation & Arts', 'travel_services': 'Recreation & Arts',

    # ------------------ HEALTH & MEDICAL ------------------
    'hospital': 'Health & Medical', 'health_and_medical': 'Health & Medical', 'dermatologist': 'Health & Medical',
    'cosmetic_dentist': 'Health & Medical', 'plastic_surgeon': 'Health & Medical', 'prenatal_perinatal_care': 'Health & Medical',
    'home_health_care': 'Health & Medical', 'neurologist': 'Health & Medical', 'orthopedist': 'Health & Medical',
    'nurse_practitioner': 'Health & Medical', 'fertility': 'Health & Medical', 'pharmacy': 'Health & Medical',
    'dental_clinic': 'Health & Medical', 'optometrist': 'Health & Medical', 'veterinarian': 'Health & Medical',
    'psychologist': 'Health & Medical', 'chiropractor': 'Health & Medical', 'physical_therapist': 'Health & Medical',
    'audiologist': 'Health & Medical', 'physical_therapy': 'Health & Medical',

    # ------------------ EDUCATION ------------------
    'college_university': 'Education', 'school': 'Education', 'education': 'Education', 'dance_school': 'Education',
    'cooking_school': 'Education', 'computer_coaching': 'Education', 'nursing_school': 'Education',
    'medical_school': 'Education', 'private_school': 'Education', 'educational_services': 'Education',
    'middle_school': 'Education', 'language_school': 'Education', 'driving_school': 'Education',
    'preschool': 'Education', 'high_school': 'Education', 'vocational_school': 'Education',

    # ------------------ ACCOMMODATION/RESIDENTIAL & REAL ESTATE ------------------
    'hotel': 'Accommodation/Residential', 'bed_and_breakfast': 'Accommodation/Residential', 'beach_resort': 'Accommodation/Residential',
    'homeless_shelter': 'Accommodation/Residential', 'apartment_building': 'Accommodation/Residential', 'hostel': 'Accommodation/Residential',
    'rv_park': 'Accommodation/Residential', 'real_estate_property': 'Accommodation/Residential', 'real_estate_agent': 'Accommodation/Residential',
    'real_estate': 'Accommodation/Residential', 'holiday_rental_home': 'Accommodation/Residential',

    # ------------------ ADMINISTRATIVE/PUBLIC & RELIGIOUS ------------------
    'community_services_non_profits': 'Administrative/Public', 'public_service_and_government': 'Administrative/Public',
    'charity_organization': 'Administrative/Public', 'community_center': 'Administrative/Public',
    'local_and_state_government_offices': 'Administrative/Public', 'law_enforcement': 'Administrative/Public',
    'disability_services_and_support_organization': 'Administrative/Public', 'food_banks': 'Administrative/Public',
    'fire_station': 'Administrative/Public', 'court': 'Administrative/Public', 'military_base': 'Administrative/Public',
    'post_office': 'Administrative/Public', 'embassy': 'Administrative/Public', 'prison_correctional_facility': 'Administrative/Public',
    'religious_organization': 'Administrative/Public', 'pentecostal_church': 'Administrative/Public',
    'labor_union': 'Administrative/Public',

    # ------------------ UTILITY/INFRASTRUCTURE ------------------
    'public_utility_company': 'Utility/Infrastructure',
    'water_sewer_and_utility_maintenance': 'Utility/Infrastructure',
    'telecommunications_company': 'Utility/Infrastructure',
    'electric_power_facility': 'Utility/Infrastructure',
    'internet_service_provider': 'Utility/Infrastructure',
    'home_security': 'Utility/Infrastructure',

    # ------------------ AUTOMOTIVE & TRANSPORT ------------------
    'airline': 'Automotive & Transport', 'airport': 'Automotive & Transport', 'car_dealer': 'Automotive & Transport',
    'motorcycle_dealer': 'Automotive & Transport', 'auto_glass_service': 'Automotive & Transport',
    'motorcycle_manufacturer': 'Automotive & Transport', 'auto_customization': 'Automotive & Transport',
    'taxi_service': 'Automotive & Transport', 'railroad_freight': 'Automotive & Transport',
    'bike_repair_maintenance': 'Automotive & Transport', 'car_repair_and_maintenance': 'Automotive & Transport',
    'parking_garage': 'Automotive & Transport', 'gas_station': 'Automotive & Transport',
    'car_wash': 'Automotive & Transport', 'car_rental': 'Automotive & Transport',
    'train_station': 'Automotive & Transport', 'parking': 'Automotive & Transport',

    # ------------------ FINANCIAL ------------------
    'bank_credit_union': 'Financial', 'investing': 'Financial', 'financial_service': 'Financial',
    'financial_services': 'Financial', 'mortgage_broker': 'Financial', 'insurance_agency': 'Financial',

    # ------------------ BEAUTY & PERSONAL CARE ------------------
    'beauty_salon': 'Beauty & Personal Care', 'barber': 'Beauty & Personal Care', 'beauty_and_spa': 'Beauty & Personal Care',
    'hair_salon': 'Beauty & Personal Care', 'massage_studio': 'Beauty & Personal Care', 'tattoo_parlor': 'Beauty & Personal Care',

    # ------------------ MEDIA & PUBLISHING ------------------
    'topic_publisher': 'Media/Publishing', 'mass_media': 'Media/Publishing', 'print_media': 'Media/Publishing',
    'media_agency': 'Media/Publishing', 'media_news_company': 'Media/Publishing', 'broadcasting_media_production': 'Media/Publishing',
    'media_critic': 'Media/Publishing', 'game_publisher': 'Media/Publishing',

    # ------------------ INDUSTRIAL/MANUFACTURING/TRADE ------------------
    'metal_supplier': 'Industrial/Trade', 'tobacco_company': 'Industrial/Trade', 'farm': 'Industrial/Trade',
    'windows_installation': 'Industrial/Trade', 'manufacturing_facility': 'Industrial/Trade', 'trade_school': 'Industrial/Trade',
    'wholesale_trade': 'Industrial/Trade', 'industrial_company': 'Industrial/Trade', 'industrial_equipment': 'Industrial/Trade',

    # ------------------ OTHER/LANDMARK/MISCELLANEOUS ------------------
    'landmark_and_historical_building': 'Other/Landmark', 'archaeological_services': 'Other/Landmark',
    'record_label': 'Other/Arts', 'sculpture_statue': 'Other/Landmark',
    'astrologer': 'Other/Personal', 'dog_trainer': 'Other/Personal',
    'funeral_services_and_cemeteries': 'Other/Services', 'sewing_and_alterations': 'Other/Services',
    'game_store': 'Other/Entertainment', 'computer_hardware_company': 'Other/Technology',
    'eyewear_and_optician': 'Other/Retail',
}


# Look each main_category up in CATEGORY_MAP. Values with no entry in the map
# come back as NaN from .map() and are labelled with the generic 'Other'.
mapped_category = gdf['main_category'].map(CATEGORY_MAP)
gdf['simplified_category'] = mapped_category.fillna('Other')

# Report how many points landed in each simplified category.
print("--- Counts of Simplified Categories ---")
simplified_counts = gdf['simplified_category'].value_counts()
print(simplified_counts)

# Show the original next to the simplified value for the first 20 rows.
print("\n--- Verification (Categories vs. Simplified) ---")
category_comparison = gdf[['main_category', 'simplified_category']]
print(category_comparison.head(20))

--- Counts of Simplified Categories ---
simplified_category
Other                        10849
Food & Drink                  4473
Commercial/Retail             4032
Recreation & Arts             2413
Professional/Services         1301
Administrative/Public          930
Accommodation/Residential      854
Education                      834
Beauty & Personal Care         801
Other/Landmark                 411
Health & Medical               409
Financial                      324
Media/Publishing               290
Other/Retail                   145
Automotive & Transport         145
Utility/Infrastructure          58
Other/Technology                36
Industrial/Trade                21
Other/Services                  18
Other/Arts                      12
Other/Personal                   2
Name: count, dtype: int64

--- Verification (Categories vs. Simplified) ---
             main_category        simplified_category
0                     None                      Other
1                    

In [29]:
# Parent category -> raw `main_category` values that belong to it.
# Written grouped for readability; flattened into CATEGORY_MAP just below.
_GROUPED_CATEGORIES = {
    "Food & Drink": [
        "fast_food_restaurant", "cafe", "bar", "donuts", "internet_cafe", "eat_and_drink",
    ],
    "Retail": [
        "shopping", "shopping_center", "supermarket", "clothing_store",
        "women's_clothing_store", "shoe_store", "computer_store",
        "cosmetic_and_beauty_supplies", "party_supply",
    ],
    "Education": [
        "education", "school", "elementary_school", "public_school",
        "college_university", "library",
    ],
    "Health": [
        "health_and_medical", "hospital", "pharmacy", "counseling_and_mental_health",
    ],
    "Public & Community Services": [
        "public_service_and_government", "central_government_office",
        "public_and_government_association", "community_services_non_profits",
        "charity_organization", "social_service_organizations",
        "senior_citizen_services", "animal_shelter", "non_governmental_association",
    ],
    "Professional Services": [
        "professional_services", "construction_services", "architectural_designer",
        "business_advertising", "legal_services", "videographer",
        "printing_services", "screen_printing_t_shirt_printing",
    ],
    "Recreation & Culture": [
        "arts_and_entertainment", "sports_club_and_league", "stadium_arena",
        "theatre", "topic_concert_venue", "park", "public_plaza",
    ],
    "Religion": [
        "religious_organization", "mosque", "catholic_church", "church_cathedral",
    ],
    "Transport": ["transportation", "airport"],
    "Accommodation": ["accommodation", "hotel", "hostel", "lodge"],
    "Finance": ["financial_service", "bank_credit_union"],
    "Real Estate": ["real_estate", "real_estate_agent"],
    "Agriculture & Trade": ["farm", "livestock_breeder", "meat_wholesaler"],
    "Landmark / Heritage": ["landmark_and_historical_building"],
    "Military": ["armed_forces_branch"],
    "Other": ["mass_media"],
}

# Flat lookup: raw main_category -> simplified parent category.
CATEGORY_MAP = {
    raw_category: parent
    for parent, members in _GROUPED_CATEGORIES.items()
    for raw_category in members
}




In [31]:


# Normalise the lookup key to stripped text, translate it through CATEGORY_MAP,
# and label everything without a map entry as "Other".
normalized_main = gdf["main_category"].astype(str).str.strip()
mapped_parent = normalized_main.map(CATEGORY_MAP)
gdf["simplified_category"] = mapped_parent.fillna("Other")

simplified_counts = gdf["simplified_category"].value_counts()
print(simplified_counts)


simplified_category
Other                          72
Public & Community Services    44
Education                      34
Accommodation                  24
Food & Drink                   12
Religion                       12
Professional Services          11
Recreation & Culture           11
Retail                         10
Landmark / Heritage             8
Health                          6
Agriculture & Trade             3
Real Estate                     2
Military                        2
Transport                       2
Finance                         2
Name: count, dtype: int64


In [16]:
import pandas as pd

# -------------------------------------------------------------------
# 0) Read the authoritative list of unique categories (66 values)
# -------------------------------------------------------------------
import os

# The CSV location can be overridden with the UNIQUE_CATEGORIES_CSV
# environment variable so the cell also runs on other machines; when the
# variable is unset (or empty) the original local path is used.
csv_path = os.environ.get("UNIQUE_CATEGORIES_CSV") or (
    r"C:\Users\edoar\OneDrive\Documenti\Work\Projects\GisWorkflows\Data_collection\unique_categories.csv"
)
unique_df = pd.read_csv(csv_path)

# The labels live in the CSV column "category": drop empty cells, cast to
# str and trim whitespace so they compare cleanly against CATEGORY_MAP keys.
expected = set(unique_df["category"].dropna().astype(str).str.strip())

# -------------------------------------------------------------------
# 1) EXPLICIT reclassification (covers ALL expected values)
#    - simplified_category  : small set of parents
#    - simplified_subcat    : more detail (esp. education/health)
# -------------------------------------------------------------------
# Raw category label -> (simplified_category, simplified_subcat).
# The entries are written grouped by parent category and flattened into a
# single lookup dict; groups and keys are listed in their original order.
CATEGORY_MAP = {
    raw_label: (parent, subcat)
    for parent, members in (
        # ------------------ EDUCATION ------------------
        ("Services", {
            "college_university": "Education - Tertiary",
            "school": "Education - School (General)",
            "elementary_school": "Education - Primary",
            "education": "Education - Other",
            "library": "Education - Library",
        }),
        # ------------------ HEALTH ------------------
        ("Services", {
            "hospital": "Health - Hospital",
            "pharmacy": "Health - Pharmacy",
            "counseling_and_mental_health": "Health - Mental Health",
            "health_and_medical": "Health - Other/General",
        }),
        # ------------------ FOOD & DRINK ------------------
        ("Food & Drink", {
            "fast_food_restaurant": "Fast Food",
            "cafe": "Cafe",
            "donuts": "Bakery / Sweets",
            "bar": "Bar",
        }),
        # ------------------ RETAIL ------------------
        ("Retail", {
            "shopping": "General Retail",
            "supermarket": "Supermarket / Grocery",
            "clothing_store": "Clothing",
            "women's_clothing_store": "Clothing",
            "shoe_store": "Footwear",
            "computer_store": "Electronics",
            "cosmetic_and_beauty_supplies": "Cosmetics / Beauty Supplies",
            "party_supply": "Party / Events Supplies",
        }),
        # ------------------ PERSONAL CARE ------------------
        ("Services", {
            "beauty_salon": "Personal Care - Beauty Salon",
        }),
        # ------------------ ACCOMMODATION ------------------
        ("Accommodation", {
            "accommodation": "Accommodation (General)",
            "hotel": "Hotel",
            "hostel": "Hostel",
            "lodge": "Lodge",
        }),
        # ------------------ TRANSPORT ------------------
        ("Transport", {
            "transportation": "Transport (General)",
            "airport": "Airport",
        }),
        # ------------------ GOVERNMENT / PUBLIC ------------------
        ("Public & Government", {
            "public_service_and_government": "Government (General)",
            "central_government_office": "Government Office",
            "armed_forces_branch": "Armed Forces",
            "public_and_government_association": "Public/Gov Association",
        }),
        # ------------------ RELIGION ------------------
        ("Religion", {
            "religious_organization": "Religious Organization",
            "mosque": "Mosque",
            "catholic_church": "Church (Catholic)",
            "church_cathedral": "Church / Cathedral",
        }),
        # ------------------ SOCIAL / NGO / COMMUNITY ------------------
        ("Services", {
            "community_services_non_profits": "Social Services / NGO",
            "charity_organization": "Social Services / NGO",
            "social_service_organizations": "Social Services / NGO",
            "non_governmental_association": "Social Services / NGO",
            "animal_shelter": "Animal Shelter",
            "senior_citizen_services": "Senior Citizen Services",
        }),
        # ------------------ PROFESSIONAL / BUSINESS SERVICES ------------------
        ("Services", {
            "professional_services": "Business Services (General)",
            "construction_services": "Construction Services",
            "architectural_designer": "Architecture / Design",
            "legal_services": "Legal Services",
            "business_advertising": "Advertising / Marketing",
            "videographer": "Media Production",
            "printing_services": "Printing Services",
            "screen_printing_t_shirt_printing": "Printing Services",
            "internet_cafe": "Internet / IT Service",
            "financial_service": "Financial Services",
            "bank_credit_union": "Bank / Credit Union",
            "real_estate_agent": "Real Estate Services",
            "real_estate": "Real Estate Services",
        }),
        # ------------------ RECREATION / CULTURE ------------------
        ("Recreation & Culture", {
            "arts_and_entertainment": "Arts & Entertainment (General)",
            "theatre": "Theatre",
            "topic_concert_venue": "Concert / Music Venue",
            "stadium_arena": "Stadium / Arena",
            "sports_club_and_league": "Sports Club / League",
            "park": "Park / Green Space",
            "public_plaza": "Public Plaza",
        }),
        # ------------------ LANDMARK ------------------
        ("Landmark", {
            "landmark_and_historical_building": "Historic / Landmark",
        }),
        # ------------------ AGRICULTURE / TRADE ------------------
        ("Agriculture", {
            "farm": "Farm",
            "livestock_breeder": "Livestock",
        }),
        ("Industry/Trade", {
            "meat_wholesaler": "Food Wholesale",
        }),
    )
    for raw_label, subcat in members.items()
}

# -------------------------------------------------------------------
# 2) HARD CHECK: ensure the dict covers every value in the CSV
# -------------------------------------------------------------------
# Compare the keys of CATEGORY_MAP with the labels read from the CSV, in
# both directions. Any mismatch aborts the cell with a ValueError.
mapped = set(CATEGORY_MAP)
missing_in_map = expected.difference(mapped)   # in the CSV, absent from the dict
extra_in_map = mapped.difference(expected)     # in the dict, absent from the CSV

# Missing keys are reported first; extra keys only when nothing is missing.
if len(missing_in_map) > 0:
    raise ValueError(f"CATEGORY_MAP is missing {len(missing_in_map)} categories: {sorted(missing_in_map)}")

if len(extra_in_map) > 0:
    raise ValueError(f"CATEGORY_MAP has {len(extra_in_map)} categories not present in CSV: {sorted(extra_in_map)}")

# Reaching this line means both sets are identical.
print(f"✅ CATEGORY_MAP covers all categories in CSV: {len(expected)} / {len(expected)}")

# -------------------------------------------------------------------
# 3) Apply to your dataset (GeoDataFrame) called gdf
# -------------------------------------------------------------------
# Normalise main_category: trim whitespace on real labels but leave missing
# values (None/NaN) missing. A blanket .astype(str) would rewrite them as the
# literal text "None"/"nan", and that text would then be stored back in gdf.
main_category_raw = gdf["main_category"]
gdf["main_category"] = main_category_raw.where(
    main_category_raw.isna(),                      # keep missing values untouched
    main_category_raw.astype(str).str.strip(),     # everything else: str + strip
)

# Create both columns. Labels without an entry in CATEGORY_MAP (missing values
# included) fall back to ("Other", "Other").
gdf["simplified_category"] = gdf["main_category"].map(lambda x: CATEGORY_MAP.get(x, ("Other", "Other"))[0])
gdf["simplified_subcat"]   = gdf["main_category"].map(lambda x: CATEGORY_MAP.get(x, ("Other", "Other"))[1])

# -------------------------------------------------------------------
# 4) Verification outputs
# -------------------------------------------------------------------
# Frequency table of the parent categories (missing values counted too).
parent_counts = gdf["simplified_category"].value_counts(dropna=False)
print("\n--- Counts of Simplified Categories ---")
print(parent_counts)

# Same table for the finer-grained sub-categories.
subcat_counts = gdf["simplified_subcat"].value_counts(dropna=False)
print("\n--- Counts of Simplified Sub-Categories ---")
print(subcat_counts)

# Rows whose parent category is "Other", i.e. main_category had no entry in
# CATEGORY_MAP. Print how many there are and, if any, up to 50 distinct
# main_category values responsible.
is_other = gdf["simplified_category"] == "Other"
other_rows = gdf[is_other]
print(f"\n--- Rows classified as 'Other' ---\n{len(other_rows)}")
if len(other_rows):
    print(other_rows[["main_category"]].drop_duplicates().head(50))


✅ CATEGORY_MAP covers all categories in CSV: 66 / 66

--- Counts of Simplified Categories ---
simplified_category
Other                   111
Services                 60
Public & Government      22
Accommodation            17
Religion                  9
Recreation & Culture      9
Retail                    8
Food & Drink              7
Landmark                  7
Transport                 2
Agriculture               2
Industry/Trade            1
Name: count, dtype: int64

--- Counts of Simplified Sub-Categories ---
simplified_subcat
Other                           111
Education - Tertiary             16
Social Services / NGO            10
Accommodation (General)           9
Public/Gov Association            8
                               ... 
Livestock                         1
General Retail                    1
Party / Events Supplies           1
Personal Care - Beauty Salon      1
Architecture / Design             1
Name: count, Length: 61, dtype: int64

--- Rows classified as 'Ot

In [32]:
# Name of the GeoJSON file to write (relative to the working directory).
output_geojson_file = "places_categorized4.geojson"

# Write gdf, including the simplified category columns, using the GeoJSON driver.
gdf.to_file(filename=output_geojson_file, driver="GeoJSON")