In [5]:
import requests
import pandas as pd
from datetime import datetime

def get_sao_paulo_healthcare_facilities(
    bbox="-23.7036,-46.8256,-23.3567,-46.3650",
    output_csv='sao_paulo_healthcare_facilities.csv',
):
    """Download hospitals inside a bounding box from Overpass and clean them.

    The query selects nodes, ways and relations tagged ``amenity=hospital``
    or ``healthcare=hospital``. The result is de-duplicated on
    (name, latitude, longitude), rows without coordinates or without a name
    are dropped, and facilities whose name or operator mentions a military
    keyword are removed. Progress and summary statistics are printed.

    Args:
        bbox: Overpass bounding box as "south,west,north,east". Defaults to
            the box the notebook has always used.
        output_csv: Path the cleaned table is written to. Pass ``None`` to
            skip writing a file.

    Returns:
        pandas.DataFrame with one row per remaining facility. The columns
        are always present, even when the query returns no elements.

    Raises:
        requests.HTTPError: if Overpass answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
    """
    import unicodedata  # local import: only the accent folding below needs it

    overpass_url = "https://overpass-api.de/api/interpreter"

    overpass_query = f"""
    [out:json][timeout:180];
    (
      node["amenity"="hospital"]({bbox});
      way["amenity"="hospital"]({bbox});
      relation["amenity"="hospital"]({bbox});

      node["healthcare"="hospital"]({bbox});
      way["healthcare"="hospital"]({bbox});
      relation["healthcare"="hospital"]({bbox});
    );
    out center;
    """

    print("Querying Overpass API for Sao Paulo healthcare facilities...")
    print(f"Timestamp: {datetime.now()}")

    response = requests.post(overpass_url, data={'data': overpass_query}, timeout=200)
    print(f"API Response received. Status code: {response.status_code}")
    # Fail on HTTP errors before trying to decode the body as JSON; otherwise
    # an error page shows up as an unrelated JSON decoding failure.
    response.raise_for_status()
    data = response.json()

    columns = [
        'osm_id', 'osm_type', 'name', 'amenity', 'healthcare', 'operator',
        'operator_type', 'emergency', 'beds', 'addr_street', 'phone',
        'latitude', 'longitude',
    ]

    facilities = []
    for element in data.get('elements', []):
        tags = element.get('tags', {})
        # Nodes carry lat/lon themselves; for ways and relations the query's
        # `out center` puts the coordinates under the 'center' key.
        if element.get('type') == 'node':
            position = element
        else:
            position = element.get('center', {})

        facilities.append({
            'osm_id': element.get('id'),
            'osm_type': element.get('type'),
            'name': tags.get('name'),
            'amenity': tags.get('amenity'),
            'healthcare': tags.get('healthcare'),
            'operator': tags.get('operator'),
            'operator_type': tags.get('operator:type'),
            'emergency': tags.get('emergency'),
            'beds': tags.get('beds'),
            'addr_street': tags.get('addr:street'),
            'phone': tags.get('phone'),
            'latitude': position.get('lat'),
            'longitude': position.get('lon'),
        })

    # Passing `columns` keeps the schema stable when nothing was returned, so
    # the subset-based cleaning steps below do not raise KeyError.
    df = pd.DataFrame(facilities, columns=columns)

    initial_count = len(df)
    print(f"\nInitial healthcare facilities collected: {initial_count}")

    # remove duplicates
    df = df.drop_duplicates(subset=['name', 'latitude', 'longitude'])
    print(f"After removing duplicates: {len(df)}")

    # remove entries without coordinates
    df = df.dropna(subset=['latitude', 'longitude'])
    print(f"After removing entries without coordinates: {len(df)}")

    # remove entries without names
    df = df.dropna(subset=['name'])
    print(f"After removing unnamed facilities: {len(df)}")

    # remove military facilities
    military_keywords = ['militar', 'military', 'forca aerea', 'exercito', 'marinha', 'forcas armadas']

    def fold(text):
        """Lower-case `text` and strip diacritics; missing values become ''."""
        if pd.isna(text):
            return ''
        decomposed = unicodedata.normalize('NFKD', str(text))
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    # The keywords are unaccented, so the searched text must be folded the
    # same way or "Força Aérea" / "Exército" would never match.
    folded_keywords = [fold(keyword) for keyword in military_keywords]

    def mentions_military(text):
        folded = fold(text)
        return any(keyword in folded for keyword in folded_keywords)

    military_mask = (
        df['name'].map(mentions_military).astype(bool)
        | df['operator'].map(mentions_military).astype(bool)
    )
    military_count = int(military_mask.sum())

    if military_count > 0:
        print(f"\nMilitary facilities being removed: {military_count}")
        print(df.loc[military_mask, ['name', 'operator']].to_string(index=False))

    df = df[~military_mask].copy()
    print(f"After removing military facilities: {len(df)}")

    print(f"\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    print(f"Total civilian healthcare facilities with names: {len(df)}")
    print(f"With emergency tag: {df['emergency'].notna().sum()}")
    print(f"With bed count: {df['beds'].notna().sum()}")
    print(f"With operator info: {df['operator'].notna().sum()}")

    print("\nFirst 15 facilities:")
    print(df[['name', 'latitude', 'longitude', 'operator']].head(15).to_string())

    print("\nCoordinate ranges:")
    if df.empty:
        print("No facilities left after filtering.")
    else:
        print(f"Latitude: {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f"Longitude: {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
        print(f"\nData saved to {output_csv}")

    return df

# Run the fetch-and-clean pipeline defined above and bind the cleaned
# DataFrame it returns to `facilities_df`.
facilities_df = get_sao_paulo_healthcare_facilities()

Querying Overpass API for Sao Paulo healthcare facilities...
Timestamp: 2025-11-22 02:26:31.711175
API Response received. Status code: 200

Initial healthcare facilities collected: 369
After removing duplicates: 369
After removing entries without coordinates: 369
After removing unnamed facilities: 330

Military facilities being removed: 3
                                 name                               operator
Hospital Militar de Área de São Paulo                    Exército Brasileiro
             Hospital Policia Militar Polícia Militar do Estado de São Paulo
                  Centro Médico PMESP Polícia Militar do Estado de São Paulo
After removing military facilities: 327

FINAL RESULTS
Total civilian healthcare facilities with names: 327
With emergency tag: 153
With bed count: 1
With operator info: 92

First 15 facilities:
                                          name   latitude  longitude operator
0                           Hospital São Paulo -23.597777 -46.643533     None


In [7]:
import requests
import pandas as pd
from datetime import datetime

def get_sao_paulo_schools(
    bbox="-23.7036,-46.8256,-23.3567,-46.3650",
    output_csv='sao_paulo_schools_k12.csv',
):
    """Download ``amenity=school`` features inside a bounding box and clean them.

    Only ``amenity=school`` is queried (not colleges or universities). The
    result is de-duplicated on (name, latitude, longitude), rows without
    coordinates or without a name are dropped, and schools whose name or
    operator mentions a military keyword are removed. Progress and summary
    statistics are printed.

    Args:
        bbox: Overpass bounding box as "south,west,north,east". Defaults to
            the box the notebook has always used.
        output_csv: Path the cleaned table is written to. Pass ``None`` to
            skip writing a file.

    Returns:
        pandas.DataFrame with one row per remaining school. The columns are
        always present, even when the query returns no elements.

    Raises:
        requests.HTTPError: if Overpass answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
    """
    import unicodedata  # local import: only the accent folding below needs it

    overpass_url = "https://overpass-api.de/api/interpreter"

    # only query for schools, not colleges or universities
    overpass_query = f"""
    [out:json][timeout:180];
    (
      node["amenity"="school"]({bbox});
      way["amenity"="school"]({bbox});
      relation["amenity"="school"]({bbox});
    );
    out center;
    """

    print("Querying Overpass API for Sao Paulo schools (K-12 only)...")
    print(f"Timestamp: {datetime.now()}")

    response = requests.post(overpass_url, data={'data': overpass_query}, timeout=200)
    print(f"API Response received. Status code: {response.status_code}")
    # Fail on HTTP errors before trying to decode the body as JSON; otherwise
    # an error page shows up as an unrelated JSON decoding failure.
    response.raise_for_status()
    data = response.json()

    columns = [
        'osm_id', 'osm_type', 'name', 'amenity', 'operator', 'operator_type',
        'addr_street', 'phone', 'website', 'latitude', 'longitude',
    ]

    schools = []
    for element in data.get('elements', []):
        tags = element.get('tags', {})
        # Nodes carry lat/lon themselves; for ways and relations the query's
        # `out center` puts the coordinates under the 'center' key.
        if element.get('type') == 'node':
            position = element
        else:
            position = element.get('center', {})

        schools.append({
            'osm_id': element.get('id'),
            'osm_type': element.get('type'),
            'name': tags.get('name'),
            'amenity': tags.get('amenity'),
            'operator': tags.get('operator'),
            'operator_type': tags.get('operator:type'),
            'addr_street': tags.get('addr:street'),
            'phone': tags.get('phone'),
            'website': tags.get('website'),
            'latitude': position.get('lat'),
            'longitude': position.get('lon'),
        })

    # Passing `columns` keeps the schema stable when nothing was returned, so
    # the subset-based cleaning steps below do not raise KeyError.
    df = pd.DataFrame(schools, columns=columns)

    initial_count = len(df)
    print(f"\nInitial schools collected: {initial_count}")

    # remove duplicates
    df = df.drop_duplicates(subset=['name', 'latitude', 'longitude'])
    print(f"After removing duplicates: {len(df)}")

    # remove entries without coordinates
    df = df.dropna(subset=['latitude', 'longitude'])
    print(f"After removing entries without coordinates: {len(df)}")

    # remove entries without names
    df = df.dropna(subset=['name'])
    print(f"After removing unnamed schools: {len(df)}")

    # remove military schools
    military_keywords = ['militar', 'military', 'forca aerea', 'exercito', 'marinha', 'forcas armadas']

    def fold(text):
        """Lower-case `text` and strip diacritics; missing values become ''."""
        if pd.isna(text):
            return ''
        decomposed = unicodedata.normalize('NFKD', str(text))
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()

    # The keywords are unaccented, so the searched text must be folded the
    # same way or "Força Aérea" / "Exército" would never match.
    folded_keywords = [fold(keyword) for keyword in military_keywords]

    def mentions_military(text):
        folded = fold(text)
        return any(keyword in folded for keyword in folded_keywords)

    military_mask = (
        df['name'].map(mentions_military).astype(bool)
        | df['operator'].map(mentions_military).astype(bool)
    )
    military_count = int(military_mask.sum())

    if military_count > 0:
        print(f"\nMilitary schools being removed: {military_count}")
        print(df.loc[military_mask, ['name', 'operator']].to_string(index=False))

    df = df[~military_mask].copy()
    print(f"After removing military schools: {len(df)}")

    print(f"\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    print(f"Total K-12 schools with names: {len(df)}")
    print(f"With operator info: {df['operator'].notna().sum()}")
    print(f"With phone: {df['phone'].notna().sum()}")
    print(f"With website: {df['website'].notna().sum()}")

    print("\nFirst 15 schools:")
    print(df[['name', 'latitude', 'longitude', 'operator']].head(15).to_string())

    print("\nCoordinate ranges:")
    if df.empty:
        print("No schools left after filtering.")
    else:
        print(f"Latitude: {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f"Longitude: {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
        print(f"\nData saved to {output_csv}")

    return df

# Run the fetch-and-clean pipeline defined above and bind the cleaned
# DataFrame it returns to `schools_df`.
schools_df = get_sao_paulo_schools()

Querying Overpass API for Sao Paulo schools (K-12 only)...
Timestamp: 2025-11-22 02:33:57.975113
API Response received. Status code: 200

Initial schools collected: 2181
After removing duplicates: 2180
After removing entries without coordinates: 2180
After removing unnamed schools: 2006

Military schools being removed: 2
                                  name                                          operator
Escola de Corpo de Bombeiros Planejado Corpo de Bombeiros Militar do Estado de São Paulo
                          Colégio PMSP                         Policia Militar São Paulo
After removing military schools: 2004

FINAL RESULTS
Total K-12 schools with names: 2004
With operator info: 804
With phone: 949
With website: 210

First 15 schools:
                             name   latitude  longitude operator
1   Colégio Presbiteriano do Brás -23.541214 -46.599650     None
2                     Coreo Dança -23.635853 -46.698141     None
3              Anglo Vestibulares -23.563672 -46.

In [8]:
import requests
import pandas as pd
from datetime import datetime

def get_sao_paulo_groceries(
    bbox="-23.7036,-46.8256,-23.3567,-46.3650",
    output_csv='sao_paulo_groceries.csv',
):
    """Download grocery-type shops inside a bounding box and clean them.

    The query selects nodes, ways and relations tagged ``shop=supermarket``,
    ``shop=convenience``, ``shop=greengrocer`` or ``shop=marketplace``. The
    result is de-duplicated on (name, latitude, longitude) and rows without
    coordinates or without a name are dropped. Progress and summary
    statistics are printed.

    Args:
        bbox: Overpass bounding box as "south,west,north,east". Defaults to
            the box the notebook has always used.
        output_csv: Path the cleaned table is written to. Pass ``None`` to
            skip writing a file.

    Returns:
        pandas.DataFrame with one row per remaining store. The columns are
        always present, even when the query returns no elements.

    Raises:
        requests.HTTPError: if Overpass answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
    """
    overpass_url = "https://overpass-api.de/api/interpreter"

    # query for supermarkets, convenience stores, and grocery shops
    # NOTE(review): marketplaces are normally tagged amenity=marketplace in
    # OSM; shop=marketplace probably under-counts them - confirm before
    # relying on the marketplace numbers.
    overpass_query = f"""
    [out:json][timeout:180];
    (
      node["shop"="supermarket"]({bbox});
      way["shop"="supermarket"]({bbox});
      relation["shop"="supermarket"]({bbox});

      node["shop"="convenience"]({bbox});
      way["shop"="convenience"]({bbox});
      relation["shop"="convenience"]({bbox});

      node["shop"="greengrocer"]({bbox});
      way["shop"="greengrocer"]({bbox});
      relation["shop"="greengrocer"]({bbox});

      node["shop"="marketplace"]({bbox});
      way["shop"="marketplace"]({bbox});
      relation["shop"="marketplace"]({bbox});
    );
    out center;
    """

    print("Querying Overpass API for Sao Paulo grocery stores...")
    print(f"Timestamp: {datetime.now()}")

    response = requests.post(overpass_url, data={'data': overpass_query}, timeout=200)
    print(f"API Response received. Status code: {response.status_code}")
    # Fail on HTTP errors before trying to decode the body as JSON; otherwise
    # an error page shows up as an unrelated JSON decoding failure.
    response.raise_for_status()
    data = response.json()

    columns = [
        'osm_id', 'osm_type', 'name', 'shop', 'operator', 'brand',
        'addr_street', 'phone', 'opening_hours', 'latitude', 'longitude',
    ]

    groceries = []
    for element in data.get('elements', []):
        tags = element.get('tags', {})
        # Nodes carry lat/lon themselves; for ways and relations the query's
        # `out center` puts the coordinates under the 'center' key.
        if element.get('type') == 'node':
            position = element
        else:
            position = element.get('center', {})

        groceries.append({
            'osm_id': element.get('id'),
            'osm_type': element.get('type'),
            'name': tags.get('name'),
            'shop': tags.get('shop'),
            'operator': tags.get('operator'),
            'brand': tags.get('brand'),
            'addr_street': tags.get('addr:street'),
            'phone': tags.get('phone'),
            'opening_hours': tags.get('opening_hours'),
            'latitude': position.get('lat'),
            'longitude': position.get('lon'),
        })

    # Passing `columns` keeps the schema stable when nothing was returned, so
    # the subset-based cleaning steps below do not raise KeyError.
    df = pd.DataFrame(groceries, columns=columns)

    initial_count = len(df)
    print(f"\nInitial grocery stores collected: {initial_count}")

    # remove duplicates
    df = df.drop_duplicates(subset=['name', 'latitude', 'longitude'])
    print(f"After removing duplicates: {len(df)}")

    # remove entries without coordinates
    df = df.dropna(subset=['latitude', 'longitude'])
    print(f"After removing entries without coordinates: {len(df)}")

    # remove entries without names
    df = df.dropna(subset=['name'])
    print(f"After removing unnamed stores: {len(df)}")

    print(f"\n{'='*60}")
    print("FINAL RESULTS")
    print(f"{'='*60}")
    print(f"Total grocery stores with names: {len(df)}")
    print(f"With brand info: {df['brand'].notna().sum()}")
    print(f"With operator info: {df['operator'].notna().sum()}")
    print(f"With phone: {df['phone'].notna().sum()}")
    print(f"With opening hours: {df['opening_hours'].notna().sum()}")

    # breakdown by type
    print("\nBreakdown by type:")
    print(df['shop'].value_counts())

    print("\nFirst 15 stores:")
    print(df[['name', 'shop', 'brand', 'latitude', 'longitude']].head(15).to_string())

    print("\nCoordinate ranges:")
    if df.empty:
        print("No stores left after filtering.")
    else:
        print(f"Latitude: {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f"Longitude: {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
        print(f"\nData saved to {output_csv}")

    return df

# Run the fetch-and-clean pipeline defined above and bind the cleaned
# DataFrame it returns to `groceries_df`.
groceries_df = get_sao_paulo_groceries()

Querying Overpass API for Sao Paulo grocery stores...
Timestamp: 2025-11-22 02:39:43.042272
API Response received. Status code: 200

Initial grocery stores collected: 2093
After removing duplicates: 2092
After removing entries without coordinates: 2092
After removing unnamed stores: 1839

FINAL RESULTS
Total grocery stores with names: 1839
With brand info: 674
With operator info: 108
With phone: 224
With opening hours: 418

Breakdown by type:
shop
supermarket    1060
convenience     569
greengrocer     209
marketplace       1
Name: count, dtype: int64

First 15 stores:
                    name         shop              brand   latitude  longitude
0   Hipermercado Zaffari  supermarket               None -23.527030 -46.680669
1     Rycoy Supermercado  supermarket               None -23.652532 -46.733733
2     Sonda Hipermercado  supermarket               None -23.525261 -46.546788
4      Carrefour Express  convenience  Carrefour Express -23.553053 -46.634936
5             Mercadinho  sup