In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import json
import random
import requests
import pandas as pd
from tqdm import tqdm
from functools import lru_cache

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [3]:
@lru_cache(maxsize=100000)
def convert_samplingpoint_to_city(lat, lon):
    """
    Convert sampling point data to city data.
    """
    # Load the sampling point data
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        "format": "jsonv2",
        "lat": lat,
        "lon": lon,
        "zoom": 10,  # Slightly more detailed than 10, still returns cities
        "addressdetails": 1
    }
    headers = {
        "User-Agent": "YourAppName/1.0 (drudao.2001@gmail.com)"
    }

    try:
        print(f"Fetching city for coordinates: ({lat}, {lon})")
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()
        address = data.get("address", {})

        # Return most appropriate field available
        return (
            address.get("city") or
            address.get("town") or
            address.get("village") or
            address.get("municipality") or
            address.get("county") or
            None
        )

    except requests.RequestException as e:
        print(f"Error: {e}")
        return None


In [4]:
def retry_with_backoff(func, *args, max_retries=5, **kwargs):
    for attempt in range(max_retries):
        result = func(*args, **kwargs)
        if result:
            return result
        wait_time = 2 ** attempt + random.uniform(0, 1)
        print(f"Retrying in {wait_time:.2f}s...")
        time.sleep(wait_time)
    return None

In [5]:
def add_city_to_data(file_path):
    """
    Adds a 'City' field to each sampling point in a JSON file using reverse geocoding.

    Args:
        file_path (str): The path to the input JSON file.

    Returns:
        str: The path to the new JSON file with added city information,
             or None if an error occurs.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {file_path}")
        return None

    geolocator = Nominatim(user_agent="city_finder_app")
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1, error_wait_seconds=10, max_retries=2)

    updated_data_count = 0
    failed_lookups = 0

    print("Starting to process the data for city information...")

    for year_data in tqdm(data.values()):
        for pollutant_data in year_data.values():
            for country_data_list in pollutant_data.values():
                for entry in country_data_list:
                    if 'Latitude' in entry and 'Longitude' in entry:
                        lat = entry['Latitude']
                        lon = entry['Longitude']
                        location = None
                        try:
                            # Ensure coordinates are valid numbers
                            if isinstance(lat, (int, float)) and isinstance(lon, (int, float)):
                                location = reverse((lat, lon), language='en', timeout=10) # Added timeout
                                if location and location.raw.get('address'):
                                    address = location.raw['address']
                                    # Prioritize city, then town, then village
                                    city = address.get('city', address.get('town', address.get('village')))
                                    if city:
                                        entry['City'] = city
                                        print(f"Found city: {city} for Lat: {lat}, Lon: {lon}")
                                        updated_data_count +=1
                                    else:
                                        print(f"City not found in address for Lat: {lat}, Lon: {lon}. Address: {address}")
                                        entry['City'] = 'N/A' # Not Available
                                        failed_lookups +=1
                                        print(f"City not found for Lat: {lat}, Lon: {lon}. Address: {address}")
                                else:
                                    print(f"Location not found for Lat: {lat}, Lon: {lon}")
                                    entry['City'] = 'N/A'
                                    failed_lookups +=1
                                    print(f"Location or address not found for Lat: {lat}, Lon: {lon}")
                            else:
                                print(f"Invalid coordinates for entry: {entry.get('Samplingpoint')}")
                                entry['City'] = 'Invalid Coords'
                                failed_lookups +=1
                                print(f"Invalid coordinates for entry: {entry.get('Samplingpoint')}")
                        except Exception as e:
                            entry['City'] = 'Error'
                            failed_lookups +=1
                            print(f"Error during geocoding for Lat: {lat}, Lon: {lon} - {e}")
                        # Adding a small delay to be respectful to the Nominatim API
                        time.sleep(0.5) # Sleep for 0.5 second between requests

    print(f"Successfully added city information to {updated_data_count} entries.")
    if failed_lookups > 0:
        print(f"Could not determine city for {failed_lookups} entries (marked as 'N/A', 'Invalid Coords', or 'Error').")

    output_file_path = "air_quality_data_with_cities.json"
    try:
        with open(output_file_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"Successfully generated the new JSON file: {output_file_path}")
        return output_file_path
    except IOError:
        print(f"Error: Could not write to {output_file_path}")
        return None

In [8]:
add_city_to_data("../data/air_quality_data.json")

Starting to process the data for city information...


  0%|          | 0/31 [00:00<?, ?it/s]

Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Graz for Lat: 47.04172, Lon: 15.43308


  3%|▎         | 1/31 [00:21<10:42, 21.42s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Kufstein for Lat: 47.58181, Lon: 12.17241
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Voitsberg for Lat: 47.04472, Lon: 15.15278
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88

  6%|▋         | 2/31 [01:03<16:14, 33.60s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Kufstein for Lat: 47.58181, Lon: 12.17241
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Voitsberg for Lat: 47.04472, Lon: 15.15278
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Oberhaag for Lat: 46.65195, Lon: 15.36778
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.8252

 10%|▉         | 3/31 [01:47<17:53, 38.33s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz f

 13%|█▎        | 4/31 [02:50<21:36, 48.03s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz f

 16%|█▌        | 5/31 [03:49<22:35, 52.12s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Linz for Lat: 48.27329, Lon: 14.31479
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.057

 19%|█▉        | 6/31 [04:31<20:10, 48.43s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Knittelfeld for Lat: 47.21037, Lon: 14.82528
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Braunau am Inn for Lat: 48.25747, Lon: 13.03923
Found city: Pöllauberg for Lat: 47.34806, Lon: 15.88222
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Braunau am Inn for Lat: 48.25747, L

 23%|██▎       | 7/31 [05:10<18:09, 45.38s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Kufstein for Lat: 47.58181, Lon: 12.17241
Found city: Vienna for Lat: 48.14128, Lon: 16.29259
Found city: Vienna for Lat: 48.18837, Lon: 16.30002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Lamm for Lat: 47.13378, Lon: 13.54318
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Oberhaag for Lat: 46.65195, Lon: 15.36778
Found city: Lienz f

 26%|██▌       | 8/31 [05:39<15:23, 40.16s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: St. Pölten for Lat: 48.20113, Lon: 15.62
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Kufstein for Lat: 47.58181, Lon: 12.17241
Found city: Vienna for Lat: 48.14128, Lon: 16.29259
Found city: Vienna for Lat: 48.18837, Lon: 16.30002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln 

 29%|██▉       | 9/31 [06:13<14:05, 38.43s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: St. Pölten for Lat: 48.20113, Lon: 15.62
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Kufstein for Lat: 47.58181, Lon: 12.17241
Found city: Vienna for Lat: 48.14128, Lon: 16.29259
Found city: Vienna for Lat: 48.18837, Lon: 16.30002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Lamm for Lat: 47.13378, Lon: 13.54318
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Oberhaag

 32%|███▏      | 10/31 [06:45<12:42, 36.33s/it]

Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Bruck an der Mur for Lat: 47.409440000000004, Lon: 15.25333
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: St. Pölten for Lat: 48.20113, Lon: 15.62
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Graz for Lat: 47.04172, Lon: 15.43308
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Foun

 35%|███▌      | 11/31 [07:22<12:11, 36.56s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Bruck an der Mur for Lat: 47.409440000000004, Lon: 15.25333
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Enns for Lat: 48.20944, Lon: 14.43694
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Lamm for Lat: 47.13378, Lon: 13.54318
Found city: Liezen fo

 39%|███▊      | 12/31 [08:04<12:06, 38.24s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Bruck an der Mur for Lat: 47.409440000000004, Lon: 15.25333
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Steyregg for Lat: 48.27975, Lon: 14.3665
Found city: Enns for Lat: 48.20944, Lon: 14.43694
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Tulln an der Donau for Lat: 48.33004, Lon: 16.05726
Found city: Lamm for Lat: 47.13378, Lon: 13.54318
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: O

 42%|████▏     | 13/31 [08:42<11:26, 38.16s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002


 45%|████▌     | 14/31 [08:49<08:09, 28.78s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Liezen for Lat: 47.56544, Lon: 14.24468
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Enns for Lat: 48.20944, Lon: 14.43694
Found city: Weiz for Lat: 47.2157, Lon: 15.62839
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: Klagenfurt for Lat: 46.62662, Lon: 14.29914
Found city: Klosterneuburg for Lat: 48.30194, Lon: 16.32111
Found city: Lamm for Lat: 47.13378, Lon: 13.54318
Found city: Hartberg for Lat: 47.28286, Lon: 15.97185
Found city: Lienz for Lat: 46.81911, Lon: 12.76603
Found city: les Escaldes for Lat: 42.50969

 48%|████▊     | 15/31 [09:20<07:50, 29.42s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: Elciego for Lat: 42.51833, Lon: -2.61944
Found city: Aiarako kuadrilla/Cuadrilla de Ayala for Lat: 43.14407, Lon: -2.9633700000000003
Found city: Agurain/Salvatierra for Lat: 42.849, Lon: -2.3937
City not found in address for Lat: 42.8752, Lon: -3.2317. Address: {'house_number': '28', 'road': 'Errege kalea/Calle Real', 'hamlet': 'Lalastra', 'city_district': 'Valderejo', 'municipality': 'Valdegovía/Gaubea', 'county': 'Añanako kuadrilla/Cuadrilla de Añana', 'province': 'Álava', 'ISO3166-2-lvl6': 'ES-VI', 'state': 'Autonomous Community of the Basque Country', 'ISO3166-2-lvl4': 'ES-PV', 'postcode': '01427', 'country': 'Spain', 'country_code': 'es'}
City not found for Lat: 42.8752, Lon: -3.2317. Address: {'house_number': '2

 52%|█████▏    | 16/31 [15:05<31:05, 124.34s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: Encamp for Lat: 42.53488, Lon: 1.71699
Found city: Elciego for Lat: 42.51833, Lon: -2.61944
Found city: Aiarako kuadrilla/Cuadrilla de Ayala for Lat: 43.14407, Lon: -2.9633700000000003
Found city: Agurain/Salvatierra for Lat: 42.849, Lon: -2.3937
City not found in address for Lat: 42.8752, Lon: -3.2317. Address: {'house_number': '28', 'road': 'Errege kalea/Calle Real', 'hamlet': 'Lalastra', 'city_district': 'Valderejo', 'municipality': 'Valdegovía/Gaubea', 'county': 'Añanako kuadrilla/Cuadrilla de Añana', 'province': 'Álava', 'ISO3166-2-lvl6': 'ES-VI', 'state': 'Autonomous Community of the Basque Country', 'ISO3166-2-lvl4': 'ES-PV', 'postcode': '01427', 'country': 'Spain', 'country_code': 'es'}
City not found for Lat: 

 55%|█████▍    | 17/31 [22:05<49:45, 213.26s/it]

Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Engolasters for Lat: 42.51694, Lon: 1.56525
Found city: Encamp for Lat: 42.53488, Lon: 1.71699
Found city: Hvanneyri for Lat: 64.56195, Lon: -21.76417
Found city: Elciego for Lat: 42.51833, Lon: -2.61944
Found city: Aiarako kuadrilla/Cuadrilla de Ayala for Lat: 43.14407, Lon: -2.9633700000000003
Found city: Agurain/Salvatierra for Lat: 42.849, Lon: -2.3937
City not found in address for Lat: 42.8752, Lon: -3.2317. Address: {'house_number': '28', 'road': 'Errege kalea/Calle Real', 'hamlet': 'Lalastra', 'city_district': 'Valderejo', 'municipality': 'Valdegovía/Gaubea', 'county': 'Añanako kuadrilla/Cuadrilla de Añana', 'province': 'Álava', 'ISO3166-2-lvl6': 'ES-VI', 'state': 'Autonomous Community of the Basque Country', 'ISO3166-2-lvl4': 'ES-PV', 'postcode': '01427', 'country':

 58%|█████▊    | 18/31 [28:45<58:22, 269.39s/it]

Found city: Grad Sisak for Lat: 45.45813, Lon: 16.38836
Found city: City of Zagreb for Lat: 45.80034, Lon: 15.97407
Found city: les Escaldes for Lat: 42.50969, Lon: 1.5391400000000002
Found city: Lahti for Lat: 60.98382, Lon: 25.65531
Found city: Imatra for Lat: 61.23807, Lon: 28.87369
Found city: Harjavalta for Lat: 61.33164, Lon: 22.14306
Found city: Harjavalta for Lat: 61.3135, Lon: 22.135
Found city: Turku for Lat: 60.45195, Lon: 22.26768
Found city: Helsinki for Lat: 60.19367, Lon: 24.96395
Found city: Lappeenranta for Lat: 61.04082, Lon: 28.1765
Found city: Oulu for Lat: 65.04338, Lon: 25.4979
Found city: Kuopio for Lat: 62.8936, Lon: 27.66952
Found city: Vantaa for Lat: 60.28995, Lon: 25.03953
Found city: Lappeenranta for Lat: 61.07194, Lon: 28.26409
Found city: Valkeakoski for Lat: 61.27156, Lon: 24.02849
Found city: Jakobstad for Lat: 63.67912, Lon: 22.71837
Found city: Kokkola for Lat: 63.83764, Lon: 23.1316
Found city: Imatra for Lat: 61.18892, Lon: 28.77012
Found city: Hels

 61%|██████▏   | 19/31 [48:01<1:47:09, 535.79s/it]

Found city: Cardiff for Lat: 51.48178, Lon: -3.17625
Found city: Belfast for Lat: 54.59965, Lon: -5.92883
Found city: Newcastle upon Tyne for Lat: 54.97825, Lon: -1.61053
Found city: Leeds for Lat: 53.80378, Lon: -1.54647
Found city: Leicester for Lat: 52.63135, Lon: -1.13301
Found city: Southampton for Lat: 50.90814, Lon: -1.39578
Found city: Middlesbrough for Lat: 54.5693, Lon: -1.2208700000000001
Found city: Manchester for Lat: 53.48152, Lon: -2.23788
Found city: Sheffield for Lat: 53.37772, Lon: -1.4733100000000001
Found city: London for Lat: 51.52105, Lon: -0.21349
Found city: London for Lat: 51.45258, Lon: 0.07077
Found city: Exeter for Lat: 50.72508, Lon: -3.53246
Found city: London for Lat: 51.49633, Lon: -0.46086000000000005
Found city: Royal Leamington Spa for Lat: 52.28881, Lon: -1.53312
Found city: Nottingham for Lat: 52.95473, Lon: -1.14645
Found city: Grays for Lat: 51.47707, Lon: 0.31797000000000003
Found city: East Suffolk for Lat: 52.2944, Lon: 1.4635
Found city: Manch

 61%|██████▏   | 19/31 [56:47<35:51, 179.32s/it]  


KeyboardInterrupt: 

In [10]:
def drop_invalid_entries(path):
    """
    Removes entries from the dataset where the 'City' field is marked as 'N/A', 'Invalid Coords', or 'Error'.
    
    Args:
        path (str): The path to the JSON file containing the dataset.
    
    Returns:
        dict: The cleaned dataset.
    """
    
    try:
        with open(path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {path}")
        return None
    
    tot_removed = 0
    
    for year in data:
        for pollutant in data[year]:
            for country in data[year][pollutant]:
                original_len = len(data[year][pollutant][country])
                data[year][pollutant][country] = [
                    entry for entry in data[year][pollutant][country]
                    if entry.get("City") not in {"N/A", "Invalid Coords", "Error"}
                ]
                removed = original_len - len(data[year][pollutant][country])
                if removed > 0:
                    print(f"Removed {removed} invalid entries from {country}, {pollutant}, {year}")
                tot_removed += removed
    print(f"Total removed invalid entries: {tot_removed}")
    return data

In [11]:
new_data = drop_invalid_entries("../data/air_quality_data_with_cities.json")

with open("../data/cleaned_air_quality_data.json", 'w') as f:
    json.dump(new_data, f, indent=2)
print("Cleaned data saved to cleaned_air_quality_data.json")

Removed 4 invalid entries from Spain, O3, 2009
Removed 4 invalid entries from Spain, O3, 2010
Removed 4 invalid entries from Spain, O3, 2011
Removed 1 invalid entries from Sweden, PM10, 2012
Removed 1 invalid entries from Iceland, SO2, 2012
Removed 1 invalid entries from Sweden, SO2, 2012
Removed 2 invalid entries from Norway, SO2, 2012
Removed 4 invalid entries from Greece, O3, 2012
Removed 4 invalid entries from Spain, O3, 2012
Removed 5 invalid entries from Greece, NO2, 2012
Removed 1 invalid entries from Sweden, NO2, 2012
Removed 1 invalid entries from Greece, CO, 2012
Removed 5 invalid entries from Greece, NO, 2012
Removed 6 invalid entries from United Kingdom, O3, 2013
Removed 5 invalid entries from Greece, O3, 2013
Removed 9 invalid entries from Sweden, O3, 2013
Removed 1 invalid entries from Austria, O3, 2013
Removed 4 invalid entries from Germany, O3, 2013
Removed 6 invalid entries from Ireland, O3, 2013
Removed 1 invalid entries from Italy, O3, 2013
Removed 2 invalid entries 