In [1]:
# Add this at the top of your notebook
%load_ext autoreload
%autoreload 2  
# Automatically reload all modules before executing code

from airpollutionpy import extraction as no2
import ee
import geemap
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
import sys
from datetime import datetime

In [2]:
no2.initialize_earth_engine()

Earth Engine already initialized


In [3]:
import requests 

def get_boundaries(attr, val):
    """
    This function queries World Bank official boundary dataset and returns 
    country boundary based on attribute and value passed to the function. 
    
    Input:
    attr: One of the attributes of database
    val : Value corresponding to the attribute
    
    Complete list of attributes can be found here: 
    https://services.arcgis.com/iQ1dY19aHwbSDYIF/ArcGIS/rest/services/World_Bank_Official_Boundaries_World_Country_Polygons_(Very_High_Definition)/FeatureServer/0
    
    Returns:
    A dataframe with features and geometry returned by the API. 
    """
    
    url = ''.join(['https://services.arcgis.com/iQ1dY19aHwbSDYIF/ArcGIS/rest/services/',
            'World_Bank_Official_Boundaries_World_Country_Polygons_(Very_High_Definition)/FeatureServer/0/query?',
            "where={a}='{b}'&f=pjson&returnGeometry=true&outFields=*&outSR=4326".format(a=attr, b=val)])

    result = requests.get(url)

    if result.status_code == 200:
        try:
            res = result.json()
            df = pd.DataFrame.from_dict(res['features'][0]).T[:-1]
            geom_lis = []
            for i in range(len(res['features'][0]['geometry']['rings'])):
                geom_lis.append(shapely.geometry.Polygon(res['features'][0]['geometry']['rings'][i]))
            geom = shapely.ops.cascaded_union(geom_lis)
            df.loc['attributes', 'rings'] = geom
            return df
        except Exception as e:
            return e.args[0]

In [56]:
# country_list = ['ETH', 'KEN']
# gdf = pd.DataFrame()
# for name in country_list:
    
#     df = get_boundaries('ISO_A3', name)
    
#     if isinstance(df, pd.DataFrame):
#         gdf = pd.concat([gdf, df])
#     elif isinstance(df, str):
#         country = san_fac.loc[name, 'Country']
#         df = get_boundaries('WB_NAME', country)
#         if isinstance(df, pd.DataFrame):
#             gdf = pd.concat([gdf, df])
#         else:
#             print(name)

In [7]:
eth_adm0 = gpd.read_file('../data/boundaries/ethiopia/eth_admbnda_adm0_csa_bofedb_itos_2021.shp')
eth_adm1 = gpd.read_file('../data/boundaries/ethiopia/eth_admbnda_adm1_csa_bofedb_2021.shp')
eth_adm2 = gpd.read_file('../data/boundaries/ethiopia/eth_admbnda_adm2_csa_bofedb_2021.shp')
eth_adm3 = gpd.read_file('../data/boundaries/ethiopia/eth_admbnda_adm3_csa_bofedb_2021.shp')

eth_adm3_path = '../data/boundaries/ethiopia/eth_admbnda_adm3_csa_bofedb_2021.shp'

In [21]:
start_date = '2020-01-01'
end_date = '2024-12-01'

In [12]:
aoi

In [22]:
# Correct setup
aoi = ee.FeatureCollection('projects/ee-datalab/assets/ETH/eth_admbnda_adm3_csa_bofedb_2021')  # Keep as FeatureCollection
admin_regions = ee.FeatureCollection('projects/ee-datalab/assets/ETH/eth_admbnda_adm3_csa_bofedb_2021')  # Keep as FeatureCollection, not geometry()

monthly_data = no2.process_no2_data(
    start_date=start_date,
    end_date=end_date,
    aoi=aoi,
    admin_regions=admin_regions,
    temporal_resolution='monthly',
    spatial_resolution='admin',
    collection_type="OFFL"
)

Processing NO2 data from 2020-01-01 to 2024-12-01
Temporal resolution: monthly
Spatial resolution: admin
Collection type: OFFL
Admin regions count: 1082
Number of available images in date range: 25301
Processing month: 2020-01-01 to 2020-02-01
Month 2020-01-01: Reduced regions result size: 1082
Processing month: 2020-02-01 to 2020-03-01
Month 2020-02-01: Reduced regions result size: 1082
Processing month: 2020-03-01 to 2020-04-01
Month 2020-03-01: Reduced regions result size: 1082
Processing month: 2020-04-01 to 2020-05-01
Month 2020-04-01: Reduced regions result size: 1082
Processing month: 2020-05-01 to 2020-06-01
Month 2020-05-01: Reduced regions result size: 1082
Processing month: 2020-06-01 to 2020-07-01
Month 2020-06-01: Reduced regions result size: 1082
Processing month: 2020-07-01 to 2020-08-01
Month 2020-07-01: Reduced regions result size: 1082
Processing month: 2020-08-01 to 2020-09-01
Month 2020-08-01: Reduced regions result size: 1082
Processing month: 2020-09-01 to 2020-10

In [None]:
adm3 = list(eth_adm3['ADM3_EN'].unique())

monthly_df = no2.export_no2_data(
    collection=monthly_data,
    description="Monthly_NO2_Adm3",
    output_file="eth_adm3_no2_monthly.csv",
    destination="datalab-air-pollution/ETH/ADM3",
    export_type="GCS",
    return_df=False,
    admin_codes=adm3,
    admin_code_field='ADM3_EN'
)

Using years: [2020, 2021, 2022, 2023, 2024]
Splitting export by 1082 admin regions and 5 years
Started export task for Tahtay Adiyabo in 2020
Started export task for Laelay Adiabo in 2020
Started export task for Zana in 2020
Started export task for Tahtay Koraro in 2020
Started export task for Asgede in 2020
Started export task for Tselemti in 2020
Started export task for Sheraro town in 2020
Started export task for Indasilassie town in 2020
Started export task for Selekleka in 2020
Started export task for Seyemti Adyabo in 2020
Started export task for Adi Daero in 2020
Started export task for Adi Hageray in 2020
Started export task for Tsimbla in 2020
Started export task for Endabaguna town in 2020
Started export task for Dima (TG) in 2020
Started export task for May Tsebri town in 2020
Started export task for Chila in 2020
Started export task for Aheferom in 2020
Started export task for Edaga arbi in 2020
Started export task for Adwa in 2020
Started export task for Laelay Maychew in 

In [35]:
df = pd.read_csv("../data/airpollution/eth_adm0_no2_monthly_2024.csv")

In [46]:
from airpollutionpy import downloadgcs
import os

bucket_name = "datalab-air-pollution"
prefix = 'ETH/ADM3/'
output_folder = os.path.abspath("../data/airpollution/ethiopia/ethiopia_adm3/raw/monthly/")

os.makedirs(output_folder, exist_ok=True)

downloadgcs.download_bucket_files(bucket_name, output_folder=output_folder, prefix=prefix)

Found 6515 files in bucket
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysis\data\airpollution\ethiopia\ethiopia_adm3\raw\monthly
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysis\data\airpollution\ethiopia\ethiopia_adm3\raw\monthly
Skipping ETH/ADM3//eth_adm3_no2_monthly_Abergele (TG)_2019.csv (already exists)
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysis\data\airpollution\ethiopia\ethiopia_adm3\raw\monthly
Skipping ETH/ADM3//eth_adm3_no2_monthly_Abi Adi town_2019.csv (already exists)
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysis\data\airpollution\ethiopia\ethiopia_adm3\raw\monthly
Skipping ETH/ADM3//eth_adm3_no2_monthly_Adigrat town_2019.csv (already exists)
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysis\data\airpollution\ethiopia\ethiopia_adm3\raw\monthly
Skipping ETH/ADM3//eth_adm3_no2_monthly_Adwa town_2019.csv (already exists)
c:\Users\wb588851\OneDrive - WBG\Documents\air-pollution-analysi

In [47]:
import glob
import pandas as pd

files = glob.glob('../data/airpollution/ethiopia/ethiopia_adm3/raw/monthly/*.csv')

final_df = pd.DataFrame()

for file in files:
    df = pd.read_csv(file)
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df.drop(columns=['system:index'], inplace=True)
    final_df = pd.concat([final_df, df], ignore_index=True)
    

In [48]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load your current data
# Replace 'your_file.csv' with your actual data file
df = final_df.copy()  


all_admin_regions = adm3

# Generate all monthly dates from Jan 2019 to Dec 2024
start_date = datetime(2019, 1, 1)
end_date = datetime(2024, 12, 1)
all_dates = []

current_date = start_date
while current_date <= end_date:
    all_dates.append(current_date.strftime('%Y-%m-%d'))
    # Move to next month
    if current_date.month == 12:
        current_date = datetime(current_date.year + 1, 1, 1)
    else:
        current_date = datetime(current_date.year, current_date.month + 1, 1)

# Create a complete DataFrame with all combinations of regions and dates
complete_df = pd.DataFrame([(region, date) for region in all_admin_regions for date in all_dates],
                          columns=['ADM3_EN', 'start_date'])

# Convert existing dataframe's date to string for comparison if needed
if df['start_date'].dtype != 'object':
    df['start_date'] = df['start_date'].astype(str)

# Merge to find missing entries
# First ensure both DataFrames have the same data types
df_existing = df[['ADM3_EN', 'start_date']].copy()
df_existing['exists'] = True

# Merge with complete set
merged_df = pd.merge(complete_df, df_existing, on=['ADM3_EN', 'start_date'], how='left')

# Find missing entries
missing_df = merged_df[merged_df['exists'].isna()]

# Show results
print(f"Total missing entries: {len(missing_df)}")
# print("\nSample of missing entries:")
# print(missing_df.head(10))

# # Save missing entries to a CSV file for further analysis
# missing_df.to_csv('missing_admin_date_entries.csv', index=False)

# # Optional: Count missing entries by admin region
missing_by_region = missing_df.groupby('ADM3_EN').size().reset_index(name='missing_count')
missing_by_region = missing_by_region.sort_values('missing_count', ascending=False)
print("\nMissing entries by region:")
print(missing_by_region.head(10))  # Show top 10 regions with most missing entries

# # Optional: Count missing entries by date
missing_by_date = missing_df.groupby('start_date').size().reset_index(name='missing_count')
missing_by_date = missing_by_date.sort_values('missing_count', ascending=False)
print("\nMissing entries by date:")
print(missing_by_date.head(10))  # Show top 10 dates with most missing entries

Total missing entries: 0

Missing entries by region:
Empty DataFrame
Columns: [ADM3_EN, missing_count]
Index: []

Missing entries by date:
Empty DataFrame
Columns: [start_date, missing_count]
Index: []


In [50]:
final_df.to_csv('../data/airpollution/ethiopia/ethiopia_adm3/processed/eth_adm3_no2_monthly_2019_2024.csv', index=False)

In [42]:
missing_admin_regions = list(missing_df['ADM3_EN'].unique())

In [44]:
# Correct setup
aoi = ee.FeatureCollection('projects/ee-datalab/assets/ETH/eth_admbnda_adm3_csa_bofedb_2021')  # Keep as FeatureCollection
admin_regions = ee.FeatureCollection('projects/ee-datalab/assets/ETH/eth_admbnda_adm3_csa_bofedb_2021')  # Keep as FeatureCollection, not geometry()

filtered_aoi = aoi.filter(ee.Filter.inList('ADM3_EN', missing_admin_regions))
filtered_admin_regions = admin_regions.filter(ee.Filter.inList('ADM3_EN', missing_admin_regions))

monthly_data = no2.process_no2_data(
    start_date='2023-01-01',
    end_date='2024-12-01',
    aoi=filtered_aoi,
    admin_regions=filtered_admin_regions,
    temporal_resolution='monthly',
    spatial_resolution='admin',
    collection_type="OFFL"
)

Processing NO2 data from 2023-01-01 to 2024-12-01
Temporal resolution: monthly
Spatial resolution: admin
Collection type: OFFL
Admin regions count: 897
Number of available images in date range: 9803
Processing month: 2023-01-01 to 2023-02-01
Month 2023-01-01: Reduced regions result size: 897
Processing month: 2023-02-01 to 2023-03-01
Month 2023-02-01: Reduced regions result size: 897
Processing month: 2023-03-01 to 2023-04-01
Month 2023-03-01: Reduced regions result size: 897
Processing month: 2023-04-01 to 2023-05-01
Month 2023-04-01: Reduced regions result size: 897
Processing month: 2023-05-01 to 2023-06-01
Month 2023-05-01: Reduced regions result size: 897
Processing month: 2023-06-01 to 2023-07-01
Month 2023-06-01: Reduced regions result size: 897
Processing month: 2023-07-01 to 2023-08-01
Month 2023-07-01: Reduced regions result size: 897
Processing month: 2023-08-01 to 2023-09-01
Month 2023-08-01: Reduced regions result size: 897
Processing month: 2023-09-01 to 2023-10-01
Month 

In [45]:

    
    # Fetch the missing data using your existing function
    # You might need to adapt this depending on how your function works
    # Option 1: Fetch all data for missing regions
missing_data_df = no2.export_no2_data(
    collection=monthly_data,
    description="Missing_Monthly_NO2_Adm3",
    output_file="eth_adm3_no2_monthly.csv",
    destination="datalab-air-pollution/ETH/ADM3",
    export_type="GCS",
    return_df=True,  # Changed to True to get the data back
    admin_codes=missing_admin_regions,
    admin_code_field='ADM3_EN'
)

    

Debug: Starting export to DataFrame
Debug: Error getting collection info: Collection query aborted after accumulating over 5000 elements.


Traceback (most recent call last):
  File "c:\WBG\Anaconda3\envs\acled\Lib\site-packages\ee\data.py", line 408, in _execute_cloud_call
    return call.execute(num_retries=num_retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\WBG\Anaconda3\envs\acled\Lib\site-packages\googleapiclient\_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\WBG\Anaconda3\envs\acled\Lib\site-packages\googleapiclient\http.py", line 938, in execute
    raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://earthengine.googleapis.com/v1/projects/869743898578/value:compute?prettyPrint=false&alt=json returned "Collection query aborted after accumulating over 5000 elements.". Details: "Collection query aborted after accumulating over 5000 elements.">

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File

Using years: [2023, 2024]
Splitting export by 897 admin regions and 2 years
Started export task for Zana in 2023
Started export task for Tahtay Koraro in 2023
Started export task for Asgede in 2023
Started export task for Tselemti in 2023
Started export task for Sheraro town in 2023
Started export task for Indasilassie town in 2023
Started export task for Selekleka in 2023
Started export task for Seyemti Adyabo in 2023
Started export task for Adi Daero in 2023
Started export task for Adi Hageray in 2023
Started export task for Tsimbla in 2023
Started export task for Aheferom in 2023
Started export task for Edaga arbi in 2023
Started export task for Adwa in 2023
Started export task for Laelay Maychew in 2023
Started export task for Tahtay Mayechew in 2023
Started export task for Adet in 2023
Started export task for Kola Temben in 2023
Started export task for Naeder in 2023
Started export task for Abergele (TG) in 2023
Started export task for Abi Adi town in 2023
Started export task for 

In [36]:
missing_by_region

Unnamed: 0,ADM3_EN,missing_count
198,Chire,24
155,Bole,24
55,Aleta Chuko,24
3,Abadir,24
56,Aleta Wendo,24
...,...,...
892,Zana,12
893,Zayi,12
894,Zequala,12
895,Zigem,12


In [35]:
missing_by_date['start_date'].max()

'2024-12-01'