# Appendix - Code Assisting Data Preparation

## README

TBC-------------

## 0 Initial Run

Run the following cell every time you start a new kernel to configure related parameters.

In [None]:
# Notebook-wide path configuration
from pathlib import Path
import sys

# All locations are derived from the kernel's working directory.
CURR_PATH = Path().resolve()                 # absolute path of the current working directory
REPO_PATH = CURR_PATH.parent                 # one level above the working directory (the repository)
DATA_PATH = REPO_PATH.joinpath("data")       # where the data is saved
DEMO_PATH = DATA_PATH.joinpath("demo-data")  # data used for demo purposes
SRC_PATH = REPO_PATH.joinpath("src")         # other sources (custom functions)

# Expose the src folder on the import path so custom functions can be imported
sys.path.append(str(SRC_PATH))


## 1 Data Download

### 1.1 NO2 Data Download 

In this section, NO2 pollution data from [Google Earth Engine Sentinel 5P](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_NRTI_L3_NO2) is downloaded at country level for both Ethiopia and Iraq.

From related literature and data quality, we finally decided to use **NO2_column_number_density** as the proxy for NO2 concentration level.

#### 1) Custom Functions

Custom function to generate the list of dates covering the desired time period of NO2 data.

In [2]:
import pandas as pd
from typing import List

import ee
# Log in to Google Earth Engine. Per the original note, an individual login is
# needed for the first initialization.
ee.Authenticate()
# Initialize the Earth Engine client once authentication has been done.
ee.Initialize()

# Function: generate desired time period of NO2 data  
def specific_date(start_date: str, end_date: str, time_resolution: str = 'D') -> List[str]:
    """
    Build the list of date strings between two dates at a given frequency.

    Parameters:
    - start_date: str
        First date of the period, format: 'YYYY-MM-DD'.
    - end_date: str
        Last date of the period, format: 'YYYY-MM-DD'.
    - time_resolution: str
        Frequency passed to `pd.date_range` (e.g., 'D' for daily, 'W' for weekly,
        'M' for monthly). Default is 'D'.

    Return:
    - list of str: the generated dates, each formatted as 'YYYY-MM-DD'.
    """
    date_index = pd.date_range(start=start_date, end=end_date, freq=time_resolution)
    return [stamp.strftime('%Y-%m-%d') for stamp in date_index]



Successfully saved authorization token.


Submit export tasks to Earth Engine; the exported files are saved to Google Drive.

In [None]:
# Function: download NO2 data
def download_no2_country(country_name: str, dates: list):
    """
    Submit Earth Engine export tasks for mean NO2 images over one country.

    Parameters:
    - country_name: str
        Name of the target country, matched against the 'country_na' field of
        the 'USDOS/LSIB_SIMPLE/2017' boundaries.
    - dates: list
        Ordered list of date strings. Every pair of consecutive entries
        (dates[i], dates[i+1]) is used as one filterDate window, so a list of
        n dates produces n-1 export tasks.

    Return:
    - None. One export task is started per window. Files go to the Google
        Drive folder 'NO2_<country_name>' and are prefixed with
        '<country_name>_NO2_<window start date>'. A failure to start a task is
        printed and the loop continues.

    NOTE(review): the notebook text in section 1.1 names
    'NO2_column_number_density' as the chosen band, while this function selects
    'tropospheric_NO2_column_number_density' - confirm which one is intended.
    """
    # Geometry of the requested country, used as the export region
    boundary = (
        ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017')
        .filter(ee.Filter.eq('country_na', country_name))
        .geometry()
    )

    # Iterate over consecutive (window start, window end) pairs
    for date_start, date_end in zip(dates, dates[1:]):

        # Mean of the selected band over all images inside the window
        band_images = (
            ee.ImageCollection('COPERNICUS/S5P/NRTI/L3_NO2')
            .select('tropospheric_NO2_column_number_density')
        )
        window_mean = band_images.filterDate(date_start, date_end).mean()

        export_task = ee.batch.Export.image.toDrive(
            image=window_mean,
            description=f'{country_name}_NO2_{date_start}_{date_end}',
            folder=f'NO2_{country_name}',
            fileNamePrefix=f'{country_name}_NO2_{date_start}',
            region=boundary,
            scale=1000,
            maxPixels=1e13
        )

        try:
            export_task.start()
            print(f'{country_name}: The export task for {date_start} is ongoing, please check the results in Google Drive.')
        except Exception as e:
            print(f'Fail to submit task: {e}')

In [None]:
# Function: download EVI data
def download_EVI_country(country_name: str, dates: list):
    """
    Submit Earth Engine export tasks for mean EVI images over one country.

    Parameters:
    - country_name: str
        Name of the target country, matched against the 'country_na' field of
        the 'USDOS/LSIB_SIMPLE/2017' boundaries.
    - dates: list
        Ordered list of date strings. Every pair of consecutive entries
        (dates[i], dates[i+1]) is used as one filterDate window, so a list of
        n dates produces n-1 export tasks.

    Return:
    - None. One export task is started per window. Files go to the Google
        Drive folder 'EVI_<country_name>' and are prefixed with
        '<country_name>_EVI_<window start date>'. A failure to start a task is
        printed and the loop continues.
    """
    # Geometry of the requested country, used as the export region
    countries = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017')
    country = countries.filter(ee.Filter.eq('country_na', country_name)).geometry()

    # Iterate over consecutive (window start, window end) pairs
    for date_start, date_end in zip(dates, dates[1:]):

        # Mean EVI over all images inside the window
        evi = (ee.ImageCollection('MODIS/MOD09GA_006_EVI')
            .select('EVI')
            .filterDate(date_start, date_end)
            .mean())

        # Task name, Drive folder and file prefix are labelled EVI so these
        # exports cannot be confused with the NO2 exports of the same
        # country and date.
        task = ee.batch.Export.image.toDrive(
            image=evi,
            description=f'{country_name}_EVI_{date_start}_{date_end}',
            folder=f'EVI_{country_name}',
            fileNamePrefix=f'{country_name}_EVI_{date_start}',
            region=country,
            scale=1000,
            maxPixels=1e13
        )

        try:
            task.start()
            print(f'{country_name}: The export task for {date_start} is ongoing, please check the results in Google Drive.')
        except Exception as e:
            print(f'Fail to submit task: {e}')

#### 2) Call and Download Data

In [None]:
# Daily dates from 2023-01-01 to 2024-12-31
dates = specific_date('2023-01-01', '2024-12-31')
len(dates) # 731

# Submit the exports: all NO2 requests first, then all EVI requests,
# each for Ethiopia followed by Iraq.
for submit_export in (download_no2_country, download_EVI_country):
    for target_country in ('Ethiopia', 'Iraq'):
        submit_export(target_country, dates)

731

### 1.2 OSM Data Download

This section includes code to download data from OpenStreetMap (OSM); see also [OSM Ethiopia](https://download.geofabrik.de/africa/ethiopia-latest-free.shp.zip) and [OSM Iraq](https://download.geofabrik.de/asia/iraq-latest-free.shp.zip).

#### Install & import libraries, define folder structure

In [None]:
import pandas as pd
import osmnx as ox
import geopandas as gpd
from pathlib import Path
import osm2geojson
import requests
import urllib3
from shapely.geometry import Point

In [None]:
# Root directory of the OSM downloads and the subfolders created beneath it
# NOTE(review): machine-specific absolute path - adjust before running on another machine.
base_dir = Path(r"C:\Users\Luis.ParraMorales\OneDrive - Imperial College London\Group Design Project\Data")
folders = {
    subfolder: base_dir / subfolder
    for subfolder in ("boundaries", "roads", "industry", "energy")
}
for folder_path in folders.values():
    folder_path.mkdir(parents=True, exist_ok=True)

# OSMnx settings: cache responses and log to the console
ox.settings.use_cache = True
ox.settings.log_console = True
# NOTE(review): "verify": False turns off certificate verification for the
# requests OSMnx sends - confirm this is still required.
ox.settings.requests_kwargs = {"verify": False}

#### Country/city boundaries (Ethiopia and Baghdad)

In [None]:
# Geocoding query for each area of interest
areas = dict(
    ethiopia="Ethiopia, Africa",
    baghdad="Baghdad, Iraq",
)

# Boundary polygon of each area, keyed by the area name
boundaries = {}

for area_key, place_query in areas.items():
    print(f"Fetching boundary for {area_key}...")
    boundary_gdf = ox.geocode_to_gdf(place_query)
    # Keep the geometry of the first (index 0) geocoding result
    boundaries[area_key] = boundary_gdf.loc[0, "geometry"]
    # Store the full geocoding result as a shapefile
    shp_path = folders["boundaries"] / f"{area_key}_boundary.shp"
    boundary_gdf.to_file(shp_path)
    print(f"Saved boundary to {shp_path}")

#### Road networks

In [None]:
# Read Ethiopia subregions (one polygon per row, labelled by 'region_name')
subregs = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")

# Overpass filter keeping only the listed highway classes
road_types = ["motorway","trunk","primary","secondary","tertiary"]
filter_str = f'["highway"~"^({"|".join(road_types)})$"]'

# Fetch the road edges region by region; a failing region is reported and skipped
ethi_roads_parts = []
for region_name, poly in zip(subregs["region_name"], subregs["geometry"]):
    print(f"Fetching roads for Ethiopia – {region_name}…")
    try:
        G = ox.graph_from_polygon(poly, custom_filter=filter_str)
        roads = ox.graph_to_gdfs(G, nodes=False, edges=True, fill_edge_geometry=True)
        roads["region_name"] = region_name
        ethi_roads_parts.append(roads)
    except Exception as e:
        print(f"   skipped {region_name}: {e}")

# Concatenate and save Ethiopia roads.
# pd.concat raises ValueError on an empty list, so only save when at least
# one region was fetched (same guard as the POI and energy-grid cells).
if ethi_roads_parts:
    ethi_roads = pd.concat(ethi_roads_parts, ignore_index=True)
    out_fp_eth = folders["roads"] / "ethiopia_roads.shp"
    ethi_roads.to_file(out_fp_eth)
    print(f"Saved Ethiopia roads to {out_fp_eth}")
else:
    print("No Ethiopia roads fetched.")

# Fetch Baghdad's roads in a single call
print("📥 Fetching roads for Baghdad…")
G_bag = ox.graph_from_place("Baghdad, Iraq", custom_filter=filter_str)
bag_roads = ox.graph_to_gdfs(G_bag, nodes=False, edges=True, fill_edge_geometry=True)
out_fp_bag = folders["roads"] / "baghdad_roads.shp"
bag_roads.to_file(out_fp_bag)
print(f"Saved Baghdad roads to {out_fp_bag}")

#### Industrial features & power plants

In [None]:
# 1) Silence insecure-request warnings, turn off certificate verification
#    and point OSMnx at HTTP endpoints
# NOTE(review): confirm the endpoint values match the format expected by the
# installed OSMnx version.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ox.settings.requests_kwargs = {"verify": False}
ox.settings.nominatim_endpoint = "http://nominatim.openstreetmap.org/search"
ox.settings.overpass_endpoint = "http://overpass-api.de/api/interpreter"

# 2) Register the poi folder (next to the industry folder) unless it is
#    already registered, then make sure it exists on disk
poi_dir = folders.setdefault("poi", folders["industry"].parent / "poi")
poi_dir.mkdir(parents=True, exist_ok=True)

# 3) OSM tags to query: a list selects specific values, True selects any value
poi_tags = {
    "amenity": [
        "bus_station", "bus_stop", "parking", "fuel", "marketplace",
        "school", "college", "university", "hospital", "clinic",
        "bank", "restaurant", "cafe", "fast_food", "bar", "police", "fire_station",
    ],
    "shop": True,
    "highway": ["bus_stop", "bus_station"],
    "railway": ["station", "halt", "tram_stop"],
    "aeroway": ["aerodrome", "helipad", "terminal"],
    "landuse": ["industrial"],
    "man_made": ["works", "chimney", "storage_tank"],
    "power": ["plant", "substation", "generator", "tower", "transformer"],
    "office": True,
    "craft": True,
    "place": ["city", "town", "village", "suburb", "neighbourhood", "hamlet"],
}

# 4) Ethiopia – one POI request per subregion
def _as_point(geom):
    """Return Point geometries unchanged and the centroid of any other geometry."""
    return geom if isinstance(geom, Point) else geom.centroid


subregs    = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")
ethi_parts = []

for region, poly in zip(subregs["region_name"], subregs["geometry"]):
    print(f"📥 Fetching POIs for Ethiopia – {region} …")
    try:
        gdf = ox.features_from_polygon(poly, tags=poi_tags)
        if not gdf.empty:
            # reduce every feature to a single point
            gdf["geometry"] = gdf.geometry.apply(_as_point)
            gdf["region_name"] = region
            ethi_parts.append(gdf)
    except Exception as e:
        # a failing region is reported and skipped
        print(f"   Skipped {region}: {e}")

if not ethi_parts:
    print("No Ethiopia POIs fetched.")
else:
    ethi_pois = pd.concat(ethi_parts, ignore_index=True).set_crs("EPSG:4326")
    # keep only the first occurrence of each column name
    ethi_pois = ethi_pois.loc[:, ~ethi_pois.columns.duplicated()]
    # remove the fixme column in either spelling, when present
    fixme_cols = [col for col in ("fixme", "FIXME") if col in ethi_pois.columns]
    ethi_pois = ethi_pois.drop(columns=fixme_cols)
    out_eth = folders["poi"] / "ethiopia_pois.gpkg"
    ethi_pois.to_file(out_eth, driver="GPKG")
    print(f"Saved Ethiopia POIs to {out_eth}")

# 5) Baghdad – a single POI request for the whole boundary
print("Fetching POIs for Baghdad …")
# Reuse the boundary fetched earlier; geocode again when none is stored
bag_poly = boundaries.get("baghdad")
if not bag_poly:
    bag_poly = ox.geocode_to_gdf("Baghdad, Iraq").geometry.iloc[0]


def _as_point_baghdad(geom):
    """Return Point geometries unchanged and the centroid of any other geometry."""
    return geom if isinstance(geom, Point) else geom.centroid


try:
    bag_pois = ox.features_from_polygon(bag_poly, tags=poi_tags)
    # reduce every feature to a single point
    bag_pois["geometry"] = bag_pois.geometry.apply(_as_point_baghdad)
    bag_pois["region_name"] = "Baghdad"
    # keep only the first occurrence of each column name
    bag_pois = bag_pois.loc[:, ~bag_pois.columns.duplicated()]
    # remove the lowercase fixme column when present
    bag_pois = bag_pois.drop(columns="fixme", errors="ignore")
    out_bag = folders["poi"] / "baghdad_pois.gpkg"
    bag_pois.to_file(out_bag, driver="GPKG")
    print(f"Saved Baghdad POIs to {out_bag}")
except Exception as e:
    print(f"Failed to fetch Baghdad POIs: {e}")

#### Energy-grid components

In [None]:
# 1) Point OSMnx at HTTP endpoints, turn off certificate verification and
#    silence the resulting insecure-request warnings
# NOTE(review): confirm the endpoint values match the format expected by the
# installed OSMnx version.
ox.settings.nominatim_endpoint = "http://nominatim.openstreetmap.org/search"
ox.settings.overpass_endpoint = "http://overpass-api.de/api/interpreter"
ox.settings.requests_kwargs = {"verify": False}
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# 2) Make sure the energy output folder exists
folders["energy"].mkdir(parents=True, exist_ok=True)

# 3) OSM 'power' values that make up the grid components to fetch
grid_tags = dict(power=["line", "substation", "transformer", "tower"])

# 4) Ethiopia – one request per subregion read from the subregions file
subregs   = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")
eth_parts = []

for region, poly in zip(subregs["region_name"], subregs["geometry"]):
    print(f"Fetching energy‐grid for Ethiopia – {region} …")
    try:
        gdf = ox.features_from_polygon(poly, tags=grid_tags)
        if gdf.empty:
            continue
        # label the features with their region
        gdf["region_name"] = region
        # keep only the first occurrence of each column name
        gdf = gdf.loc[:, ~gdf.columns.duplicated()]
        # remove the problematic fixme field in either spelling, when present
        fixme_cols = [col for col in ("fixme", "FIXME") if col in gdf.columns]
        gdf = gdf.drop(columns=fixme_cols)
        eth_parts.append(gdf)
    except Exception as e:
        # a failing region is reported and skipped
        print(f"   Skipped {region}: {e}")

# 5) Save Ethiopia grid (only when at least one region returned features)
if not eth_parts:
    print("No Ethiopia energy‐grid features fetched.")
else:
    eth_grid = pd.concat(eth_parts, ignore_index=True).set_crs("EPSG:4326")
    out_eth = folders["energy"] / "ethiopia_energy_grid.gpkg"
    eth_grid.to_file(out_eth, driver="GPKG")
    print(f"Saved Ethiopia energy grid to {out_eth}")

# 6) Baghdad – a single request for the whole boundary
print("Fetching energy‐grid for Baghdad …")
# Reuse the boundary fetched earlier; geocode again when none is stored
bag_poly = boundaries.get("baghdad")
if not bag_poly:
    bag_poly = ox.geocode_to_gdf("Baghdad, Iraq").geometry.iloc[0]

try:
    bag_gdf = ox.features_from_polygon(bag_poly, tags=grid_tags)
    bag_gdf["region_name"] = "Baghdad"
    # keep only the first occurrence of each column name
    bag_gdf = bag_gdf.loc[:, ~bag_gdf.columns.duplicated()]
    # remove the lowercase fixme column when present
    bag_gdf = bag_gdf.drop(columns="fixme", errors="ignore")
    out_bag = folders["energy"] / "baghdad_energy_grid.gpkg"
    bag_gdf.to_file(out_bag, driver="GPKG")
    print(f"Saved Baghdad energy grid to {out_bag}")
except Exception as e:
    print(f"Failed to fetch Baghdad energy‐grid: {e}")

## 2 Other Helper Functions

### 2.1 Generate Meshes for filling features

Generate meshes, from 2023-01-01 to 2024-12-31, one mesh for each day.

In [None]:
import shutil
from datetime import datetime, timedelta
import fiona

# Template meshes that are copied once per day.
# DATA_PATH comes from the configuration cell ("0 Initial Run"); the name
# previously used here, data_root, is not defined anywhere in this notebook.
mesh_addis = DATA_PATH / "mesh-grid" / "grid_addis_ababa.gpkg"
mesh_baghdad = DATA_PATH / "mesh-grid" / "grid_baghdad.gpkg"

# Name of the first layer in each template
lyr_addis_name = fiona.listlayers(mesh_addis)[0]  # control layer number
lyr_baghdad_name = fiona.listlayers(mesh_baghdad)[0]

# start and end date (both included)
start_date = datetime.strptime("2023-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2024-12-31", "%Y-%m-%d")

# Output folders for the per-day copies
addis_meshes_path = DATA_PATH / 'addis-empty-mesh-data'
baghdad_meshes_path = DATA_PATH / 'baghdad-empty-mesh-data'

addis_meshes_path.mkdir(exist_ok=True)
baghdad_meshes_path.mkdir(exist_ok=True)

delta = end_date - start_date
days_count = delta.days + 1  # +1 so the end date itself gets a mesh

# One pass per city: (template file, output folder, file-name prefix, label)
mesh_jobs = (
    (mesh_addis, addis_meshes_path, "addis-ababa", "Addis Ababa"),
    (mesh_baghdad, baghdad_meshes_path, "baghdad", "Baghdad"),
)

for template_path, target_dir, file_prefix, city_label in mesh_jobs:
    for i in range(days_count):
        current_date = start_date + timedelta(days=i)
        date_str = current_date.strftime("%Y-%m-%d")
        filename = f"{file_prefix}-{date_str}.gpkg"
        dest_path = target_dir / filename

        # each daily mesh starts as an unmodified copy of the template
        shutil.copy(template_path, dest_path)

    print(f"Complete Generating meshes for {city_label}!")


Complete Generating meshes for Addis Ababa!
Complete Generating meshes for Baghdad!


### 2.2 Generate Date Tables for Exploratory Data Analysis

In [4]:
# Generate Standard Date Table
import pandas as pd

# One row per day from 2023-01-01 to 2024-12-31 with its weekday name
date_range = pd.date_range(start='2023-01-01', end='2024-12-31', freq='D')
df = pd.DataFrame({'Date': date_range})
df['Weekday'] = df['Date'].dt.day_name()

# Define Ethiopia workday type:
# Mon-Fri -> "Workdays", Sat-Sun -> "Weekends"
ethiopia_workdays = {'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'}
df['Ethiopia_Workday_Type'] = df['Weekday'].map(
    lambda day: "Workdays" if day in ethiopia_workdays else "Weekends"
)

# Define Iraq workday type:
# Mon-Tue-Wed -> 'Mon-Tue-Wed', Fri-Sat -> 'Fri-Sat', Thu-Sun -> 'Thu-Sun'
iraq_workday_type = {
    'Monday': 'Mon-Tue-Wed', 'Tuesday': 'Mon-Tue-Wed', 'Wednesday': 'Mon-Tue-Wed',
    'Friday': 'Fri-Sat', 'Saturday': 'Fri-Sat',
    'Thursday': 'Thu-Sun', 'Sunday': 'Thu-Sun'
}
df['Iraq_Workday_Type'] = df['Weekday'].map(iraq_workday_type)

# Reorder columns
df = df[['Date', 'Weekday', 'Ethiopia_Workday_Type', 'Iraq_Workday_Type']]

# Save the table under the data folder.
# DATA_PATH comes from the configuration cell ("0 Initial Run"); the name
# previously used here, data_root, is not defined anywhere in this notebook.
# The target folder is created first so that to_csv cannot fail on a missing directory.
helper_dir = DATA_PATH / 'helper-files'
helper_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(helper_dir / 'workday_type_2023_2024.csv', index=False, encoding='utf-8-sig')  # save the data if needed

# Preview first few rows
print(df.head(10))

        Date    Weekday Ethiopia_Workday_Type Iraq_Workday_Type
0 2023-01-01     Sunday              Weekends           Thu-Sun
1 2023-01-02     Monday              Workdays       Mon-Tue-Wed
2 2023-01-03    Tuesday              Workdays       Mon-Tue-Wed
3 2023-01-04  Wednesday              Workdays       Mon-Tue-Wed
4 2023-01-05   Thursday              Workdays           Thu-Sun
5 2023-01-06     Friday              Workdays           Fri-Sat
6 2023-01-07   Saturday              Weekends           Fri-Sat
7 2023-01-08     Sunday              Weekends           Thu-Sun
8 2023-01-09     Monday              Workdays       Mon-Tue-Wed
9 2023-01-10    Tuesday              Workdays       Mon-Tue-Wed


### 2.3 Mesh File to CSV files

Change gpkg format to csv format. 

Addis Ababa:

In [None]:
import geopandas as gpd

# Addis Ababa: every .gpkg mesh in the input folder becomes a .csv of the same name
file_path = DATA_PATH / 'addis-mesh-data'
output_path = DATA_PATH / 'addis-mesh-data-csv'
output_path.mkdir(exist_ok=True)

files = list(file_path.glob("*.gpkg"))


def _addis_geom_to_wkt(geom):
    """Return the WKT text of a geometry, or None when the geometry is falsy."""
    if geom:
        return geom.wkt
    return None


for gpkg_file in files:
    name = gpkg_file.stem
    print(name)
    gdf = gpd.read_file(gpkg_file)
    # store the geometry as WKT text so it survives the CSV export
    gdf["geometry"] = gdf["geometry"].apply(_addis_geom_to_wkt)
    gdf.to_csv(output_path / (name + ".csv"), index=False)

Baghdad:

In [None]:
# Baghdad: every .gpkg mesh in the input folder becomes a .csv of the same name
file_path = DATA_PATH / 'baghdad-mesh-data'
output_path = DATA_PATH / 'baghdad-mesh-data-csv'
output_path.mkdir(exist_ok=True)

files = list(file_path.glob("*.gpkg"))
# files


def _baghdad_geom_to_wkt(geom):
    """Return the WKT text of a geometry, or None when the geometry is falsy."""
    if geom:
        return geom.wkt
    return None


for gpkg_file in files:
    name = gpkg_file.stem
    print(name)
    gdf = gpd.read_file(gpkg_file)
    # store the geometry as WKT text so it survives the CSV export
    gdf["geometry"] = gdf["geometry"].apply(_baghdad_geom_to_wkt)
    gdf.to_csv(output_path / (name + ".csv"), index=False)