# Appendix - Code Assisting Data Preparation

## README

TBC-------------

## 0 Initial Run

Run the following cell every time you start a new kernel to configure related parameters.

In [None]:
# Configuration
from pathlib import Path
import sys

# Absolute path of the current working directory (where the kernel was started)
CURR_PATH = Path(".").resolve()
# The repository root is taken to be the parent of the working directory
REPO_PATH = CURR_PATH.parent
# Folder in which all data is stored
DATA_PATH = REPO_PATH.joinpath("data")
# Sub-folder of the data folder holding the demo data
DEMO_PATH = DATA_PATH.joinpath("demo-data")

# Folder with the custom source modules; appended to sys.path so they can be imported
SRC_PATH = REPO_PATH.joinpath("src")
sys.path.append(str(SRC_PATH))


## 1 Data Download

### 1.1 NO2 Data Download 

In this section, NO2 pollution data from [Google Earth Engine Sentinel 5P](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_NRTI_L3_NO2) is downloaded at country level for both Ethiopia and Iraq.

From related literature and data quality, we finally decided to use **NO2_column_number_density** as the proxy for NO2 concentration level.

#### 1) Custom Functions

Custom function to generate the dates of the desired time period for the NO2 data.

In [2]:
import pandas as pd
from typing import List

import ee
# Log in to Google Earth Engine; the first run asks for personal credentials
ee.Authenticate()
# Start the Earth Engine client session used by the download functions
ee.Initialize()

# Function: generate desired time period of NO2 data  
def specific_date(start_date: str, end_date: str, time_resolution: str = 'D') -> List[str]:
    """
    Build the list of date labels between two dates at a given frequency.

    Parameters:
    - start_date: str
        First day of the period, format: 'YYYY-MM-DD'.
    - end_date: str
        Last day of the period, format: 'YYYY-MM-DD'.
    - time_resolution: str
        Frequency alias handed to `pd.date_range` (e.g., 'D' for daily, 'W' for weekly).
        Default is 'D'.

    Return:
    - list of str: one 'YYYY-MM-DD' string for every timestamp that
        `pd.date_range` produces for the period; empty when the range is empty.
    """
    timestamps = pd.date_range(start=start_date, end=end_date, freq=time_resolution)
    return [stamp.strftime('%Y-%m-%d') for stamp in timestamps]



Successfully saved authorization token.


Submit the export tasks; the resulting files are saved to Google Drive.

In [None]:
# Function: download NO2 data
def download_no2_country(country_name: str, dates: list):
    """
    Submit Earth Engine export tasks with mean NO2 images for one country.

    Parameters:
    - country_name: str
        Country to export. It is matched against the 'country_na' field of the
        'USDOS/LSIB_SIMPLE/2017' boundaries, so the spelling must agree with that dataset.
    - dates: list
        Ordered date strings. Every pair of neighbouring entries
        (dates[i], dates[i+1]) defines one export window, so n entries yield n-1 tasks.

    Return:
    - None. One export task is started per window and a message is printed for
        each task (on submission, or on failure to submit).
        Files are requested in the Google Drive folder 'NO2_<country_name>' and
        are named '<country_name>_NO2_<window start date>'.
    """
    # NOTE(review): the accompanying notebook text names 'NO2_column_number_density'
    # as the chosen proxy, while the tropospheric band is selected here — confirm
    # which band is intended.
    band_name = 'tropospheric_NO2_column_number_density'

    # Geometry of the requested country, used as the export region
    lsib = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017')
    country = lsib.filter(ee.Filter.eq('country_na', country_name)).geometry()

    # Walk over neighbouring date pairs: (d0, d1), (d1, d2), ...
    for date_start, date_end in zip(dates, dates[1:]):

        # Mean of all images of the collection that fall into the window
        collection = ee.ImageCollection('COPERNICUS/S5P/NRTI/L3_NO2')
        window_mean = collection.select(band_name).filterDate(date_start, date_end).mean()

        export_settings = dict(
            image=window_mean,
            description=f'{country_name}_NO2_{date_start}_{date_end}',
            folder=f'NO2_{country_name}',
            fileNamePrefix=f'{country_name}_NO2_{date_start}',
            region=country,
            scale=1000,
            maxPixels=1e13,
        )
        task = ee.batch.Export.image.toDrive(**export_settings)

        # A failed submission is reported but does not stop the remaining windows
        try:
            task.start()
            print(f'{country_name}: The export task for {date_start} is ongoing, please check the results in Google Drive.')
        except Exception as e:
            print(f'Fail to submit task: {e}')

In [None]:
# Function: download EVI data
def download_EVI_country(country_name: str, dates: list):
    """
    Request EVI data download from Earth Engine for a specified country and time period

    Parameters:
    - country_name: str
        Name of the target country. Must match the 'country_na' field of the
        'USDOS/LSIB_SIMPLE/2017' boundaries used by Earth Engine.
    - dates: list
        Ordered list of date strings. Each pair of consecutive entries
        (dates[i], dates[i+1]) defines one export window.

    Return:
    - None. Sends a/multiple request(s) to Earth Engine to initiate data download.
        Exported files are saved under a folder named 'EVI_<country_name>' in first-level Google Drive directory.
        Each exported .tiff file is named '<country_name>_EVI_<starting date>'.
    """

    countries = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017')
    country = countries.filter(ee.Filter.eq('country_na', country_name)).geometry()

    # Walk over consecutive date pairs: (d0, d1), (d1, d2), ...
    for date_start, date_end in zip(dates, dates[1:]):

        # NOTE(review): confirm that this collection covers the requested dates.
        evi = (ee.ImageCollection('MODIS/MOD09GA_006_EVI')
            .select('EVI')
            .filterDate(date_start, date_end)
            .mean())

        # The task description, Drive folder and file prefix are labelled 'EVI'.
        # They used to be labelled 'NO2', which put the EVI rasters in the NO2
        # folder under the same file names as the NO2 exports.
        task = ee.batch.Export.image.toDrive(
            image=evi,
            description=f'{country_name}_EVI_{date_start}_{date_end}',
            folder=f'EVI_{country_name}',
            fileNamePrefix=f'{country_name}_EVI_{date_start}',
            region=country,
            scale=1000,
            maxPixels=1e13
        )

        # A failed submission is reported but does not stop the remaining windows
        try:
            task.start()
            print(f'{country_name}: The export task for {date_start} is ongoing, please check the results in Google Drive.')
        except Exception as e:
            print(f'Fail to submit task: {e}')

#### 2) Call and Download Data

In [None]:
# Daily dates covering 2023 and 2024
dates = specific_date('2023-01-01', '2024-12-31')
len(dates) # 731

# Submit the NO2 exports first, then the EVI exports; Ethiopia before Iraq each time
for submit_exports in (download_no2_country, download_EVI_country):
    for target_country in ('Ethiopia', 'Iraq'):
        submit_exports(target_country, dates)

731

### 1.2 OSM Data Download

This section includes code to download data from OpenStreetMap (OSM); see [OSM Ethiopia](https://download.geofabrik.de/africa/ethiopia-latest-free.shp.zip) and [OSM Iraq](https://download.geofabrik.de/asia/iraq-latest-free.shp.zip).

#### Install & import libraries, define folder structure

In [None]:
import pandas as pd
import osmnx as ox
import geopandas as gpd
from pathlib import Path
import osm2geojson
import requests
import urllib3
from shapely.geometry import Point

In [None]:
# Root folder for the OSM downloads
# NOTE(review): this is an absolute path on one specific machine; adjust it
# before running elsewhere.
base_dir = Path(r"C:\Users\Luis.ParraMorales\OneDrive - Imperial College London\Group Design Project\Data")

# One sub-folder per data theme, created on demand
folders = {
    theme: base_dir / theme
    for theme in ("boundaries", "roads", "industry", "energy")
}
for theme_dir in folders.values():
    theme_dir.mkdir(parents=True, exist_ok=True)

# OSMnx settings
# NOTE(review): "verify": False switches off TLS certificate verification for
# the requests OSMnx sends — confirm this is intended.
ox.settings.requests_kwargs = {"verify": False}
ox.settings.log_console = True
ox.settings.use_cache = True

#### Country/city boundaries (Ethiopia and Baghdad)

In [None]:
# Geocoding query for every area of interest, keyed by a short area name
areas = dict(
    ethiopia="Ethiopia, Africa",
    baghdad="Baghdad, Iraq",
)

# Boundary geometry per area name, kept for the later cells
boundaries = {}

for name in areas:
    print(f"Fetching boundary for {name}...")
    gdf = ox.geocode_to_gdf(areas[name])
    # the geocoding result's first row carries the boundary geometry
    boundaries[name] = gdf.loc[0, "geometry"]
    # write the full geocoding result as a shapefile
    out_fp = folders["boundaries"].joinpath(f"{name}_boundary.shp")
    gdf.to_file(out_fp)
    print(f"Saved boundary to {out_fp}")

#### Road networks

In [None]:
# Read Ethiopia subregions (one row per region with 'region_name' and 'geometry')
subregs = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")

# Define the road filter: an Overpass filter that keeps only the listed highway classes
road_types = ["motorway","trunk","primary","secondary","tertiary"]
filter_str = f'["highway"~"^({"|".join(road_types)})$"]'

# Ethiopia is fetched region by region; a failing region is reported and skipped
ethi_roads_parts = []
for _, row in subregs.iterrows():
    region_name = row["region_name"]
    poly = row["geometry"]
    print(f"Fetching roads for Ethiopia – {region_name}…")
    try:
        G = ox.graph_from_polygon(poly, custom_filter=filter_str)
        roads = ox.graph_to_gdfs(G, nodes=False, edges=True, fill_edge_geometry=True)
        roads["region_name"] = region_name
        ethi_roads_parts.append(roads)
    except Exception as e:
        print(f"   skipped {region_name}: {e}")

# Concatenate and save Ethiopia roads.
# pd.concat raises ValueError on an empty list, so only concatenate when at
# least one region was fetched successfully.
if ethi_roads_parts:
    ethi_roads = pd.concat(ethi_roads_parts, ignore_index=True)
    out_fp_eth = folders["roads"] / "ethiopia_roads.shp"
    ethi_roads.to_file(out_fp_eth)
    print(f"Saved Ethiopia roads to {out_fp_eth}")
else:
    print("No Ethiopia roads fetched.")

# Fetch Baghdad’s roads
print("📥 Fetching roads for Baghdad…")
G_bag = ox.graph_from_place("Baghdad, Iraq", custom_filter=filter_str)
bag_roads = ox.graph_to_gdfs(G_bag, nodes=False, edges=True, fill_edge_geometry=True)
out_fp_bag = folders["roads"] / "baghdad_roads.shp"
bag_roads.to_file(out_fp_bag)
print(f"Saved Baghdad roads to {out_fp_bag}")

#### Industrial features & power plants

In [None]:
# Download OSM points of interest (POIs) for every Ethiopian subregion and for
# Baghdad, reduce every feature to a point, and save one GeoPackage per area.

# 1) SSL off & HTTP endpoints
# NOTE(review): certificate verification is disabled and plain-HTTP endpoints
# are configured for the OSMnx requests made from here on — presumably a
# workaround for a local certificate/proxy problem; confirm before reuse.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ox.settings.requests_kwargs    = {"verify": False}
# NOTE(review): the names of the endpoint settings differ between OSMnx
# releases — TODO confirm these two assignments take effect with the installed version.
ox.settings.nominatim_endpoint = "http://nominatim.openstreetmap.org/search"
ox.settings.overpass_endpoint  = "http://overpass-api.de/api/interpreter"

# 2) Ensure poi folder exists (created next to the "industry" folder unless already registered)
folders["poi"] = folders.get("poi", folders["industry"].parent / "poi")
folders["poi"].mkdir(parents=True, exist_ok=True)

# 3) Tags for POIs
# A list restricts the tag to the given values; True is presumably "any value
# of this tag" — confirm against the OSMnx features documentation.
poi_tags = {
    "amenity": [
        "bus_station","bus_stop","parking","fuel","marketplace",
        "school","college","university","hospital","clinic",
        "bank","restaurant","cafe","fast_food","bar","police","fire_station"
    ],
    "shop": True,
    "highway": ["bus_stop","bus_station"],
    "railway": ["station","halt","tram_stop"],
    "aeroway": ["aerodrome","helipad","terminal"],
    "landuse": ["industrial"],
    "man_made": ["works","chimney","storage_tank"],
    "power": ["plant","substation","generator","tower","transformer"],
    "office": True,
    "craft": True,
    "place": ["city","town","village","suburb","neighbourhood","hamlet"],
}

# 4) Ethiopia – loop per subregion
# The subregion file must already exist in the boundaries folder.
subregs    = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")
ethi_parts = []

for _, row in subregs.iterrows():
    region = row["region_name"]
    poly   = row.geometry
    print(f"📥 Fetching POIs for Ethiopia – {region} …")
    try:
        gdf = ox.features_from_polygon(poly, tags=poi_tags)
        # nothing found in this region: move on without adding a part
        if gdf.empty:
            continue
        # convert all non-Points to centroids
        gdf["geometry"] = gdf.geometry.apply(
            lambda g: g if isinstance(g, Point) else g.centroid
        )
        # remember which subregion the features were fetched for
        gdf["region_name"] = region
        ethi_parts.append(gdf)
    except Exception as e:
        # a failing region is reported and skipped; the loop continues
        print(f"   Skipped {region}: {e}")

# Only concatenate and save when at least one region returned features
if ethi_parts:
    # ignore_index=True replaces the per-region indexes with a fresh running index
    ethi_pois = pd.concat(ethi_parts, ignore_index=True).set_crs("EPSG:4326")
    # drop duplicate columns
    ethi_pois = ethi_pois.loc[:, ~ethi_pois.columns.duplicated()]
    # drop any fixme column (both spellings)
    # NOTE(review): presumably these columns break the GeoPackage write — confirm
    for bad in ["fixme", "FIXME"]:
        if bad in ethi_pois.columns:
            ethi_pois = ethi_pois.drop(columns=bad)
    out_eth = folders["poi"] / "ethiopia_pois.gpkg"
    ethi_pois.to_file(out_eth, driver="GPKG")
    print(f"Saved Ethiopia POIs to {out_eth}")
else:
    print("No Ethiopia POIs fetched.")

# 5) Baghdad – single call
print("Fetching POIs for Baghdad …")
# Reuse the polygon stored in `boundaries` when available, otherwise geocode again.
# `boundaries` itself must already be defined.
bag_poly = boundaries.get("baghdad") or ox.geocode_to_gdf("Baghdad, Iraq").geometry.iloc[0]

try:
    bag_pois = ox.features_from_polygon(bag_poly, tags=poi_tags)
    # same point reduction as for Ethiopia
    bag_pois["geometry"] = bag_pois.geometry.apply(
        lambda g: g if isinstance(g, Point) else g.centroid
    )
    bag_pois["region_name"] = "Baghdad"
    # drop duplicate columns
    bag_pois = bag_pois.loc[:, ~bag_pois.columns.duplicated()]
    # drop any fixme column (only the lower-case spelling is handled here)
    if "fixme" in bag_pois.columns:
        bag_pois = bag_pois.drop(columns="fixme")
    out_bag = folders["poi"] / "baghdad_pois.gpkg"
    bag_pois.to_file(out_bag, driver="GPKG")
    print(f"Saved Baghdad POIs to {out_bag}")
except Exception as e:
    # any failure (fetch, processing or write) is reported, not raised
    print(f"Failed to fetch Baghdad POIs: {e}")

#### Energy-grid components

In [None]:
# Download OSM energy-grid features (power lines, substations, transformers,
# towers) for every Ethiopian subregion and for Baghdad, and save one
# GeoPackage per area.

# 1) Disable SSL verification & warnings, force HTTP endpoints
# NOTE(review): certificate verification is switched off for the OSMnx
# requests — presumably a workaround for a local certificate/proxy problem; confirm.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ox.settings.requests_kwargs    = {"verify": False}
# NOTE(review): the names of the endpoint settings differ between OSMnx
# releases — TODO confirm these two assignments take effect with the installed version.
ox.settings.nominatim_endpoint = "http://nominatim.openstreetmap.org/search"
ox.settings.overpass_endpoint  = "http://overpass-api.de/api/interpreter"

# 2) Ensure energy folder exists
folders["energy"].mkdir(parents=True, exist_ok=True)

# 3) Define tags for grid components (values of the OSM "power" key to fetch)
grid_tags = {
    "power": ["line", "substation", "transformer", "tower"]
}

# 4) Ethiopia – loop per admin_level=4 subregion
# The subregion file must already exist in the boundaries folder.
subregs   = gpd.read_file(folders["boundaries"] / "ethiopia_subregions.gpkg")
eth_parts = []

for _, row in subregs.iterrows():
    region   = row["region_name"]
    poly     = row.geometry
    print(f"Fetching energy‐grid for Ethiopia – {region} …")
    try:
        gdf = ox.features_from_polygon(poly, tags=grid_tags)
        # nothing found in this region: move on without adding a part
        if gdf.empty:
            continue
        # tag the region
        gdf["region_name"] = region
        # drop any duplicate columns
        gdf = gdf.loc[:, ~gdf.columns.duplicated()]
        # drop problematic 'fixme' field if present (both spellings)
        for bad in ["fixme", "FIXME"]:
            if bad in gdf.columns:
                gdf = gdf.drop(columns=bad)
        eth_parts.append(gdf)
    except Exception as e:
        # a failing region is reported and skipped; the loop continues
        print(f"   Skipped {region}: {e}")

# 5) Save Ethiopia grid (only when at least one region returned features)
if eth_parts:
    # ignore_index=True replaces the per-region indexes with a fresh running index
    eth_grid = pd.concat(eth_parts, ignore_index=True).set_crs("EPSG:4326")
    out_eth = folders["energy"] / "ethiopia_energy_grid.gpkg"
    eth_grid.to_file(out_eth, driver="GPKG")
    print(f"Saved Ethiopia energy grid to {out_eth}")
else:
    print("No Ethiopia energy‐grid features fetched.")

# 6) Baghdad – single call
print("Fetching energy‐grid for Baghdad …")
# Reuse the polygon stored in `boundaries` when available, otherwise geocode again.
# `boundaries` itself must already be defined.
bag_poly = boundaries.get("baghdad") or ox.geocode_to_gdf("Baghdad, Iraq").geometry.iloc[0]

try:
    bag_gdf = ox.features_from_polygon(bag_poly, tags=grid_tags)
    bag_gdf["region_name"] = "Baghdad"
    # drop duplicate columns, then the lower-case 'fixme' column if present
    bag_gdf = bag_gdf.loc[:, ~bag_gdf.columns.duplicated()]
    if "fixme" in bag_gdf.columns:
        bag_gdf = bag_gdf.drop(columns="fixme")
    out_bag = folders["energy"] / "baghdad_energy_grid.gpkg"
    bag_gdf.to_file(out_bag, driver="GPKG")
    print(f"Saved Baghdad energy grid to {out_bag}")
except Exception as e:
    # any failure (fetch, processing or write) is reported, not raised
    print(f"Failed to fetch Baghdad energy‐grid: {e}")

## OSM Power Plants Coal and Gas

In [15]:
import requests
import geopandas as gpd
import pandas as pd
import re
import urllib3
from shapely.geometry import Point, LineString, Polygon
from shapely.ops import unary_union
from pathlib import Path

# ------------------------------------------------------------------------------
# Suppress SSL warnings (we are intentionally bypassing certificate verification)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ------------------------------------------------------------------------------
# Constants and output directory configuration
# Overpass API endpoint that receives the queries
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# NOTE(review): this is an absolute path on one specific machine; adjust it
# before running elsewhere.
OUT_DIR = Path(r"C:\Users\Luis.ParraMorales\OneDrive - Imperial College London\Group Design Project\Data\energy")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Compile a regex pattern to identify any fossil fuel mention in relevant tags.
# The match is case-insensitive and on whole words only (\b on both sides).
FOSSIL_PATTERN = re.compile(r"\b(coal|gas|oil|diesel|natural\s?gas|hfo|petroleum)\b", re.IGNORECASE)

# The Overpass-QL template, using ISO3166-1 alpha-2 code to fetch the country's admin boundary.
# {iso2} is filled in with str.format; the query asks for all nodes, ways and
# relations tagged power=plant or power=generator inside that area and
# requests their geometry ("out geom").
QUERY_TEMPLATE = """
[out:json][timeout:300];
area["ISO3166-1"="{iso2}"]["admin_level"="2"]->.country;
(
  nwr["power"="plant"](area.country);
  nwr["power"="generator"](area.country);
);
out geom qt;
"""


def build_geom(element: dict):
    """
    Convert an Overpass element (node/way/relation) into a Shapely geometry.

    Parameters:
      - element: dict describing one element of an Overpass JSON response
        requested with 'out geom'. Nodes carry 'lon'/'lat', ways carry a
        'geometry' list of {'lon', 'lat'} points, and relations carry
        'members', each of which may have its own 'geometry' list.

    Returns:
      - Point for node records
      - LineString or Polygon for way records, depending on closure
      - Unary union of Polygons for relation records composed of closed ways
      - None if geometry cannot be constructed

    Raises:
      - KeyError if a node (or a geometry point) lacks 'lon' or 'lat'.
    """
    elem_type = element.get("type")

    # --- Node: simple point geometry ---
    if elem_type == "node":
        return Point(element["lon"], element["lat"])

    # --- Way: decide between LineString vs. closed Polygon ---
    if elem_type == "way":
        # A way's coordinates live in its own 'geometry' array
        points = element.get("geometry")
        if not points:
            return None
        coords = [(pt["lon"], pt["lat"]) for pt in points]
        if len(coords) < 2:
            return None
        # If the first and last coordinates match, and there are at least 4 points → Polygon
        if coords[0] == coords[-1] and len(coords) >= 4:
            return Polygon(coords)
        # Otherwise, interpret as a LineString
        return LineString(coords)

    # --- Relation: union of member polygons ---
    # A relation has no top-level 'geometry' array; the coordinates are on its
    # members. Requiring 'geometry' on the relation itself would skip every relation.
    if elem_type == "relation":
        member_polygons = []
        for member in element.get("members", []):
            if member.get("type") != "way":
                continue
            points = member.get("geometry")
            if not points:
                continue
            member_coords = [(pt["lon"], pt["lat"]) for pt in points]
            # Only closed member ways can be turned into a polygon on their own
            if len(member_coords) >= 4 and member_coords[0] == member_coords[-1]:
                member_polygons.append(Polygon(member_coords))
        if member_polygons:
            try:
                return unary_union(member_polygons)
            except Exception:
                # Fall back to the first ring if the union fails (e.g. invalid rings)
                return member_polygons[0]

    # Unknown element type, or nothing usable was found
    return None


def fetch_osm_power_assets(iso2: str, timeout: float = 330.0) -> gpd.GeoDataFrame:
    """
    Query Overpass for all 'power=plant' and 'power=generator' elements
    within the specified country's admin boundary (ISO3166-1 alpha-2).

    Parameters:
      - iso2: ISO3166-1 alpha-2 country code inserted into QUERY_TEMPLATE.
      - timeout: seconds the HTTP client waits for the server before giving up.
        The default is slightly above the 300 s limit set inside the query, so
        the server-side limit normally triggers first.

    Returns:
      A GeoDataFrame (EPSG:4326) with one row per element that has a usable
      geometry; the columns are the element's tags plus 'geometry'. An empty
      GeoDataFrame is returned when nothing usable was fetched.

    Raises:
      - requests.HTTPError if Overpass answers with an error status.
      - requests.Timeout if no answer arrives within `timeout` seconds.
    """
    # Build the Overpass query
    query = QUERY_TEMPLATE.format(iso2=iso2)
    # NOTE(review): verify=False disables TLS certificate verification — confirm
    # this is still required. Without an explicit timeout the call could block forever.
    response = requests.post(
        OVERPASS_URL,
        data={"data": query},
        verify=False,
        timeout=timeout,
    )
    response.raise_for_status()

    rows = []
    for elem in response.json().get("elements", []):
        geom = build_geom(elem)
        if geom is None:
            continue  # Skip elements with no valid geometry
        # Merge all tag key-value pairs with the geometry into a single row
        rows.append({**elem.get("tags", {}), "geometry": geom})

    if not rows:
        # Return an empty GeoDataFrame if nothing was fetched
        return gpd.GeoDataFrame([], geometry=[], crs="EPSG:4326")

    return gpd.GeoDataFrame(rows, geometry="geometry", crs="EPSG:4326")


def fossil_only(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Filter a GeoDataFrame of power assets to retain only those
    whose relevant tag values match the fossil fuel pattern.
    Checks 'plant:source', 'generator:source', 'generator:primary_fuel', and 'fuel'.

    Parameters:
      - gdf: frame with one row per power asset; tag columns that are missing
        are treated as empty strings.

    Returns:
      A copy holding only the rows for which at least one of the checked tags
      matches FOSSIL_PATTERN. An empty input yields an empty copy with the
      same columns.
    """
    # On a frame without rows, apply(axis=1) does not produce a boolean Series,
    # so the mask could not be used for row selection; short-circuit instead.
    if gdf.empty:
        return gdf.copy()

    source_keys = ("plant:source", "generator:source", "generator:primary_fuel", "fuel")

    def is_fossil(row):
        # str() also turns missing values (NaN) into text that cannot match
        return any(
            FOSSIL_PATTERN.search(str(row.get(key, ""))) is not None
            for key in source_keys
        )

    mask = gdf.apply(is_fossil, axis=1)
    return gdf[mask].copy()


# ------------------------------------------------------------------------------
# Main loop: for every country, fetch the power assets, keep the fossil-fuel
# ones, and write them to a GeoPackage in OUT_DIR
country_outputs = {
    "ET": "ethiopia_power_fossil.gpkg",
    "IQ": "iraq_power_fossil.gpkg",
}

for iso2_code, output_filename in country_outputs.items():
    print(f"⏳ Downloading OSM power assets for ISO '{iso2_code}' …")
    fossil_gdf = fossil_only(fetch_osm_power_assets(iso2_code))
    fossil_gdf.to_file(OUT_DIR / output_filename, driver="GPKG")

    print(f"✅ ISO '{iso2_code}': {len(fossil_gdf)} fossil‐fuel assets saved → '{output_filename}'\n")


⏳ Downloading OSM power assets for ISO 'ET' …
✅ ISO 'ET': 5 fossil‐fuel assets saved → 'ethiopia_power_fossil.gpkg'

⏳ Downloading OSM power assets for ISO 'IQ' …
✅ ISO 'IQ': 172 fossil‐fuel assets saved → 'iraq_power_fossil.gpkg'



### 2.3 Mesh File to CSV files

Convert the mesh files from GeoPackage (.gpkg) format to CSV format.

Addis Ababa:

In [None]:
import geopandas as gpd

# Addis Ababa: write every GeoPackage of the mesh folder as a CSV file
file_path = DATA_PATH.joinpath('addis-mesh-data')
output_path = DATA_PATH.joinpath('addis-mesh-data-csv')
output_path.mkdir(exist_ok=True)

files = [*file_path.glob("*.gpkg")]

for mesh_file in files:
    print(mesh_file.stem)
    mesh = gpd.read_file(mesh_file)
    # store the geometry as WKT text; missing/empty geometries become None
    mesh["geometry"] = mesh["geometry"].apply(lambda shape: shape.wkt if shape else None)
    mesh.to_csv(output_path / f"{mesh_file.stem}.csv", index=False)

Baghdad:

In [None]:
# Baghdad: write every GeoPackage of the mesh folder as a CSV file
file_path = DATA_PATH.joinpath('baghdad-mesh-data')
output_path = DATA_PATH.joinpath('baghdad-mesh-data-csv')
output_path.mkdir(exist_ok=True)

files = [*file_path.glob("*.gpkg")]

for mesh_file in files:
    print(mesh_file.stem)
    mesh = gpd.read_file(mesh_file)
    # store the geometry as WKT text; missing/empty geometries become None
    mesh["geometry"] = mesh["geometry"].apply(lambda shape: shape.wkt if shape else None)
    mesh.to_csv(output_path / f"{mesh_file.stem}.csv", index=False)