In [96]:
from enum import Enum
import re
import pandas as pd


class PHRegion(Enum):
    REGION_I = "Region I - Ilocos Region"
    REGION_II = "Region II - Cagayan Valley"
    REGION_III = "Region III - Central Luzon"
    REGION_IVA = "Region IVA - CALABARZON"
    REGION_IVB = "Region IVB - MIMAROPA"
    REGION_V = "Region V - Bicol"
    REGION_VI = "Region VI - Western Visayas"
    REGION_VII = "Region VII - Central Visayas"
    REGION_VIII = "Region VIII - Eastern Visayas"
    REGION_IX = "Region IX - Zamboanga Peninsula"
    REGION_X = "Region X - Northern Mindanao"
    REGION_XI = "Region XI - Davao"
    REGION_XII = "Region XII - SOCCSKSARGEN"
    REGION_XIII = "Region XIII - Caraga"
    NCR = "National Capital Region"
    CAR = "Cordillera Administrative Region"
    BARMM = "Bangsamoro Autonomous Region in Muslim Mindanao"

In [97]:
DIRECTION_ALIASES = {
    "north": "Northern",
    "south": "Southern",
    "east": "Eastern",
    "west": "Western",
    "northern": "Northern",
    "southern": "Southern",
    "eastern": "Eastern",
    "western": "Western",
    "central": "Central",
}

ISLANDS = ["Luzon", "Visayas", "Mindanao"]

def expand_directional_regions(text: str) -> str:
    """
        Expand directional modifiers (e.g., North, South, East, West, Central) in a location string
        so that each direction is explicitly attached to its corresponding island group (Luzon, Visayas, Mindanao).

        Examples:
            "Northern, Central, and Southern Luzon" -> "Northern Luzon, Central Luzon, Southern Luzon"
            "Western and Eastern Visayas" -> "Western Visayas, Eastern Visayas"
            "North, South, and East Mindanao" -> "North Mindanao, South Mindanao, East Mindanao"

        This function handles any number of directional words preceding an island group and preserves
        other non-directional locations in the string.
    """
    for island in ISLANDS:
        pattern = rf"\b((?:{'|'.join(DIRECTION_ALIASES.keys())})(?:\s+and\s+(?:{'|'.join(DIRECTION_ALIASES.keys())}))*)\s+{island}\b"

        def repl(match):
            dirs = re.split(r"\s+and\s+", match.group(1))
            expanded = [f"{DIRECTION_ALIASES[d.lower()]} {island}" for d in dirs]
            return " and ".join(expanded)

        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text.lower()

In [98]:
def remove_fillers(token: str) -> str:
    """
        Remove common filler words from a string to simplify and standardize it.

        Filler words include generic geographic terms such as 'region', 'province', 'city',
        'valley', 'peninsula', 'district', 'island', 'municipality', and similar words that
        do not help identify the specific location.

        This function:
        - Strips leading/trailing whitespace
        - Removes the filler words (case-insensitive)
        - Collapses multiple spaces into a single space
    """
    token = token.strip()
    fillers = ["region", "regions", "province", "city", "valley", "peninsula", "district", "island", "isl", "in", "municipality", "municipalities"]

    if not token:
        return ""
    pattern = r"\b(?:{})\b".format("|".join(fillers))

    cleaned = re.sub(pattern, "", token, flags=re.IGNORECASE)
    cleaned = " ".join(cleaned.split())

    return cleaned

In [99]:
REGIONS = {
    "i": PHRegion.REGION_I.value,
    "ilocos": PHRegion.REGION_I.value,
    "1": PHRegion.REGION_I.value,

    "ii": PHRegion.REGION_II.value,
    "cagayan": PHRegion.REGION_II.value,
    "2": PHRegion.REGION_II.value,

    "iii": PHRegion.REGION_III.value,
    "centralluzon": PHRegion.REGION_III.value,
    "3": PHRegion.REGION_III.value,

    "iva": PHRegion.REGION_IVA.value,
    "iv": PHRegion.REGION_IVA.value,
    "iv-a": PHRegion.REGION_IVA.value,
    "calabarzon": PHRegion.REGION_IVA.value,
    "4": PHRegion.REGION_IVA.value,
    "4a": PHRegion.REGION_IVA.value,
    "4-a": PHRegion.REGION_IVA.value,

    "v": PHRegion.REGION_V.value,
    "bicol": PHRegion.REGION_V.value,
    "5": PHRegion.REGION_V.value,

    "vi": PHRegion.REGION_VI.value,
    "westernvisayas": PHRegion.REGION_VI.value,
    "southernvisayas": PHRegion.REGION_VI.value,
    "6": PHRegion.REGION_VI.value,

    "vii": PHRegion.REGION_VII.value,
    "centralvisayas": PHRegion.REGION_VII.value,
    "7": PHRegion.REGION_VII.value,

    "viii": PHRegion.REGION_VIII.value,
    "easternvisayas": PHRegion.REGION_VIII.value,
    "8": PHRegion.REGION_VIII.value,

    "ix": PHRegion.REGION_IX.value,
    "zamboanga": PHRegion.REGION_IX.value,
    "9": PHRegion.REGION_IX.value,

    "x": PHRegion.REGION_X.value,
    "northernmindanao": PHRegion.REGION_X.value,
    "10": PHRegion.REGION_X.value,

    "xi": PHRegion.REGION_XI.value,
    "davao": PHRegion.REGION_XI.value,
    "11": PHRegion.REGION_XI.value,

    "xii": PHRegion.REGION_XII.value,
    "soccsksargen": PHRegion.REGION_XII.value,
    "12": PHRegion.REGION_XII.value,

    "ncr": PHRegion.NCR.value,
    "nationalcapital": PHRegion.NCR.value,
    "manila": PHRegion.NCR.value,
    "metromanila": PHRegion.NCR.value,

    "car": PHRegion.CAR.value,
    "cordillera": PHRegion.CAR.value,

    "xiii": PHRegion.REGION_XIII.value,
    "caraga": PHRegion.REGION_XIII.value,
    "13": PHRegion.REGION_XIII.value,

    "armm": PHRegion.BARMM.value,
    "barmm": PHRegion.BARMM.value,

    "ivb": PHRegion.REGION_IVB.value,
    "iv-b": PHRegion.REGION_IVB.value,
    "mimaropa": PHRegion.REGION_IVB.value,
    "4b": PHRegion.REGION_IVB.value,
    "4-b": PHRegion.REGION_IVB.value,
}


In [100]:
def extract_region_number(token: str) -> str or None:
    """
        Check if token starts with 'Region' + Roman numeral, and return the numeral if it exists.

        Examples:
            "something region v something" -> "v"
            "something region 5 something" -> "5"
    """
    pattern = r"region+([IVXLCDM]+|\d+)\b"
    match = re.search(pattern, token, flags=re.IGNORECASE)
    if match:
        return match.group(1)
    return None

In [101]:
def province_to_region(loc: str) -> str or None:
    """
        Map a given location string to its corresponding Philippine region based on known provinces.

        The function checks if any of the predefined province names appear within the input string.
        If a match is found, it returns the associated region. The match is substring-based, so the
        input does not need to exactly equal the province name.

        Examples:
            "cebu" appears in the list so it gets mapped to "Region VII - Central Visayas"
            "davaodeoro" contains "davao" so it gets mapped to "Region XI - Davao"
            "california" does not appear in the list so None is returned
    """
    REGION_PROVINCE_MAP = {
        PHRegion.REGION_I.value: ["ilocosnorte", "ilocossur", "vigan", "launion", "pangasinan"],
        PHRegion.REGION_II.value: ["batanes", "isabela", "nuevavizcaya", "nuevaviscaya", "quirino"],
        PHRegion.REGION_III.value: ["aurora", "bataan", "bulacan", "pampanga", "nuevaecija", "tarlac", "zambales"],
        PHRegion.REGION_IVA.value: ["batangas", "cavite", "laguna", "quezon", "rizal"],
        PHRegion.REGION_IVB.value: ["marinduque", "occidentalmindoro", "orientalmindoro", "palawan", "romblon"],
        PHRegion.REGION_V.value: ["albay", "camarinesnorte", "camarinessur", "catanduanes", "masbate", "sorsogon"],
        PHRegion.REGION_VI.value: ["aklan", "atique", "capiz", "guimaras", "negrosoccidental", "iloilo", "ilo-ilo"],
        PHRegion.REGION_VII.value: ["bohol", "cebu", "negrosoriental", "siquijor"],
        PHRegion.REGION_VIII.value: ["biliran", "easternsamar", "leyte", "northernsamar", "southernsamar", "southernleyte", "westernsamar"],
        PHRegion.REGION_IX.value: ["zamboangadelnorte", "zamboangadelsur", "zamboangasibugay"],
        PHRegion.REGION_X.value: ["bukidnon", "camiguin", "lanaodelnorte", "misamisoccidental", "misamisoriental"],
        PHRegion.REGION_XI.value: ["compostela", "davaodelnorte", "davaodelsur", "davaooriental", "davaooccidental", "davao"],
        PHRegion.REGION_XII.value: ["cotabato", "sarangani", "southcotabato", "northcotabato", "cotabato", "sultankudarat"],
        PHRegion.REGION_XIII.value: ["agusandelnorte", "agusandelsur", "dinagatislands", "dinagat", "surigao", "surigaodelnorte", "surigaodelsur"],
        PHRegion.CAR.value: ["abra", "apayao", "benguet", "ifugao", "kalinga", "mt.province", "mtprovince", "mountainprovince"],
        PHRegion.BARMM.value: ["basilan", "lanaodelsur", "maguindanao", "shariffkabunsuan", "sulu", "tawitawi", "tawi-tawi"],
        # NCR intentionally left out because it doesn't have provinces
    }
    for region, provinces in REGION_PROVINCE_MAP.items():
        for province in provinces:
            if province in loc:
                return region
    return None

In [102]:
def other_region_associations(token: str) -> [str] or None:
    """
        Handle dataset-specific edge cases where certain location strings map to multiple regions
        or are ambiguous, and cannot be determined by simple province matching.

        This function returns a list of Philippine regions associated with special keywords or
        phrases found in the input string. It is designed specifically for the quirks and edge
        cases in the current dataset.

        Examples:
            "luzon" -> ['Region I - Ilocos Region', 'Region II - Cagayan Valley', 'Region III - Central Luzon',
             'Region IVA - CALABARZON', 'Region IVB - MIMAROPA', 'Region V - Bicol',
             'Cordillera Administrative Region', 'National Capital Region']
            "negros" -> ['Region VI - Western Visayas', 'Region VII - Central Visayas']
    """
    if token == 'luzon':
        return [
            PHRegion.REGION_I.value,
            PHRegion.REGION_II.value,
            PHRegion.REGION_III.value,
            PHRegion.REGION_IVA.value,
            PHRegion.REGION_IVB.value,
            PHRegion.REGION_V.value,
            PHRegion.CAR.value,
            PHRegion.NCR.value
        ]
    elif token == 'northernluzon':
        return [
            PHRegion.REGION_I.value,
            PHRegion.REGION_II.value,
            PHRegion.CAR.value
        ]
    elif token == 'visayas':
        return [
            PHRegion.REGION_VI.value,
            PHRegion.REGION_VII.value,
            PHRegion.REGION_VIII.value
        ]
    elif token == 'mindanao':
        return [
            PHRegion.REGION_IX.value,
            PHRegion.REGION_X.value,
            PHRegion.REGION_XI.value,
            PHRegion.REGION_XII.value,
            PHRegion.REGION_XIII.value,
            PHRegion.BARMM.value
        ]
    elif token == 'luzonstrait':
        return [
            PHRegion.REGION_I.value,
            PHRegion.REGION_II.value
        ]
    elif token == "lingig":
        return [PHRegion.REGION_XIII.value]
    elif token == "pitogo":
        return [PHRegion.REGION_IVA.value]
    elif "negros" in token:
        return [
            PHRegion.REGION_VI.value,
            PHRegion.REGION_VII.value
        ]
    return None

In [103]:
def clean_location(loc_string: str) -> str:
    """
        Process a raw location string from the EMDAT dataset and map it to Philippine regions.

        This function applies a multi-stage cleaning and mapping pipeline to convert free-form
        location text into standardized PH Region names. The process includes:

        1. Normalization:
           - Converts text to lowercase.
           - Removes periods and hyphens.
           - Expands directional modifiers (e.g., "North, Central, and South Luzon" -> "North Luzon, Central Luzon, South Luzon").

        2. Tokenization:
           - Splits the cleaned string into tokens using commas, semicolons, "and", "&", and parentheses.

        3. Token Processing Pipeline (applied to each token):
           - Removes filler words (e.g., "region", "province", "city", etc.).
           - Maps tokens directly to known regions via aliases.
           - Detects "Region" numbers (both Roman numerals and digits) and maps to regions.
           - Maps known provinces to their corresponding regions.
           - Handles dataset-specific edge cases (e.g., "negros", "luzon", "pitogo").

        4. Output:
           - Returns a semicolon-separated string of unique PH Regions corresponding to the input location.
    """

    # STEP 1: Clean location string by converting to lowercase, removing unnecessary symbols, and expanding directional modifiers
    loc_string = loc_string.lower()
    loc_string = loc_string.replace(".", "")
    loc_string = loc_string.replace("-", "")
    loc_string = expand_directional_regions(loc_string)

    # STEP 2: Splits the cleaned string into tokens using commas, semicolons, "and", "&", and parentheses.
    tokens = re.split(r"[;,:]|\band\b|&|\(|\)", loc_string)

    # STEP 3: Apply the Token processing pipeline stages
    regions = []
    for token in tokens:
        # Clean token by removing extra white spaces and filler words
        token = token.strip()
        token = remove_fillers(token)
        token = "".join(token.split())
        # Main token processing pipeline stages
        if token in REGIONS:  # Cleanly map to Region
            regions.append(REGIONS[token])
        elif extract_region_number(token):  # Clean token before mapping to Region
            regions.append(REGIONS[extract_region_number(token)])
        elif province_to_region(token):  #  Map token to region if a province of that region is a substring
            regions.append(province_to_region(token))
        elif other_region_associations(token):  # Handle mapping of edge cases
            regions.extend(other_region_associations(token))

    # STEP 4: Return unique list of Regions separated by a semicolon
    return ";".join(list(set(regions)))

In [104]:
# MAIN DRIVER

# Read csv
emdat_df = pd.read_csv('./datasets/raw/emdat.csv', header=0, index_col=0)

# Drop rows where Location is null or empty
emdat_df = emdat_df.dropna(subset=["Location"])
emdat_df = emdat_df[emdat_df["Location"].str.strip() != ""]

# Apply clean_location pipeline and save to PH Regions column
emdat_df['PH Regions'] = emdat_df['Location'].dropna().apply(clean_location)

# Reset index temporarily so DisNo. becomes a normal column
df_reset = emdat_df.reset_index()

# Save
df_reset.to_csv('./datasets/clean/emdat_ph_regions.csv', index=False, header=True)

# Display a preview
pd.set_option('display.max_colwidth', None)
display(df_reset.loc[:, ["DisNo.", "Location", "PH Regions"]].head(10))

Unnamed: 0,DisNo.,Location,PH Regions
0,2000-0396-PHL,National Capital region (NCR) province,National Capital Region
1,2000-0414-PHL,Metropolitan Manila district (National Capital region (NCR) province),National Capital Region
2,2000-0597-PHL,Metropolitan Manila district (NCR province),National Capital Region
3,2000-0783-PHL,Zamboanga,Region IX - Zamboanga Peninsula
4,2022-0832-PHL,"Mimaropa, Bicol, Eastern Visayas, Zamboanga Peninsula, Northern Mindanao, Davao Region and Caraga (Mindanao and Luzon)",Bangsamoro Autonomous Region in Muslim Mindanao;Region I - Ilocos Region;National Capital Region;Region VIII - Eastern Visayas;Region XI - Davao;Region X - Northern Mindanao;Region II - Cagayan Valley;Cordillera Administrative Region;Region XIII - Caraga;Region V - Bicol;Region IVB - MIMAROPA;Region IVA - CALABARZON;Region IX - Zamboanga Peninsula;Region III - Central Luzon;Region XII - SOCCSKSARGEN
5,2020-0463-PHL,"Calabarzon, Mimaropa, Bicol Regions; NCR, II, III, V, VIII, CAR regions",National Capital Region;Region VIII - Eastern Visayas;Region II - Cagayan Valley;Cordillera Administrative Region;Region V - Bicol;Region IVB - MIMAROPA;Region IVA - CALABARZON;Region III - Central Luzon
6,2019-0162-PHL,Castillejos (Zambales),Region III - Central Luzon
7,2019-0489-PHL,North Cotabato Province (Mindanao Island),Bangsamoro Autonomous Region in Muslim Mindanao;Region XI - Davao;Region X - Northern Mindanao;Region XIII - Caraga;Region IX - Zamboanga Peninsula;Region XII - SOCCSKSARGEN
8,2019-0619-PHL,"Davao del Sur Province (Davao Region, Mindanao Island)",Bangsamoro Autonomous Region in Muslim Mindanao;Region XI - Davao;Region X - Northern Mindanao;Region XIII - Caraga;Region IX - Zamboanga Peninsula;Region XII - SOCCSKSARGEN
9,2017-0050-PHL,Surigao Del Norte district (Region XIII (Caraga) province),Region XIII - Caraga


In [105]:
# Driver for edge cases
edge_cases_df = pd.read_csv('datasets/raw/edge_cases.csv', delimiter=';', header=0)
edge_cases_df['PH Regions'] = edge_cases_df['Location'].dropna().apply(clean_location)
display(edge_cases_df)

Unnamed: 0,Location,PH Regions
0,"Luzon Strait (near Calayan Island, southern Babuyan Islands)",Region II - Cagayan Valley;Region I - Ilocos Region
1,"Massara, Maco Municipality (Davao de Oro Province, Mindanao Island)",Bangsamoro Autonomous Region in Muslim Mindanao;Region XI - Davao;Region X - Northern Mindanao;Region XIII - Caraga;Region IX - Zamboanga Peninsula;Region XII - SOCCSKSARGEN
2,"Cagayan Valley, Central Luzon, Calabarzon, Mimaropa, Bicol, and Eastern and Central Visayas.",Region VII - Central Visayas;Region VIII - Eastern Visayas;Region II - Cagayan Valley;Region V - Bicol;Region IVB - MIMAROPA;Region IVA - CALABARZON;Region III - Central Luzon
3,Lingig municipality (Mindanao Island),Bangsamoro Autonomous Region in Muslim Mindanao;Region XI - Davao;Region X - Northern Mindanao;Region XIII - Caraga;Region IX - Zamboanga Peninsula;Region XII - SOCCSKSARGEN
4,Northern Luzon island,Region II - Cagayan Valley;Region I - Ilocos Region;Cordillera Administrative Region
5,Northern Luzon,Region II - Cagayan Valley;Region I - Ilocos Region;Cordillera Administrative Region
6,Western and Central Visayas region (Negros Isl.),Region VI - Western Visayas;Region VII - Central Visayas
7,Pitogo (Lucon Isl.),Region IVA - CALABARZON
8,"Calabarzon, Mimaropa, Region 5, Region 6, Region 7, Region 8, Region 10, Caraga, and the Negros Islands Region (NIR)",Region VI - Western Visayas;Region VII - Central Visayas;Region VIII - Eastern Visayas;Region X - Northern Mindanao;Region XIII - Caraga;Region V - Bicol;Region IVB - MIMAROPA;Region IVA - CALABARZON
9,"Mindanao, southern Visayas and northern Palawan",Bangsamoro Autonomous Region in Muslim Mindanao;Region VI - Western Visayas;Region XI - Davao;Region X - Northern Mindanao;Region XIII - Caraga;Region IVB - MIMAROPA;Region IX - Zamboanga Peninsula;Region XII - SOCCSKSARGEN
