In [273]:
import pandas as pd
import re

REGIONS = {
    "i": "Region I - Ilocos Region",
    "ilocos": "Region I - Ilocos Region",
    "1": "Region I - Ilocos Region",

    "ii": "Region II - Cagayan Valley",
    "cagayan": "Region II - Cagayan Valley",
    "2": "Region II - Cagayan Valley",

    "iii": "Region III - Central Luzon",
    "centralluzon": "Region III - Central Luzon",
    "3": "Region III - Central Luzon",

    "iva": "Region IVA - CALABARZON",
    "iv": "Region IVA - CALABARZON",
    "iv-a": "Region IVA - CALABARZON",
    "calabarzon": "Region IVA - CALABARZON",
    "4": "Region IVA - CALABARZON",
    "4a": "Region IVA - CALABARZON",
    "4-a": "Region IVA - CALABARZON",

    "v": "Region V - Bicol",
    "bicol": "Region V - Bicol",
    "5": "Region V - Bicol",

    "vi": "Region VI - Western Visayas",
    "westernvisayas": "Region VI - Western Visayas",
    "southernvisayas": "Region VI - Western Visayas",
    "6": "Region VI - Western Visayas",

    "vii": "Region VII - Central Visayas",
    "centralvisayas": "Region VII - Central Visayas",
    "7": "Region VII - Central Visayas",

    "viii": "Region VIII - Eastern Visayas",
    "easternvisayas": "Region VIII - Eastern Visayas",
    "8": "Region VIII - Eastern Visayas",

    "ix": "Region IX - Zamboanga Peninsula",
    "zamboanga": "Region IX - Zamboanga Peninsula",
    "9": "Region IX - Zamboanga Peninsula",

    "x": "Region X - Northern Mindanao",
    "northernmindanao": "Region X - Northern Mindanao",
    "10": "Region X - Northern Mindanao",

    "xi": "Region XI - Davao",
    "davao": "Region XI - Davao",
    "11": "Region XI - Davao",

    "xii": "Region XII - SOCCSKSARGEN",
    "soccsksargen": "Region XII - SOCCSKSARGEN",
    "12": "Region XII - SOCCSKSARGEN",

    "ncr": "National Capital Region",
    "nationalcapital": "National Capital Region",
    "manila": "National Capital Region",
    "metromanila": "National Capital Region",

    "car": "Cordillera Administrative Region",
    "cordillera": "Cordillera Administrative Region",

    "xiii": "Region XIII - Caraga",
    "caraga": "Region XIII - Caraga",
    "13": "Region XIII - Caraga",

    "armm": "Bangsamoro Autonomous Region in Muslim Mindanao",
    "barmm": "Bangsamoro Autonomous Region in Muslim Mindanao",

    "ivb": "Region IVB - MIMAROPA",
    "iv-b": "Region IVB - MIMAROPA",
    "mimaropa": "Region IVB - MIMAROPA",
    "4b": "Region IVB - MIMAROPA",
    "4-b": "Region IVB - MIMAROPA",

}

def remove_fillers(loc: str) -> str:
    loc = loc.strip()
    fillers = ["regions", "province", "city", "valley", "peninsula", "district", "island", "isl", "in", "municipality", "municipalities"]

    if not loc:
        return ""
    # Create regex pattern: \b = word boundary
    pattern = r"\b(?:{})\b".format("|".join(fillers))

    # Replace filler words with empty string
    cleaned = re.sub(pattern, "", loc, flags=re.IGNORECASE)
    # Remove extra whitespace
    cleaned = " ".join(cleaned.split())

    return cleaned

import re

def extract_region_number(token: str) -> str | None:
    """
    Extract the region identifier from a token if it contains a Philippine region reference.

    This function looks for 'Region' followed by either a Roman numeral (I, II, III, …, XIII)
    or an integer (1, 2, …, 13) anywhere in the token.

    It avoids false positives for standalone letters or Roman numerals not tied to 'Region'.

    Examples:
        "Region V - Bicol" -> "V"
        "survey in Region 5" -> "5"
        "sahdjkhasjdk region 1 fjdskfhsdj" -> "1"
        "village V" -> None  # does not match random standalone 'V'
    """
    pattern = r"\bregion\s+([IVXLCDM]+|\d+)\b"
    match = re.search(pattern, token, flags=re.IGNORECASE)
    if match:
        return match.group(1).upper()  # always return Roman numerals uppercase for consistency
    return None


def province_to_region(loc: str) -> str:
    REGION_PROVINCE_MAP = {
        "Region I - Ilocos Region": ["ilocosnorte", "ilocossur", "vigan", "launion", "pangasinan"],
        "Region II - Cagayan Valley": ["batanes", "isabela", "nuevavizcaya", "nuevaviscaya", "quirino"],
        "Region III - Central Luzon": ["aurora", "bataan", "bulacan", "pampanga", "nuevaecija", "tarlac", "zambales"],
        "Region IVA - CALABARZON": ["batangas", "cavite", "laguna", "quezon", "rizal"],
        "Region IVB - MIMAROPA": ["marinduque", "occidentalmindoro", "orientalmindoro", "palawan", "romblon"],
        "Region V - Bicol": ["albay", "camarinesnorte", "camarinessur", "catanduanes", "masbate", "sorsogon"],
        "Region VI - Western Visayas": ["aklan", "atique", "capiz", "guimaras", "negrosoccidental", "iloilo", "ilo-ilo"],
        "Region VII - Central Visayas": ["bohol", "cebu", "negrosoriental", "siquijor"],
        "Region VIII - Eastern Visayas": ["biliran", "easternsamar", "leyte", "northernsamar", "southernsamar", "southernleyte", "westernsamar"],
        "Region IX - Zamboanga Peninsula": ["zamboangadelnorte", "zamboangadelsur", "zamboangasibugay"],
        "Region X - Northern Mindanao": ["bukidnon", "camiguin", "lanaodelnorte", "misamisoccidental", "misamisoriental"],
        "Region XI - Davao": ["compostela", "davaodelnorte", "davaodelsur", "davaooriental", "davaooccidental", "davao"],
        "Region XII - SOCCSKSARGEN": ["cotabato", "sarangani", "southcotabato", "northcotabato", "cotabato", "sultankudarat"],
        "Region XIII - Caraga": ["agusandelnorte", "agusandelsur", "dinagatislands", "dinagat", "surigao", "surigaodelnorte", "surigaodelsur"],
        # "National Capital Region": [],
        "Cordillera Administrative Region": ["abra", "apayao", "benguet", "ifugao", "kalinga", "mt.province", "mtprovince", "mountainprovince"],
        "Bangsamoro Autonomous Region in Muslim Mindanao": ["basilan", "lanaodelsur", "maguindanao", "shariffkabunsuan", "sulu", "tawitawi", "tawi-tawi"]
    }
    for region, provinces in REGION_PROVINCE_MAP.items():
        for province in provinces:
            if province in loc:
                return region
    return None

def other_region_associations(loc_string):
    if loc_string == 'luzon':
        return ["Region I - Ilocos Region", "Region II - Cagayan Valley", "Region III - Central Luzon", "Region IVA - CALABARZON", "Region IVB - MIMAROPA", "Region V - Bicol", "Cordillera Administrative Region", "National Capital Region"]
    elif loc_string == 'northernluzon':
        return ["Region I - Ilocos Region", "Region II - Cagayan Valley", "Cordillera Administrative Region"]
    elif loc_string == 'mindanao':
        return ["Region IX - Zamboanga Peninsula", "Region X - Northern Mindanao", "Region XI - Davao", "Region XII - SOCCSKSARGEN", "Region XIII - Caraga", "Bangsamoro Autonomous Region in Muslim Mindanao"]
    elif loc_string == 'luzonstrait':
        return ["Region I - Ilocos Region", "Region II - Cagayan Valley"]
    elif loc_string == "lingig":
        return ["Region XIII - Caraga"]
    elif loc_string == "pitogo":
        return ["Region IVA - CALABARZON"]
    elif "negros" in loc_string:
        return ["Region VI - Western Visayas", "Region VII - Central Visayas"]
    return None


import re

DIRECTION_ALIASES = {
    "north": "Northern",
    "south": "Southern",
    "east": "Eastern",
    "west": "Western",
    "northern": "Northern",
    "southern": "Southern",
    "eastern": "Eastern",
    "western": "Western",
    "central": "Central",
}

ISLANDS = ["Luzon", "Visayas", "Mindanao"]

def expand_directions(text: str) -> str:
    for island in ISLANDS:
        # Match: "Eastern and Central Visayas"
        pattern = rf"\b((?:{'|'.join(DIRECTION_ALIASES.keys())})(?:\s+and\s+(?:{'|'.join(DIRECTION_ALIASES.keys())}))*)\s+{island}\b"

        def repl(match):
            dirs = re.split(r"\s+and\s+", match.group(1))
            expanded = [f"{DIRECTION_ALIASES[d.lower()]} {island}" for d in dirs]
            return " and ".join(expanded)

        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text.lower()


def clean_location(loc_string: str) -> [str]:
    # Split Location string to obtain list of possible locations
    loc_string = loc_string.lower()
    loc_string = loc_string.replace(".", "")
    loc_string = loc_string.replace("-", "")
    loc_string = expand_directions(loc_string)
    # Split on comma, semicolon, "and", "&", "(" , ")"
    loc_parts = re.split(r"[;,:]|\band\b|&|\(|\)", loc_string)
    # Clean whitespace and title-case
    parts = [p.strip() for p in loc_parts if p]
    print(parts)
    locs = []
    for part in parts:
        part = remove_fillers(part)
        part = "".join(part.split())
        if part in REGIONS:
            locs.append(REGIONS[part])
        elif extract_region_number(part):
            locs.append(REGIONS[extract_region_number(part)])
        elif province_to_region(part):
            locs.append(province_to_region(part))
        elif other_region_associations(part):
            locs = locs + other_region_associations(part)



    # print(set(locs))
    return ";".join(list(set(locs)))


# df = pd.read_csv('./datasets/raw/emdat.csv', header=0, index_col=0)
#
# # Drop rows where Location is null or empty
# df = df.dropna(subset=["Location"])
# df = df[df["Location"].str.strip() != ""]
#
# # Apply your clean_location function first
# df['PH Regions'] = df['Location'].dropna().apply(clean_location)
# # Reset index temporarily so DisNo. becomes a normal column
# df_reset = df.reset_index()
# df_reset.to_csv('./datasets/clean/emdat_ph_regions.csv', index=False, header=True)
# # Display a preview
# pd.set_option('display.max_colwidth', None)
# display(df_reset.loc[:, ["DisNo.", "Location", "PH Regions"]].head(10))
# # Page parameters
# page =40
# chunk_size = 10
# start = (page - 1) * chunk_size
# end = start + chunk_size
#
# # Slice DataFrame manually
# page_df = df.iloc[start:end][['Location', 'CleanedLocations']]
# pd.set_option('display.max_colwidth', None)
#
# # Display the first page
# display(page_df)

print(clean_location('what about if it is Region 1, 2, 3'))


['what about if it is region 1', '2', '3']
Region II - Cagayan Valley;Region III - Central Luzon
