In [174]:
import pandas as pd
import re

REGIONS = {
    "i": "Region I - Ilocos Region",
    "ilocos": "Region I - Ilocos Region",

    "ii": "Region II - Cagayan Valley",
    "cagayan": "Region II - Cagayan Valley",

    "iii": "Region III - Central Luzon",
    "centralluzon": "Region III - Central Luzon",

    "iva": "Region IVA - CALABARZON",
    "iv": "Region IVA - CALABARZON",
    "iv-a": "Region IVA - CALABARZON",
    "calabarzon": "Region IVA - CALABARZON",

    "v": "Region V - Bicol",
    "bicol": "Region V - Bicol",

    "vi": "Region VI - Western Visayas",
    "westernvisayas": "Region VI - Western Visayas",

    "vii": "Region VII - Central Visayas",
    "centralvisayas": "Region VII - Central Visayas",

    "viii": "Region VIII - Eastern Visayas",
    "easternvisayas": "Region VIII - Eastern Visayas",

    "ix": "Region IX - Zamboanga Peninsula",
    "zamboanga": "Region IX - Zamboanga Peninsula",

    "x": "Region X - Northern Mindanao",
    "northernmindanao": "Region X - Northern Mindanao",

    "xi": "Region XI - Davao",
    "davao": "Region XI - Davao",

    "xii": "Region XII - SOCCSKSARGEN",
    "soccsksargen": "Region XII - SOCCSKSARGEN",

    "ncr": "National Capital Region",
    "nationalcapital": "National Capital Region",
    "manila": "National Capital Region",
    "metromanila": "National Capital Region",

    "car": "Cordillera Administrative Region",
    "cordillera": "Cordillera Administrative Region",

    "xiii": "Region XIII - Caraga",
    "caraga": "Region XIII - Caraga",

    "armm": "Bangsamoro Autonomous Region in Muslim Mindanao",
    "barmm": "Bangsamoro Autonomous Region in Muslim Mindanao",

    "ivb": "Region IVB - MIMAROPA",
    "iv-b": "Region IVB - MIMAROPA",
    "mimaropa": "Region IVB - MIMAROPA",
}

def remove_fillers(loc: str) -> str:
    loc = loc.strip()
    fillers = ["region", "regions", "province", "city", "valley", "peninsula", "district", "island", "isl", "in"]

    if not loc:
        return ""
    # Create regex pattern: \b = word boundary
    pattern = r"\b(?:{})\b".format("|".join(fillers))

    # Replace filler words with empty string
    cleaned = re.sub(pattern, "", loc, flags=re.IGNORECASE)
    # Remove extra whitespace
    cleaned = " ".join(cleaned.split())

    return cleaned

def extract_region_number(loc: str):
    """
    Check if token starts with 'Region' + Roman numeral,
    and return the numeral if it exists.
    """
    pattern = r"^Region\s+([IVXLCDM]+)\b"
    match = re.search(pattern, loc, flags=re.IGNORECASE)
    if match:
        return match.group(1)  # The Roman numeral
    return None

def province_to_region(loc: str) -> str:
    REGION_PROVINCE_MAP = {
        "Region I - Ilocos Region": ["ilocosnorte", "ilocossur", "vigan", "launion", "pangasinan"],
        "Region II - Cagayan Valley": ["batanes", "isabela", "nuevavizcaya", "nuevaviscaya", "quirino"],
        "Region III - Central Luzon": ["aurora", "bataan", "bulacan", "pampanga", "nuevaecija", "tarlac", "zambales"],
        "Region IVA - CALABARZON": ["batangas", "cavite", "laguna", "quezon", "rizal"],
        "Region IVB - MIMAROPA": ["marinduque", "occidentalmindoro", "orientalmindoro", "palawan", "romblon"],
        "Region V - Bicol": ["albay", "camarinesnorte", "camarinessur", "catanduanes", "masbate", "sorsogon"],
        "Region VI - Western Visayas": ["aklan", "atique", "capiz", "guimaras", "negrosoccidental", "iloilo", "ilo-ilo"],
        "Region VII - Central Visayas": ["bohol", "cebu", "negrosoriental", "siquijor"],
        "Region VIII - Eastern Visayas": ["biliran", "easternsamar", "leyte", "northernsamar", "southernsamar", "southernleyte", "westernsamar"],
        "Region IX - Zamboanga Peninsula": ["zamboangadelnorte", "zamboangadelsur", "zamboangasibugay"],
        "Region X - Northern Mindanao": ["bukidnon", "camiguin", "lanaodelnorte", "misamisoccidental", "misamisoriental"],
        "Region XI - Davao": ["compostela", "davaodelnorte", "davaodelsur", "davaooriental", "davaooccidental"],
        "Region XII - SOCCSKSARGEN": ["cotabato", "sarangani", "southcotabato", "northcotabato", "cotabato", "sultankudarat"],
        "Region XIII - Caraga": ["agusandelnorte", "agusandelsur", "dinagatislands", "dinagat", "surigao", "surigaodelnorte", "surigaodelsur"],
        # "National Capital Region": [],
        "Cordillera Administrative Region": ["abra", "apayao", "benguet", "ifugao", "kalinga", "mt.province", "mtprovince", "mountainprovince"],
        "Bangsamoro Autonomous Region in Muslim Mindanao": ["basilan", "lanaodelsur", "maguindanao", "shariffkabunsuan", "sulu", "tawitawi", "tawi-tawi"]
    }
    for region, provinces in REGION_PROVINCE_MAP.items():
        for province in provinces:
            if province in loc:
                return region
    return None

def clean_location(loc_string: str) -> [str]:
    # Split Location string to obtain list of possible locations
    loc_string = loc_string.lower()
    loc_string = loc_string.replace(".", "")
    loc_string = loc_string.replace("-", "")
    # Split on comma, semicolon, "and", "&", "(" , ")"
    loc_parts = re.split(r"[;,:]|\band\b|&|\(|\)", loc_string)
    # Clean whitespace and title-case
    parts = [p.strip() for p in loc_parts if p]

    locs = []
    for part in parts:
        part = remove_fillers(part)
        part = "".join(part.split())
        if part in REGIONS:
            locs.append(REGIONS[part])
        elif extract_region_number(part):
            locs.append(REGIONS[extract_region_number(part)])
        elif province_to_region(part):
            locs.append(province_to_region(part))



    # print(set(locs))
    return set(locs)

df = pd.read_csv('./datasets/raw/emdat.csv')

# Apply your clean_location function first
df['CleanedLocations'] = df['Location'].dropna().apply(clean_location)

# Page parameters
page =36
chunk_size = 10
start = (page - 1) * chunk_size
end = start + chunk_size

# Slice DataFrame manually
page_df = df.iloc[start:end][['Location', 'CleanedLocations']]
pd.set_option('display.max_colwidth', None)

# Display the first page
display(page_df)

# print(clean_location('Glan, Kiamba, Polompok, and General Santos City (Davao Occidental in Region XI, southern Mindanao)'))


Unnamed: 0,Location,CleanedLocations
350,"Benito Soliven, Dinapigue, Dilvilacan, Maconacon, Palanan areas (Isabela district, Region II (Cagayan Valley) province), Aparri, Buguey, Calayan, Gonzaga, Santa Ana, Santa Teresita areas (Cagayan district, Region II (Cagayan Valley) province)",{Region II - Cagayan Valley}
351,"Cordillera Administrative region (CAR), Region I (Ilocos region), Region II (Cagayan Valley) provinces","{Region II - Cagayan Valley, Cordillera Administrative Region, Region I - Ilocos Region}"
352,Pangasinan province,{Region I - Ilocos Region}
353,"Janiuary area (Iloilo district, Region VI (Western Visayas) province), Valladolid area (Negros Occidental district, Region VI (Western Visayas) province), San Lorenzo area (Guimaras district, Region VI (Western Visayas) province), Bayawan, Siaton areas (Negros Oriental district, Region VII (Central Visayas) province), Kapatagan, Laia, Sapad areas (Lanao Del Norte district, Region X (Northern Mindanao) province)","{Region VI - Western Visayas, Region X - Northern Mindanao, Region VII - Central Visayas}"
354,"Camarines Norte, Catanduanes districts (Region V (Bicol region) province), Cordillera Administrative region (CAR), National Capital region (NCR) province, Region I (Ilocos region), Region II (Cagayan Valley), Region III (Central Luzon), Region IV-A (Calabarzon) provinces","{Region V - Bicol, Region II - Cagayan Valley, Region III - Central Luzon, Cordillera Administrative Region, Region IVA - CALABARZON, National Capital Region, Region I - Ilocos Region}"
355,"Nueva Vizcaya district (Region II (Cagayan Valley) province), Quezon, Batangas districts (Region IV-A (Calabarzon) province), Marinduque, Mindoro Oriental, Mindoro Occidental, Romblon districts (Region IV (Southern Tagalog) province), Catanduanes, Albay, Masbate, Sorsogon districts (Region V (Bicol region) province), Biliran districts (Region VII (Central Visayas) province), Samar, Northern Samar districts (Region VIII (Eastern Visayas) province), Quezon City area (Metropolitan Manila district, National Capital region (NCR) province)","{Region V - Bicol, Region II - Cagayan Valley, Region VIII - Eastern Visayas, Region IVA - CALABARZON, National Capital Region, Region VII - Central Visayas, Region IVB - MIMAROPA}"
356,"Ilocos Norte district (Region I (Ilocos region) province), Batanes, Cagayan districts (Region II (Cagayan valley) province)","{Region II - Cagayan Valley, Region I - Ilocos Region}"
357,"Ilocos Norte, Ilocos Sur, La Union, Pangasinan districts (Region I (Ilocos region) province), Cagayan, Isabela, Nueva Vizcaya, Quirino districts (Region II (Cagayan valley) province), Aurora, Bataan, Bulacan, Nueva Ecija, Pampanga, Tarlac, Zambales districts (Region III (Central Luzon) province), Batangas, Quezon, Rizal districts (Regio IV-A (Calabarzon) province), Camarines Norte, Sorgoson (Region V (Bicol region) province)), Abra, Apayao, Benguet, Ifugao, Kalinga, Mountain province (Cordillera Administrative region (CAR))","{Region V - Bicol, Region II - Cagayan Valley, Region III - Central Luzon, Cordillera Administrative Region, Region IVA - CALABARZON, Region I - Ilocos Region}"
358,"Apayao, Benguet, Cagayan, Kalinga, Isabela, Abra, Ilocos Norte and Ilocos Sur","{Region II - Cagayan Valley, Cordillera Administrative Region, Region I - Ilocos Region}"
359,"Palawan (Mimaropa); Capiz, Iloilo, Negros occidental (Region IV - Western Visayas); Bohol, Cebu, Negros Oriental, Siquijor (Region VII - Central Visayas); Biliran, Eastern Samar, Leyte, Samar, Southern Leyte (Region VII - Eastern Visayas); Agusan del Norte, Dinagat Isl., Surigao del Norte, Surigao del Sur (REgion XIII - Caraga)","{Region VI - Western Visayas, Region XIII - Caraga, Region VIII - Eastern Visayas, Region VII - Central Visayas, Region IVB - MIMAROPA}"
