In [7]:

from bs4 import BeautifulSoup

with open("female-detainee-cases.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")



In [11]:
# grab every <a> tag
all_links = soup.find_all("a")

# keep only those whose text starts with “Case ” and whose href ends with “.html”
case_links = [
    a for a in all_links
    if a.text.strip().startswith("Case ")
    and a.get("href", "").endswith(".html")
]



In [13]:
records = []
for a in case_links:
    href = a["href"]
    text = a.get_text(strip=True)
    # e.g. "Case 2657 Moy Chin See his wife"
    records.append({"href": href, "raw_text": text})




In [15]:
import re

parsed = []
pattern = re.compile(r"Case\s+(\d+)\s+(.+)")
for rec in records:
    m = pattern.match(rec["raw_text"])
    if not m:
        # flag for manual review
        parsed.append({
            **rec,
            "case_number": None,
            "name": None,
            "descriptor": None,
            "note": "FAILED TO PARSE"
        })
        continue

    num = m.group(1)
    remainder = m.group(2)  # e.g. "Moy Chin See his wife"
    
    # Heuristic: split off a trailing descriptor like "his wife", "alias …", "nee …"
    # You may need to refine this for cases like "Kwok Ah Ying and Kowk Sue (sisters)"
    parts = re.split(r"\s+(alias|nee|wife|daughter|and)\b", remainder, maxsplit=1)
    if len(parts) == 1:
        name, descriptor = parts[0], ""
    else:
        name = parts[0].strip()
        descriptor = remainder[len(name):].strip()

    parsed.append({
        **rec,
        "case_number": num,
        "name": name,
        "descriptor": descriptor
    })


In [19]:
import pandas as pd

df = pd.DataFrame(parsed)

# Optional cleaning:
df["case_number"] = df["case_number"].astype("Int64")  # integer column
df["name"] = df["name"].str.replace(r"^Mrs\.\s*", "", regex=True)
df["descriptor"] = df["descriptor"].str.replace(r"[()]", "", regex=True)


df

Unnamed: 0,href,raw_text,case_number,name,descriptor
0,2657.html,Case 2657 Moy Chin See his wife,2657,Moy Chin See his,wife
1,2917.html,Case 2917 Lee Kin Sai alias Lee Wah Chung,2917,Lee Kin Sai,alias Lee Wah Chung
2,2950.html,Case 2950 Tie Yimm a woman,2950,Tie Yimm a woman,
3,3068.html,"Case 3068 Lin Kum daughter, Wye See mother",3068,Lin Kum,"daughter, Wye See mother"
4,3100.html,Case 3100 Tarm How Yen wife,3100,Tarm How Yen,wife
...,...,...,...,...,...
133,10116.html,Case 10116 Chin Chon Loy,10116,Chin Chon Loy,
134,10144.html,Case 10144 Cha Sing Kwai,10144,Cha Sing Kwai,
135,10145.html,Case 10145 Cha Tai Kim,10145,Cha Tai Kim,
136,youngchoyling.html,Case 10175 Young Choy Ling,10175,Young Choy Ling,


In [23]:
df.to_csv("female_detainee_cases.csv", index=False, encoding="utf-8")