In [212]:
import pandas as pd
import requests as requests
import json
import pycountry
import re
import unicodedata


In [84]:
df = pd.read_excel("GSAF5.xls")

Remove unnecessary columns

In [85]:
df = df.drop(["pdf", "href formula", "href", "Case Number", "Case Number.1", "original order", "Unnamed: 21", "Unnamed: 22", "Source", "State", "Time"], axis=1)

Filter valid Types

In [86]:
df = df[df['Type'].isin(["Unprovoked", "Provoked", "Watercraft" , "Air", "Sea Disaster"])] 


Remove rows without Location or Country or Year given

In [87]:
df = df.dropna(subset=["Location", "Country", "Year"])

Trim data to only be between 2000 and 2026

In [88]:
df = df[df["Year"] >= 2000]
df["Year"] = pd.to_numeric(df["Year"], downcast='integer', errors='coerce')

In [189]:
display(df)
df.count()

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude
0,10th January,2026,Unprovoked,Australia,Avalon Beach,Surfing,Paul Stanton,M,?,puncture mark to left thumb,N,Unknown,AU,-33.6365039,151.3290299
1,8th January,2026,Unprovoked,US Virgin Islands,Dorsch Beach,Snorkeling,Arlene Lillis,F,56,Left arm torn off in the attack below the elbow,Y,Unknown,VI,,
2,3rd January,2026,Unprovoked,New Caledonia,Between Bourail and Moindou,Scuba Diving,Unknown,M,?,Injuries to upper limbs,N,Unknown,NC,,
3,21st December,2025,Unprovoked,USA,Lovers Point Pacific Grove,Swimming,Erica Fox,F,55,Taken by shark body recovered with multiple in...,Y,Great White Shark,US,36.6233941,-121.9066666
4,12th December,2025,Unprovoked,USA,Salmon Creek,Surfing,Unknown,M,?,Hand Injury,N,Suspected Great White Shark,US,45.7129025,-122.6643482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2808,03-Feb-2000,2000,Unprovoked,NEW ZEALAND,Oreti Beach (reported as the 4th person bitten...,Surfing,Michael Petas,M,12,"No injury, wetsuit punctured",N,,NZ,,
2809,01-Feb-2000,2000,Unprovoked,AUSTRALIA,"Point Sinclair, Cactus Beach near Penong",Surfing,Anthony Hayes,M,26,Hand bitten,N,3 m [10'] shark,AU,,
2810,Reported 28-Jan-2000,2000,Watercraft,REUNION,Saint Pierre,Canoe with 3 men onboard sank,Boulabhaï Ishmael,M,,FATAL,Y,,RE,,
2811,05-Jan-2000,2000,Unprovoked,THAILAND,Phang nga Island,Diving,Stephan Kahl,M,35,FATAL,Y,,TH,9.3340302,98.3232413


Date           2495
Year           2495
Type           2495
Country        2495
Location       2495
Activity       2413
Name           2467
Sex            2400
Age            1942
Injury         2488
Fatal Y/N      2482
Species        1586
CountryCode    2495
latitude       1677
longitude      1677
dtype: int64

Fetch country codes for API that gets coordinates from locations

In [147]:
PLACE_TO_ALPHA2_N = {
    "US Virgin Islands": "VI",
    "Canary Islands": "ES",
    "Maldive Islands": "MV",
    "ST KITTS / NEVIS": "KN",
    "ST MARTIN": "MF",
    "ST. MARTIN": "MF",
    "ST. MAARTIN": "SX",
    "REUNION ISLAND": "RE",
    "ST HELENA, British overseas territory": "SH",
    "PALESTINIAN TERRITORIES": "PS",
    "TURKS & CAICOS": "TC",
    "AZORES": "PT",
    "UNITED ARAB EMIRATES (UAE)": "AE",
    "GRAND CAYMAN": "KY",
    "CARIBBEAN SEA": "AI",  # took iso of anguilla because it was near it
}


In [158]:
def get_iso_code(name):
    try:
        iso = pycountry.countries.search_fuzzy(name)
        iso = iso[0].alpha_2
    except:
        if name in PLACE_TO_ALPHA2_N:
            iso = PLACE_TO_ALPHA2_N[name]
            if iso is None:
                print(name)
    return iso
            

In [159]:
df["CountryCode"] = df["Country"].apply(get_iso_code)

Fetch coordinates (latitude and logitude)

In [187]:
def get_coordinates(location, countryCode):
    headers = {"User-Agent": "shark-attack-map/1.0 (contact: e12217684@student.tuwien.ac.at)"}
    params = {"q": f"{location}", "format": "json", "countrycodes": f"{countryCode}", "limit": 1}
    response = requests.get("https://nominatim.openstreetmap.org/search",headers=headers, params=params)
    if response.status_code == 200 and len(response.json()) != 0:
        print(response.json())
        return (response.json()[0]["lat"], response.json()[0]["lon"])
    else:
        return (None, None)

In [188]:
df[["latitude", "longitude"]] = df.apply(
    lambda row: get_coordinates(row["Location"], row["CountryCode"]), axis=1, result_type="expand"
)

[{'place_id': 19543365, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 2918144, 'lat': '-33.6365039', 'lon': '151.3290299', 'class': 'boundary', 'type': 'administrative', 'place_rank': 18, 'importance': 0.32173239253715025, 'addresstype': 'suburb', 'name': 'Avalon Beach', 'display_name': 'Avalon Beach, Northern Beaches, Northern Beaches Council, New South Wales, 2107, Australia', 'boundingbox': ['-33.6432341', '-33.6142715', '151.3115063', '151.3438980']}]
[{'place_id': 298827350, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7037922, 'lat': '36.6233941', 'lon': '-121.9066666', 'class': 'boundary', 'type': 'protected_area', 'place_rank': 25, 'importance': 0.06671889295796139, 'addresstype': 'protected_area', 'name': 'Lovers Point - Julia Platt State Marine Reserve', 'display_name': 'Lovers Point - Julia Platt State Marine Reserve, Pacific Grove 

1. Clean up of locations: removing special symbols, between, off, etc

In [205]:
# Regex-Bausteine
RE_PARENS = re.compile(r"\([^)]*\)")
RE_BETWEEN = re.compile(r"\bbetween\s+(?P<a>.+?)\s+(?:and|&)\s+(?P<b>.+)\b", re.IGNORECASE)
RE_CHANNEL_BETWEEN = re.compile(r"\bchannel\s+between\s+(?P<a>.+?)\s+(?:and|&)\s+(?P<b>.+)\b", re.IGNORECASE)
RE_OFF = re.compile(r"\b(?:\d+\s*(?:nm|km|miles?)\s+)?off\s+(?P<place>.+)\b", re.IGNORECASE)

# Distanzen/“about 500 km (310 miles)” usw.
RE_DISTANCE = re.compile(r"\babout\s+\d+.*?$|\b\d+\s*(?:km|m|nm|miles?)\b", re.IGNORECASE)

# Richtungsausdrücke (macht’s simpler)
RE_DIRECTION_OF = re.compile(r"\b(north|south|east|west)\s+of\s+", re.IGNORECASE)

def simplify_location(loc: str) -> str:
    if not isinstance(loc, str) or not loc.strip():
        return ""

    s = loc.strip()

    # 1) Klammern entfernen
    s = RE_PARENS.sub("", s)

    # 2) komische Quotes normalisieren/entfernen
    s = s.replace("“", '"').replace("”", '"').replace("’", "'").replace("`", "'")

    # 3) "between A and B" => A
    m = RE_CHANNEL_BETWEEN.search(s)
    if m:
        s = m.group("a")
    else:
        m = RE_BETWEEN.search(s)
        if m:
            s = m.group("a")

    # 4) "off X" => X (auch "70 miles off Pensacola")
    m = RE_OFF.search(s)
    if m:
        s = m.group("place")

    # 5) "north of X" => X (wir wollen was Geocodable)
    s = RE_DIRECTION_OF.sub("", s)

    # 6) "near Y" => Y (nimmt den Teil nach dem letzten 'near')
    if re.search(r"\bnear\b", s, flags=re.IGNORECASE):
        s = re.split(r"\bnear\b", s, flags=re.IGNORECASE)[-1]

    # 7) Distanzen/“about …” raus
    s = RE_DISTANCE.sub("", s)

    # 8) Fragezeichen/seltsame Satzzeichen raus
    s = s.replace("?", "").replace("!", "").replace(";", "").replace("/", "")
    s = s.replace('"', "").replace("'", "")  # Apostrophe/Quotes komplett raus (einfachster Weg)

    # 9) Mehrfachspaces/Kommas normalisieren
    s = re.sub(r"\s+", " ", s).strip()
    s = s.strip(" ,.-")

    # 10) optional: nur erster Teil vor Komma (sehr “einfacher” fallback)
    #    (wenn du lieber mehr Kontext behalten willst, kommentier die nächste Zeile aus)
    if "," in s:
        s = s.split(",")[0].strip()

    return s


In [206]:
coord_notfound = df.loc[df["latitude"].isna()].copy()
coord_notfound["cleaned_location"] = coord_notfound["Location"].apply(simplify_location)


In [209]:
display(coord_notfound)
coord_notfound.count()

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
1,8th January,2026,Unprovoked,US Virgin Islands,Dorsch Beach,Snorkeling,Arlene Lillis,F,56,Left arm torn off in the attack below the elbow,Y,Unknown,VI,,,Dorsch Beach
2,3rd January,2026,Unprovoked,New Caledonia,Between Bourail and Moindou,Scuba Diving,Unknown,M,?,Injuries to upper limbs,N,Unknown,NC,,,Bourail
5,9th December,2025,Provoked,USA,"Ka'alu""alu Beach",Freeing trapped shark,Josiah Kaimani Ventura,M,24,Bite wounds to thigh,N,Black Tip Reef Shark,US,,,Kaalualu Beach
8,10th November,2025,Unprovoked,Australia,Prevelly Beach Magaret River,Foil Boarding,Andy McDonald,M,61,No Injury to self,N,Great White Shark,AU,,,Prevelly Beach Magaret River
9,9th November,2025,Unprovoked,French Polynesia,Hakahau Bay,Swimming,Not stated (Dentist),M,40,Deep Gash to bicep,N,3m shark,PF,,,Hakahau Bay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2804,Mar-2000,2000,Unprovoked,USA,Midnight Lump (38 miles offshore),Spearfishing,Kurt Bickel,M,39,"No injury to diver, speargun damaged",N,"Shortfin mako shark, 3 m to 3.4 m [10' to 11']",US,,,Midnight Lump
2807,14-Feb-2000,2000,Provoked,ENGLAND,The Fountain Pub in Tenbury Wells,Feeding prawns to captive sharks,"Paul Smith, a chef",M,,Fingers bitten PROVOKED INCIDENT,N,"Miami, a 60 cm blacktip shark and two 60 cm b...",GB,,,The Fountain Pub in Tenbury Wells
2808,03-Feb-2000,2000,Unprovoked,NEW ZEALAND,Oreti Beach (reported as the 4th person bitten...,Surfing,Michael Petas,M,12,"No injury, wetsuit punctured",N,,NZ,-46.4213005,168.2195689,Oreti Beach
2809,01-Feb-2000,2000,Unprovoked,AUSTRALIA,"Point Sinclair, Cactus Beach near Penong",Surfing,Anthony Hayes,M,26,Hand bitten,N,3 m [10'] shark,AU,-31.9283087,133.0110750,Penong


Date                818
Year                818
Type                818
Country             818
Location            818
Activity            796
Name                806
Sex                 783
Age                 620
Injury              814
Fatal Y/N           813
Species             552
CountryCode         818
latitude            409
longitude           409
cleaned_location    818
dtype: int64

In [208]:
coord_notfound[["latitude", "longitude"]] = coord_notfound.apply(
    lambda row: get_coordinates(row["cleaned_location"], row["CountryCode"]), axis=1, result_type="expand"
)

[{'place_id': 278535176, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 3776653, 'lat': '26.5356810', 'lon': '-78.6953623', 'class': 'boundary', 'type': 'administrative', 'place_rank': 16, 'importance': 0.4934556310655747, 'addresstype': 'city', 'name': 'City of Freeport', 'display_name': 'City of Freeport, The Bahamas', 'boundingbox': ['26.2890000', '26.7231863', '-78.9266868', '-78.3055203']}]
[{'place_id': 281871453, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 129535681, 'lat': '26.9411913', 'lon': '-80.0724142', 'class': 'leisure', 'type': 'park', 'place_rank': 24, 'importance': 0.08003500190737771, 'addresstype': 'park', 'name': 'Jupiter Beach Park', 'display_name': 'Jupiter Beach Park, Jupiter, Palm Beach County, Florida, United States', 'boundingbox': ['26.9385898', '26.9440290', '-80.0743346', '-80.0700683']}]
[{'place_id': 198528224, 'lice

In [210]:
coord_notfound_2 = coord_notfound.loc[coord_notfound["latitude"].isna()].copy()
coord_notfound_2.to_csv("/Users/emilywu/Desktop/DataLiteracy/MiniProjekt/noCoordFound_2.csv")


In [211]:
display(coord_notfound_2)

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
1,8th January,2026,Unprovoked,US Virgin Islands,Dorsch Beach,Snorkeling,Arlene Lillis,F,56,Left arm torn off in the attack below the elbow,Y,Unknown,VI,,,Dorsch Beach
2,3rd January,2026,Unprovoked,New Caledonia,Between Bourail and Moindou,Scuba Diving,Unknown,M,?,Injuries to upper limbs,N,Unknown,NC,,,Bourail
5,9th December,2025,Provoked,USA,"Ka'alu""alu Beach",Freeing trapped shark,Josiah Kaimani Ventura,M,24,Bite wounds to thigh,N,Black Tip Reef Shark,US,,,Kaalualu Beach
8,10th November,2025,Unprovoked,Australia,Prevelly Beach Magaret River,Foil Boarding,Andy McDonald,M,61,No Injury to self,N,Great White Shark,AU,,,Prevelly Beach Magaret River
9,9th November,2025,Unprovoked,French Polynesia,Hakahau Bay,Swimming,Not stated (Dentist),M,40,Deep Gash to bicep,N,3m shark,PF,,,Hakahau Bay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2798,15-Mar-2000,2000,Unprovoked,NEW CALEDONIA,Poum,Spearfishing,Gilbert Bui Van Minh,M,35,FATAL,Y,Tiger shark?,NC,,,Poum
2799,14-Mar-2000,2000,Unprovoked,AUSTRALIA,"McMasters Beach, Central Coast",Surfing,Craig Ruth,M,,No Injury,N,"Tiger shark, 4 m [13'] ?",AU,,,McMasters Beach
2804,Mar-2000,2000,Unprovoked,USA,Midnight Lump (38 miles offshore),Spearfishing,Kurt Bickel,M,39,"No injury to diver, speargun damaged",N,"Shortfin mako shark, 3 m to 3.4 m [10' to 11']",US,,,Midnight Lump
2807,14-Feb-2000,2000,Provoked,ENGLAND,The Fountain Pub in Tenbury Wells,Feeding prawns to captive sharks,"Paul Smith, a chef",M,,Fingers bitten PROVOKED INCIDENT,N,"Miami, a 60 cm blacktip shark and two 60 cm b...",GB,,,The Fountain Pub in Tenbury Wells


2. Clean up of locations: removing typos, replacing beach with bay, etc

In [224]:
def strip_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))

def generate_queries(loc: str, country_name: str | None = None):
    if not isinstance(loc, str) or not loc.strip():
        return []
    s = loc.strip()

    # Normalisieren
    s = strip_accents(s)
    s = s.replace("-", " ")
    s = s.replace("’", "'").replace("`", "'").replace('"', "")
    s = re.sub(r"\s+", " ", s).strip()

    variants = []

    # 1) Original
    variants.append(s)

    # 2) Apostrophe komplett weg (einfachster “works often”)
    variants.append(s.replace("'", ""))

    # 3) Häufige Tippfehler / Varianten
    variants.append(re.sub(r"\bbaay\b", "bay", s, flags=re.IGNORECASE))  # Crowdy Baay -> Crowdy Bay
    variants.append(re.sub(r"\bbeach\b", "bay", s, flags=re.IGNORECASE)) # beach -> bay
    variants.append(re.sub(r"\bbeach\b", "cove", s, flags=re.IGNORECASE))
    variants.append(re.sub(r"\bbeach\b", "", s, flags=re.IGNORECASE).strip())  # beach einfach entfernen

    # 4) Wenn mehrere Wörter: nur “Kern” nehmen (z.B. "Prevelly Beach Magaret River" -> "Prevelly Beach")
    tokens = s.split()
    if len(tokens) >= 3:
        variants.append(" ".join(tokens[:2]))
        variants.append(" ".join(tokens[:3]))

    # 5) Stop-Wörter abschneiden (sehr pragmatisch)
    for stop in [" river", " island", " atoll", " point", " reef", " jetty", " harbour", " harbor"]:
        if stop in s.lower():
            cut = s.lower().split(stop)[0].strip()
            if cut:
                variants.append(cut)

    # Dedupe + leere raus
    out = []
    seen = set()
    for v in variants:
        v = (v or "").strip(" ,.")
        if len(v) < 2:
            continue
        key = v.lower()
        if key not in seen:
            out.append(v)
            seen.add(key)

    if country_name and isinstance(country_name, str) and country_name.strip():
        out = [f"{q}, {country_name.strip()}" for q in out] + out

    extra = []

    for v in out:
        v2 = remove_directional_phrases(v)
        if v2 != v:
            extra.append(v2)

        extra.append(keep_last_place_chunk(v))
        extra.append(remove_double_letters(v))

        extra.extend(beach_synonym_variants(v))
        extra.append(last_token_only(v))

    # dedupe
    for v in extra:
        v = v.strip(" ,.")
        if len(v) >= 2 and v.lower() not in seen:
            out.append(v)
            seen.add(v.lower())    

    return out


In [225]:
def get_coordinates_best_effort(cleaned_location):
    headers = {"User-Agent": "shark-attack-map/1.0 (contact: e12217684@student.tuwien.ac.at)"}
    

    for q in generate_queries(cleaned_location):

        params = {
            "q": q,
            "format": "json",
            "limit": 1,
        }
        res = requests.get("https://nominatim.openstreetmap.org/search", headers=headers, params=params)
        if res.status_code == 200:
            data = res.json()
            if data:
                print(data)
                latlon = (float(data[0]["lat"]), float(data[0]["lon"]))
                return latlon
    return (None, None)

In [215]:
coord_notfound_2[["latitude", "longitude"]] = coord_notfound_2["cleaned_location"].apply(
    get_coordinates_best_effort
).apply(pd.Series)

In [217]:
display(coord_notfound_2)
coord_notfound_2.count()

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
1,8th January,2026,Unprovoked,US Virgin Islands,Dorsch Beach,Snorkeling,Arlene Lillis,F,56,Left arm torn off in the attack below the elbow,Y,Unknown,VI,-35.008658,138.529591,Dorsch Beach
2,3rd January,2026,Unprovoked,New Caledonia,Between Bourail and Moindou,Scuba Diving,Unknown,M,?,Injuries to upper limbs,N,Unknown,NC,-21.567724,165.497130,Bourail
5,9th December,2025,Provoked,USA,"Ka'alu""alu Beach",Freeing trapped shark,Josiah Kaimani Ventura,M,24,Bite wounds to thigh,N,Black Tip Reef Shark,US,18.966377,-155.615713,Kaalualu Beach
8,10th November,2025,Unprovoked,Australia,Prevelly Beach Magaret River,Foil Boarding,Andy McDonald,M,61,No Injury to self,N,Great White Shark,AU,-33.973059,114.987355,Prevelly Beach Magaret River
9,9th November,2025,Unprovoked,French Polynesia,Hakahau Bay,Swimming,Not stated (Dentist),M,40,Deep Gash to bicep,N,3m shark,PF,,,Hakahau Bay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2798,15-Mar-2000,2000,Unprovoked,NEW CALEDONIA,Poum,Spearfishing,Gilbert Bui Van Minh,M,35,FATAL,Y,Tiger shark?,NC,-20.231655,164.029894,Poum
2799,14-Mar-2000,2000,Unprovoked,AUSTRALIA,"McMasters Beach, Central Coast",Surfing,Craig Ruth,M,,No Injury,N,"Tiger shark, 4 m [13'] ?",AU,44.353110,-74.215714,McMasters Beach
2804,Mar-2000,2000,Unprovoked,USA,Midnight Lump (38 miles offshore),Spearfishing,Kurt Bickel,M,39,"No injury to diver, speargun damaged",N,"Shortfin mako shark, 3 m to 3.4 m [10' to 11']",US,,,Midnight Lump
2807,14-Feb-2000,2000,Provoked,ENGLAND,The Fountain Pub in Tenbury Wells,Feeding prawns to captive sharks,"Paul Smith, a chef",M,,Fingers bitten PROVOKED INCIDENT,N,"Miami, a 60 cm blacktip shark and two 60 cm b...",GB,50.932605,-0.322037,The Fountain Pub in Tenbury Wells


Date                409
Year                409
Type                409
Country             409
Location            409
Activity            399
Name                403
Sex                 395
Age                 318
Injury              406
Fatal Y/N           404
Species             282
CountryCode         409
latitude            294
longitude           294
cleaned_location    409
dtype: int64

In [218]:
coord_notfound_3 = coord_notfound_2.loc[coord_notfound_2["latitude"].isna()].copy()
coord_notfound_3.to_csv("/Users/emilywu/Desktop/DataLiteracy/MiniProjekt/noCoordFound_3.csv")
display(coord_notfound_3)


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
9,9th November,2025,Unprovoked,French Polynesia,Hakahau Bay,Swimming,Not stated (Dentist),M,40,Deep Gash to bicep,N,3m shark,PF,,,Hakahau Bay
13,14th October,2025,Unprovoked,Columbia,Catagena Province,Swimming with sharks,Male child,M,14,Severe hand injury,N,Nurse shark,CA,,,Catagena Province
29,20th July,2025,Unprovoked,Canary Islands,Los Mollinas,Surfing (Hydrofoiling),Unknown,M,?,Lacerations to left leg above the knee,N,Undetermined,ES,,,Los Mollinas
35,4th July,2025,Unprovoked,South Africa,Mfazazana Hibberdene,Fishing,Unknown male,M,37,Mltiple injuries to body,Y,Undetermined,ZA,,,Mfazazana Hibberdene
43,2025-05-26 00:00:00,2025,Unprovoked,Vanuatu,Espiitu Santo Island,Swimming,Tumas,M,14,Multiple injuries to body one hand and leg bit...,Y,Not stated,VU,,,Espiitu Santo Island
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2664,25-Jul-2001,2001,Unprovoked,USA,"Taverniier, Monroe County",,male,M,,Minor injury,N,,US,,,Taverniier
2715,06-Jan-2001,2001,Watercraft,NEW ZEALAND,Sandy Bay/Whananaki,Kayaking,Dr. Michael Hogan,M,,"No injury, kayak bitten",N,White shark,NZ,,,Sandy BayWhananaki
2716,24-Dec-2000,2000,Unprovoked,AUSTRALIA,Flinders Cay,Scuba diving,male,M,23,Hand bitten,N,,AU,,,Flinders Cay
2755,21-Aug-2000,2000,Unprovoked,USA,"Bouges Bank, Emerald Isle, Carteret County",Swimming out to porpoises,male,M,,"Severe gash to left hand above wrist, almost s...",N,,US,,,Bouges Bank


3. Cleanup of locations: further cleanup

In [219]:
def remove_directional_phrases(s: str) -> str:
    patterns = [
        r"\bnorth of\b", r"\bsouth of\b", r"\beast of\b", r"\bwest of\b",
        r"\bnear\b", r"\boff\b", r"\boff the coast of\b",
        r"\bat the mouth of\b", r"\bin the vicinity of\b",
        r"\bin the area of\b", r"\bin front of\b"
    ]
    out = s
    for p in patterns:
        out = re.sub(p, "", out, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", out).strip()


In [220]:
def keep_last_place_chunk(s: str) -> str:
    # trennt bei typischen Präpositionen und nimmt das letzte Segment
    for sep in [" near ", " off ", " at ", " in ", " on "]:
        if sep in s.lower():
            return s.lower().split(sep)[-1].strip()
    return s


In [221]:
def beach_synonym_variants(s: str) -> list[str]:
    variants = []
    if re.search(r"\bbeach\b", s, flags=re.IGNORECASE):
        variants.extend([
            re.sub(r"\bbeach\b", "bay", s, flags=re.IGNORECASE),
            re.sub(r"\bbeach\b", "cove", s, flags=re.IGNORECASE),
            re.sub(r"\bbeach\b", "point", s, flags=re.IGNORECASE),
            re.sub(r"\bbeach\b", "", s, flags=re.IGNORECASE).strip(),
        ])
    return variants


In [222]:
def last_token_only(s: str) -> str:
    toks = s.split()
    if len(toks) >= 2:
        return toks[-1]
    return s


In [223]:
def remove_double_letters(s: str) -> str:
    # z.B. baay -> bay, coove -> cove
    return re.sub(r"(.)\1+", r"\1", s)

In [226]:
coord_notfound_3[["latitude", "longitude"]] = coord_notfound_3["cleaned_location"].apply(
    get_coordinates_best_effort
).apply(pd.Series)

[{'place_id': 279995906, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 1210718, 'lat': '30.2481693', 'lon': '-85.6593633', 'class': 'boundary', 'type': 'administrative', 'place_rank': 12, 'importance': 0.5567836372157328, 'addresstype': 'county', 'name': 'Bay County', 'display_name': 'Bay County, Florida, United States', 'boundingbox': ['29.9010330', '30.5674239', '-85.9998930', '-85.3841500']}]
[{'place_id': 66634626, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 45756, 'lat': '46.1029536', 'lon': '11.1297425', 'class': 'boundary', 'type': 'administrative', 'place_rank': 12, 'importance': 0.7293133565197413, 'addresstype': 'county', 'name': 'Provincia di Trento', 'display_name': 'Provincia di Trento, Trentino-Alto Adige/Südtirol, Italia', 'boundingbox': ['45.6728669', '46.5330375', '10.4522048', '11.9628023']}]
[{'place_id': 273732410, 'licenc

In [227]:
display(coord_notfound_3)
coord_notfound_3.count()

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
9,9th November,2025,Unprovoked,French Polynesia,Hakahau Bay,Swimming,Not stated (Dentist),M,40,Deep Gash to bicep,N,3m shark,PF,30.248169,-85.659363,Hakahau Bay
13,14th October,2025,Unprovoked,Columbia,Catagena Province,Swimming with sharks,Male child,M,14,Severe hand injury,N,Nurse shark,CA,46.102954,11.129743,Catagena Province
29,20th July,2025,Unprovoked,Canary Islands,Los Mollinas,Surfing (Hydrofoiling),Unknown,M,?,Lacerations to left leg above the knee,N,Undetermined,ES,37.493177,-2.150670,Los Mollinas
35,4th July,2025,Unprovoked,South Africa,Mfazazana Hibberdene,Fishing,Unknown male,M,37,Mltiple injuries to body,Y,Undetermined,ZA,-30.571944,30.573611,Mfazazana Hibberdene
43,2025-05-26 00:00:00,2025,Unprovoked,Vanuatu,Espiitu Santo Island,Swimming,Tumas,M,14,Multiple injuries to body one hand and leg bit...,Y,Not stated,VU,64.984182,-18.105901,Espiitu Santo Island
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2664,25-Jul-2001,2001,Unprovoked,USA,"Taverniier, Monroe County",,male,M,,Minor injury,N,,US,25.022779,-80.495350,Taverniier
2715,06-Jan-2001,2001,Watercraft,NEW ZEALAND,Sandy Bay/Whananaki,Kayaking,Dr. Michael Hogan,M,,"No injury, kayak bitten",N,White shark,NZ,,,Sandy BayWhananaki
2716,24-Dec-2000,2000,Unprovoked,AUSTRALIA,Flinders Cay,Scuba diving,male,M,23,Hand bitten,N,,AU,4.820680,-52.363053,Flinders Cay
2755,21-Aug-2000,2000,Unprovoked,USA,"Bouges Bank, Emerald Isle, Carteret County",Swimming out to porpoises,male,M,,"Severe gash to left hand above wrist, almost s...",N,,US,51.513105,-0.089375,Bouges Bank


Date                115
Year                115
Type                115
Country             115
Location            115
Activity            111
Name                113
Sex                 111
Age                  84
Injury              114
Fatal Y/N           112
Species              80
CountryCode         115
latitude             94
longitude            94
cleaned_location    115
dtype: int64

In [228]:
coord_notfound_4 = coord_notfound_3.loc[coord_notfound_3["latitude"].isna()].copy()
coord_notfound_4.to_csv("/Users/emilywu/Desktop/DataLiteracy/MiniProjekt/noCoordFound_4.csv")
display(coord_notfound_4)

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species,CountryCode,latitude,longitude,cleaned_location
68,2024-12-30 00:00:00,2024,Unprovoked,Mozambique,?,Diving,Unknown Male,M,?,Diving for sea urchins attacked by shark injur...,Y,Unknown,MZ,,,
188,28-May 2023,2023,Unprovoked,NEW CALEDONIA,Twagne,Spearfishing,male,M,42,"Fatal, bite to leg, shoulder and head",Y,,NC,,,Twagne
209,09-Apr-2023,2023,Unprovoked,USA,"Kewalos, Oahu",Surfing,Mike Morita,M,58,Injuries to right foot,N,"Tiger shark, 8'-10'",US,,,Kewalos
289,01-Jun-2022,2022,Unprovoked,AUSTRALIA,Oyster Stacks near Exmouth12h15,Swimming,,F,,Minor injury,N,,AU,,,Exmouth12h15
423,02-Feb-2021,2021,Watercraft,AUSTRALIA,Gleneg,Surf skiing,Jad French,M,17,"No injury, surf ski bitten",N,Bronze whaler shark,AU,,,Gleneg
424,02-Feb-2021,2021,Watercraft,USA,Ukumehama,Kayaking,Daniel and Tristan Sullivan,M,45 and 15,"No injury, kayak bitten",N,White shark,US,,,Ukumehama
763,9-Mar-2018,2018,Unprovoked,AUSTRALIA,Winkipop,Surfing,Lachie Brown,M,18,"Minor injury, ankle grazed",N,1.8 m shark,AU,,,Winkipop
834,31-Jul-2017,2017,Unprovoked,USA,"Jacksonvlle, Duval County",Skimboarding,Colton McCarty,M,15,Lacerations to lower leg,N,,US,,,Jacksonvlle
991,23-Jun-2016,2016,Unprovoked,SOUTH AFRICA,Ryspunt,Spearfishing,Rene Nel,M,43,Injuries to left leg & right hand,N,White shark,ZA,,,Ryspunt
1013,02-May-2016,2016,Provoked,NEW ZEALAND,Cormandel,Fishing,male,M,39,Foot bitten by landed shark PROVOKED INCIDENT,N,"Mako shark, 1.5 m [5']",NZ,,,Cormandel


Putting all coordinates together

In [229]:
def update_coords(target_df, source_df):
    mask = source_df["latitude"].notna() & source_df["longitude"].notna()
    target_df.loc[source_df.index[mask], ["latitude", "longitude"]] = \
        source_df.loc[mask, ["latitude", "longitude"]]

In [230]:
update_coords(df, coord_notfound)
update_coords(df, coord_notfound_2)
update_coords(df, coord_notfound_3)
update_coords(df, coord_notfound_4)

Hardcoding the last 21 missing coordinates

In [234]:
manual_coords = {
    68:  (-15.03466, 40.73447),   # Mozambique, unknown -> Küste 

    188: (-22.2760, 166.4580),    # NEW CALEDONIA "Twagne" -> grob Nouméa / Küste
    209: (21.2850, -157.8620),    # USA "Kewalos, Oahu" -> Kewalo Basin, Honolulu

    289: (-21.9330, 114.1290),    # AU "Exmouth..." -> Exmouth, WA
    423: (-34.9800, 138.5130),    # AU "Gleneg" -> Glenelg Beach, SA
    424: (20.8160, -156.6280),    # USA "Ukumehama" -> Ukumehame (Maui Westküste, grob)

    763: (-38.3700, 144.2550),    # AU "Winkipop" -> Surf spot bei Torquay, VIC
    834: (30.3320, -81.6550),     # USA "Jacksonvlle" -> Jacksonville, Florida

    991: (-34.8300, 20.0500),     # ZA "Ryspunt" -> grob Südküste (Kap-Region)
    1013:(-36.7610, 175.5020),    # NZ "Cormandel" -> Coromandel / Coromandel Peninsula

    1077:(-23.7700, 35.3900),     # MZ "Nahaduga, Inhambane Bay" -> Inhambane Bay (grob)
    1178:(23.2490, -106.4110),    # MX "Mazlatan" -> Mazatlán (Rechtschreibung)

    1179:(-17.5500, -142.6200),   # French Polynesia "Tupapati, Hikueru Atoll" -> Hikueru Atoll (grob)

    1372:(20.6560, -156.4430),    # USA "Makenat, Maui" -> Makena, Maui (grob)
    1656:(-22.2760, 166.4580),    # NEW CALEDONIA "Kendec" -> grob Nouméa / Küste

    1762:(-13.4000, 48.2000),     # Madagascar "Ambatolaoka, Nosy Be" -> Nosy Be / Ambatoloaka
    1811:(-21.9980, 35.3150),     # Mozambique "Vilanculo" -> Vilanculos Küste

    1846:(-34.4200, 21.3400),     # ZA "Jogensfontein, Stilbaai" -> Jongensfontein Küste
    2325:(20.8800, -156.6830),    # USA "Noreiga's, Maui" -> Lahaina Gegend (grob)

    2589:(-31.3100, 29.9900),     # ZA "Mkhambati" -> Mkhambati Nature Reserve Küste
    2715:(-35.5100, 174.4800),    # NZ "Sandy Bay/Whananaki" -> Whananaki / Northland Küste
}


In [235]:
for idx, (lat, lon) in manual_coords.items():
    df.loc[idx, ["latitude", "longitude"]] = lat, lon





just for testing and trying out purposes

In [197]:
coord_notfound.to_csv("/Users/emilywu/Desktop/DataLiteracy/MiniProjekt/noCoordFound.csv")

In [237]:
df.to_csv("/Users/emilywu/Desktop/DataLiteracy/MiniProjekt/clean_shark_attacks.csv")

In [175]:
res = get_coordinates("Ka alu‘alu Beach", "AU")
print(res[0]["lat"])
print(json.dumps(res, indent=4))

-33.6365039
[
    {
        "place_id": 19744246,
        "licence": "Data \u00a9 OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright",
        "osm_type": "relation",
        "osm_id": 2918144,
        "lat": "-33.6365039",
        "lon": "151.3290299",
        "class": "boundary",
        "type": "administrative",
        "place_rank": 18,
        "importance": 0.32173239253715025,
        "addresstype": "suburb",
        "name": "Avalon Beach",
        "display_name": "Avalon Beach, Northern Beaches, Northern Beaches Council, New South Wales, 2107, Australia",
        "boundingbox": [
            "-33.6432341",
            "-33.6142715",
            "151.3115063",
            "151.3438980"
        ]
    }
]


In [185]:
def get_powerbi_metadata(headers, api_endpoint, **query_params):
		"""
		send GET request with query parameters
		"""
		response = requests.get(api_endpoint, headers=headers, params=query_params)
		print(f"Status-Code: {response.status_code}")

		if response.status_code == 200:
			return response.json()
	
		else:
			response.raise_for_status()

In [204]:
res = get_powerbi_metadata("", "https://nominatim.openstreetmap.org/search", **{"q": "Ka alu alu Bay", "format": "json", "countrycodes": "aus", "limit": 1})
print(json.dumps(res, indent=4))

Status-Code: 200
[]


In [112]:
res = get_coordinates("", "https://geocoding-api.open-meteo.com/v1/search", **{"name": 'Avalon+Beach', "count": "10", "language": "en", "format": "json", "countryCode": "AU"})  

Status-Code: 200
{
    "results": [
        {
            "id": 2177471,
            "name": "Avalon Beach",
            "latitude": -33.63588,
            "longitude": 151.32903,
            "elevation": 9.0,
            "feature_code": "PPLX",
            "country_code": "AU",
            "admin1_id": 2155400,
            "admin2_id": 12047279,
            "timezone": "Australia/Sydney",
            "country_id": 2077456,
            "country": "Australia",
            "admin1": "New South Wales",
            "admin2": "Northern Beaches"
        }
    ],
    "generationtime_ms": 0.5942583
}
