<a href="https://colab.research.google.com/github/dgottschalk03/dsci_550_a1/blob/main/notebooks/events_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1.06- Event Type


*  search for keywords: murder, killed, homicide, death, died, slain, massacre, executed, crime, strangled, stabbed, shot, suicide, massacre, supernatural, paranormal, haunting, cursed, evil, possession, ghostly, poltergeist, eerie, demonic, mysterious, occult, unexplained, witchcraft, accident, tragedy, drowning, fire, explosion, disaster, fallen, collapsed, burned, suffocation, lost, catastrophe, avalanche, shipwreck, landslide



In [1]:
import pandas as pd
import re
import time

In [2]:


# Load data
df = pd.read_csv("haunted_places.tab", sep="\t")

# Define groups of keywords with regex patterns
event_groups = {
    "Violence": [
        r"murder(?:ed|s)?",         # matches: murder, murdered, murders
        r"kill(?:ed|s|ing)?",        # matches: kill, killed, kills, killing
        r"homicide",                # matches: homicide
        r"death(?:s)?",             # matches: death, deaths
        r"died",                    # matches: died
        r"slain",                   # matches: slain
        r"massacre(?:d|s)?",         # matches: massacre, massacred, massacres
        r"execute(?:d|s|ing)?",      # matches: execute, executed, executes, executing
        r"crime(?:s)?",             # matches: crime, crimes
        r"strangl(?:e|ed|es|ing)",   # matches: strangle, strangled, strangles, strangling
        r"stab(?:bed|s|bing)?",      # matches: stab, stabbed, stabs, stabbing
        r"shot",                    # matches: shot
        r"suicide(?:s)?"            # matches: suicide, suicides
    ],
    "Supernatural": [
        r"supernatural",            # matches: supernatural
        r"paranormal",              # matches: paranormal
        r"haunt(?:ed|ing|s)?",       # matches: haunt, haunted, haunting, haunts
        r"curse(?:d|s|ing)?",        # matches: curse, cursed, curses, cursing
        r"evil",                    # matches: evil
        r"possess(?:ion|ed|es|ing)?",# matches: possession, possessed, possesses, possessing
        r"ghost(?:ly|s)?",          # matches: ghost, ghostly, ghosts
        r"poltergeist",             # matches: poltergeist
        r"eerie",                   # matches: eerie
        r"demon(?:ic|strated|s)?",   # matches: demon, demonic, demons
        r"mysterious",              # matches: mysterious
        r"occult",                  # matches: occult
        r"unexplained",             # matches: unexplained
        r"witchcraft",              # matches: witchcraft
        r"witch(?:es)?"            # matches: witch, witches
    ],
    "Accident/Disaster": [
        r"accident(?:al|s)?",       # matches: accident, accidental, accidents
        r"traged(?:y|ies)",         # matches: tragedy, tragedies
        r"drown(?:ing|ed|s)?",       # matches: drown, drowning, drowns, drowned
        r"fire(?:d|s|ing)?",         # matches: fire, fired, fires, firing
        r"explos(?:ion|ions|ive|ed)",# matches: explosion, explosions, explosive, exploded
        r"disaster(?:s)?",          # matches: disaster, disasters
        r"fall(?:en|s|ing)?",        # matches: fall, fallen, falls, falling
        r"collaps(?:ed|es|ing)?",    # matches: collapse, collapsed, collapses, collapsing
        r"burn(?:ed|s|ing)?",        # matches: burn, burned, burns, burning
        r"suffocat(?:ion|e|es|ing)?",# matches: suffocation, suffocate, suffocates, suffocating, suffocated
        r"lost",                    # matches: lost
        r"catastroph(?:e|es)",       # matches: catastrophe, catastrophes
        r"avalanche(?:s)?",         # matches: avalanche, avalanches
        r"shipwreck(?:ed|s)?",       # matches: shipwreck, shipwrecked, shipwrecks
        r"landslide(?:s)?"          # matches: landslide, landslides
    ]
}

def classify_event(text):
    if not isinstance(text, str):
        return "Unknown"

    text = text.lower()
    triggered_groups = []

    # Loop through each group
    for group, patterns in event_groups.items():
        for pattern in patterns:
            # Check if the pattern matches anywhere
            if re.search(rf"\b{pattern}\b", text):
                triggered_groups.append(group)
                # Move on to the next group when match found
                break

    if not triggered_groups:
        return "Unknown"

    # Return all matched groups
    return ", ".join(triggered_groups)

# Start timing
start = time.time()

# Apply classification to the "description" column.
df["Event Type"] = df["description"].apply(classify_event)

# Stop timing
end = time.time()

# Count occurrences
counts = df["Event Type"].value_counts()
unknown_counts = counts.get("Unknown", 0)

extract_printout = [(category, count) for category, count in counts.items()]
print("-" * 100, "Extraction Completed", "-" * 100)
print(f"Extraction Took: {end - start:.6f} seconds\n")
print("\n".join([f"{category}: {count}" for category, count in extract_printout]))
print("-" * 100)


---------------------------------------------------------------------------------------------------- Extraction Completed ----------------------------------------------------------------------------------------------------
Extraction Took: 2.226780 seconds

Unknown: 3878
Supernatural: 2813
Violence, Supernatural: 1377
Violence: 1363
Accident/Disaster: 467
Violence, Accident/Disaster: 379
Violence, Supernatural, Accident/Disaster: 375
Supernatural, Accident/Disaster: 340
----------------------------------------------------------------------------------------------------
