In [1]:
# Keyword vocabulary for earthquake-related text, grouped by theme.
# Each term is listed exactly once, under the first theme it belongs to
# ("aftermath", "wreckage" and "seismic" used to be repeated further down).
# Casing is kept as written (e.g. "USGS", "Richter scale").
disaster_keywords = [
    # Core earthquake terms
    "earthquake", "tremor", "aftershock", "seismic", "fault", "epicenter",
    "magnitude", "Richter scale", "shaking", "ground", "quake", "foreshock",
    "tectonic", "plate", "shockwave", "aftermath", "felt", "feel",

    # Descriptive terms
    "strong", "massive", "devastating", "violent", "powerful", "intense",
    "mild", "deep", "surface", "shallow",

    # Damages and effects
    "damage", "collapse", "ruins", "wreckage", "destroyed", "cracks",
    "crumbling", "impact", "destruction", "disaster", "displaced",
    "homeless", "injury", "injuries", "fatalities", "debris", "rubble", "casualties",
    "trapped", "death", "die", "died",

    # Responses and warnings
    "alert", "warning", "evacuation", "rescue", "search",
    "emergency", "relief", "assistance", "volunteers", "preparedness",
    "shelter", "relief efforts", "response team",

    # Measurement and science
    "Richter", "seismograph", "seismology", "intensity", "measurement", "scale",
    "USGS", "depth", "geological", "seismometer",

    # Related natural disasters and events
    "tsunami", "landslide", "fire", "eruption", "volcano", "flood",

    # Social and emotional terms
    "pray", "thoughts", "fear", "panic", "trauma", "loss", "tragedy",
    "devastation", "solidarity", "support",
]


In [14]:
import pandas as pd
# Load the preprocessed dataset from the working directory.
# The annotation cell further down iterates over its 'text' column.
merged_df = pd.read_csv("preprocessed.csv")

In [15]:
import spacy
from spacy.matcher import PhraseMatcher
from tqdm import tqdm
import json
import re


In [16]:
import csv  # imported here because only this cell needs it (read_csv quoting flag)

# Load the spaCy pipeline; this cell uses only its tokenizer and vocabulary.
nlp = spacy.load("en_core_web_sm")

# attr="LOWER" makes the matcher compare lower-cased token text,
# so matching is case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Build the place-name gazetteer from per-country dump files named "<code>.txt".
countries = ['JP']

# Collect names in a set so repeats (across files, or the manual additions)
# collapse automatically. "Japan" is added by hand.
gpe_phrases = {"Japan"}

for country in countries:
    file_path = f"{country}.txt"
    # Only column index 2 is read. In the GeoNames dump layout that is the
    # ASCII spelling of the place name ("asciiname") -- confirm against the
    # GeoNames readme if a different dump format is used.
    data = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        usecols=[2],
        dtype=str,               # keep digit-only names as strings
        quoting=csv.QUOTE_NONE,  # the dump is plain tab-separated text; a '"' is ordinary data
        keep_default_na=False,   # do not turn names such as "NA" or "None" into NaN
        na_values=[""],          # only a truly empty field counts as missing
    )
    place_names = data[2].dropna().unique().tolist()
    gpe_phrases.update(place_names)

# Keep only non-blank strings; sort so the phrase list is reproducible between runs.
gpe_phrases = sorted(
    phrase for phrase in gpe_phrases
    if isinstance(phrase, str) and phrase.strip()
)

# Tokenise all phrases in one pass (no tagger/parser needed for LOWER matching).
# Phrases are lower-cased first, as before.
gpe_patterns = list(nlp.tokenizer.pipe(phrase.lower() for phrase in gpe_phrases))
matcher.add("GPE", gpe_patterns)

# DISASTER patterns are currently disabled; uncomment to also tag disaster keywords.
# disaster_patterns = [nlp.make_doc(keyword.lower()) for keyword in disaster_keywords]
# matcher.add("DISASTER", disaster_patterns)



In [17]:
import json
from tqdm import tqdm


def _drop_overlapping(entities):
    """Return non-overlapping [start, end, label] entries from `entities`.

    Candidates are ordered by start offset and then by length, and accepted
    greedily, so among candidates that begin at the same position the shortest
    one wins and anything overlapping an accepted span is discarded.
    """
    kept = []
    last_end = -1
    for start, end, label in sorted(entities, key=lambda ent: (ent[0], ent[1] - ent[0])):
        if start >= last_end:  # does not overlap the previously accepted span
            kept.append([start, end, label])
            last_end = end
    return kept


# One [text, {"entities": [[start, end, label], ...]}] entry per input text.
annotated_data = []

# Rows without text cannot be annotated; drop them instead of failing mid-run.
texts = merged_df['text'].dropna().astype(str)

for text in tqdm(texts, total=len(texts)):
    # Match on the ORIGINAL text: the matcher is already case-insensitive
    # (attr="LOWER"), and str.lower() can change a string's length (e.g. "İ"),
    # which would shift character offsets away from the text that is stored.
    # Only tokenisation is needed for LOWER matching, so the rest of the
    # pipeline is skipped via make_doc.
    doc = nlp.make_doc(text)

    entities = []
    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        entities.append([span.start_char, span.end_char, nlp.vocab.strings[match_id]])

    annotated_data.append([text, {"entities": _drop_overlapping(entities)}])

# Save annotated data in JSONL format (one JSON array per line).
with open("FINAL_TEST1.jsonl", "w", encoding="utf-8") as file:
    for entry in annotated_data:
        file.write(json.dumps(entry) + "\n")


100%|██████████| 427/427 [00:08<00:00, 53.04it/s]


In [13]:
import json

# Input and output are the same path, so the file is rewritten IN PLACE.
# NOTE(review): the annotation cell above writes "FINAL_TEST1.jsonl", while this
# cell reads "FINAL_TEST.jsonl" -- confirm which file is meant to be cleaned.
input_file = "FINAL_TEST.jsonl"
output_file = "FINAL_TEST.jsonl"

# Entries are fully loaded into memory before the file is reopened for writing,
# which is what makes the in-place rewrite safe.
cleaned_data = []

with open(input_file, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)

        # Each line is a JSON array: [text, {"entities": [[start, end, label], ...]}]
        data = json.loads(line)

        # Keep only entities whose start offset is non-negative.
        data[1]["entities"] = [entity for entity in data[1]["entities"] if entity[0] >= 0]

        cleaned_data.append(data)

# Write the cleaned entries back, one JSON array per line.
with open(output_file, "w", encoding="utf-8") as file:
    for entry in cleaned_data:
        json.dump(entry, file)
        file.write("\n")

print("Cleanup complete. Negative start index entities removed.")


Cleanup complete. Negative start index entities removed.
