In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the Noto (Japan) dataset from CSV into the working DataFrame.
noto_csv_path = "./datasets/japan_noto.csv"
df = pd.read_csv(noto_csv_path)

In [3]:
# Show the loaded frame as the cell output (pandas elides the middle rows).
df

Unnamed: 0,text,disaster,gpe,detected_gpe,response_time,query_tokens,response_tokens
0,"Major Tsunami Warning 1 1, 4 22pm The Tsunami ...","tsunami, ground, Warning, Tsunami, Warning, Ts...",,"""1 Coastal areas, 2 Rivers, 3 Lakes, 4 tsunami.""",6.0752,86,17
1,According to mathematical and tectonophysical ...,earthquake,,v Volcanic Epoch,4.7211,92,5
2,Earlier video showing smaller wave caused by t...,"Tsunami, earthquake","Niigata Prefecture, Joetsu, Seki, Japan, Noto ...","Joetsu City, Niigata Prefecture, Noto Peninsula",6.9786,76,14
3,"Tsunami Advisory 8 8, 4 44pm A Tsunami Advisor...","tsunami, ground, Tsunami, Tsunami",,8 4,8.0080,80,3
4,UNCONFIRMED video of considerable damage in No...,"earthquake, magnitude, damage","Japan, Anamizu, Japan, Noto","Noto, Japan, Anamizu, Japan",4.1660,73,10
...,...,...,...,...,...,...,...
681,"The 7.6 magnitude quake, which struck the Noto...",magnitude,"Ishikawa prefecture, Noto Peninsula","Noto Prefecture, Ishikawa prefecture",15.7392,76,10
682,A portion of the sales of nottestellata2024 of...,"Earthquake, Earthquake","Japan, Noto Peninsula","Noto Peninsula Earthquake, Miyagi Children's E...",6.6286,84,12
683,People are returning to their damaged houses t...,"rescue, collapse",,"country, country",5.2317,55,3
684,Supporting Recovery for the Noto Peninsula Thi...,"disaster, earthquake","Noto Peninsula, Noto Peninsula",Noto Peninsula,4.7731,86,3


In [4]:
# Step 1: Split each comma-separated 'detected_gpe' string and explode the
# resulting lists so that every detected location gets its own row (the
# original row index is repeated for each location).
all_locations = df['detected_gpe'].str.split(', ').explode()

# Step 2: Remove leading/trailing spaces (in case some entries are messy)
all_locations = all_locations.str.strip()

# Step 3: Drop rows with no detected location. Missing values survive
# split/explode as NaN (a float), which would break later string operations
# such as .lower(); empty strings carry no location information either.
all_locations = all_locations.dropna()
all_locations = all_locations[all_locations != ""]


In [5]:
# Show the exploded, whitespace-stripped location Series as the cell output.
all_locations

0      "1 Coastal areas
0              2 Rivers
0               3 Lakes
0           4 tsunami."
1      v Volcanic Epoch
             ...       
683             country
683             country
684      Noto Peninsula
685               Japan
685     Noto Earthquake
Name: detected_gpe, Length: 1836, dtype: object

In [6]:
from collections import Counter
from tqdm import tqdm
import folium

# input_jsonl = "../datasets/test/5xjp.jsonl" 
# with open(input_jsonl, 'r') as f:
#     data = [json.loads(line) for line in f]

# locations = []
# for entry in data:
#     text = entry[0]
#     doc = nlp(text) 
#     for ent in doc.ents:
#         if ent.label_ == "GPE": 
#             locations.append(ent.text.strip())

# Count how often each location is mentioned, case-insensitively.
# Non-string entries (e.g. NaN for rows without a detected location) are
# skipped because they have no .lower() method and cannot be matched anyway.
locations_normalized = [loc.lower() for loc in all_locations if isinstance(loc, str)]
locations_count = Counter(locations_normalized)

# Reference table mapping a location name to its coordinates; the code below
# reads its 'location', 'latitude' and 'longitude' columns.
reference_csv = "./datasets/coordinates.csv"
reference_df = pd.read_csv(reference_csv)

# Lower-case the reference names once, outside the loop, instead of
# re-computing the whole column for every counted location.
reference_keys = reference_df['location'].str.lower()

geocoded_data = []
for location, count in tqdm(locations_count.items(), desc="Matching locations"):
    match = reference_df[reference_keys == location]
    if not match.empty:
        # When several reference rows share a name, the first one wins.
        first_match = match.iloc[0]
        geocoded_data.append({
            "location": location,
            "latitude": first_match['latitude'],
            "longitude": first_match['longitude'],
            "frequency": count
        })

# Explicit columns keep the frame's schema stable even when nothing matched.
geocoded_df = pd.DataFrame(
    geocoded_data,
    columns=["location", "latitude", "longitude", "frequency"],
)

# Counter keys are unique, so this is only a safeguard against repeated rows.
geocoded_df = geocoded_df.drop_duplicates()

Matching locations: 100%|██████████| 797/797 [00:00<00:00, 1272.53it/s]


In [7]:
# Render every geocoded location as a circle marker on a folium map and save
# the map as a standalone HTML file. Marker size and colour both grow with how
# often the location was mentioned.
if not geocoded_df.empty:
    # Centre the initial view on the mean coordinate of all matched locations.
    avg_lat = geocoded_df['latitude'].mean()
    avg_lon = geocoded_df['longitude'].mean()
    location_map = folium.Map(location=[avg_lat, avg_lon], zoom_start=5)

    min_radius = 5        # lower bound for the frequency-scaled radius term
    radius_divisor = 20   # frequency is divided by this before clamping
    radius_offset = 5     # constant added to every radius
    color_step = 10       # colour intensity gained per mention (capped at 255)

    for _, row in geocoded_df.iterrows():
        marker_size = row['frequency']
        # Colour runs from yellow (#ffff00) for rare locations to red
        # (#ff0000) for frequent ones: red stays at 255, green decreases.
        color_intensity = min(int(row['frequency'] * color_step), 255)
        marker_color = f'#{255:02x}{255 - color_intensity:02x}{0:02x}'

        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=max(min_radius, marker_size / radius_divisor) + radius_offset,
            popup=f"{row['location']} - Frequency: {row['frequency']}",
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7
        ).add_to(location_map)

    output_html = "./maps/deepseek_japan.html"
    # Saving fails with FileNotFoundError if the target directory is missing
    # (e.g. on a fresh checkout), so create it first.
    Path(output_html).parent.mkdir(parents=True, exist_ok=True)
    location_map.save(output_html)
    print(f"Map saved to {output_html}")
else:
    print("No valid locations to plot.")


Map saved to ./maps/deepseek_japan.html
