In [1]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [2]:
csv_file = 'preprocessed.csv' 

In [3]:
df = pd.read_csv(csv_file)

In [4]:
df

Unnamed: 0.1,Unnamed: 0,id,text,time stamp
0,0,@EN_NERV,"Major Tsunami Warning , pm The Tsunami Warning...",2024-01-01
1,1,@mondoterremoti,"Utilizzando i dati preliminari dellUSGS, e riu...",2024-01-06
2,2,@MacLesggy,Comment une construction antisismique se compo...,2024-01-03
3,3,@Masabxpct,magnitude. strong earthquake earthquake Noto I...,2024-01-01
4,4,@Ryo_Saeba_3,Une nouvelle video prise au moment du seisme d...,2024-02-03
...,...,...,...,...
1006,1078,@SallySueIam,Replying to,2024-10-06
1007,1079,@NathanNoonan,Replying to and,2024-10-07
1008,1080,@mntomorii,Replying to,2024-10-02
1009,1081,@mrsverypicky,Japans NotoEarthquake thousands of survivors s...,2024-02-14


In [5]:
def extract_locations(text):
    nlp = spacy.load("0fner_model_custom_4_1")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [9]:
# Replace NaN values with empty strings or any other placeholder text you prefer
df['text'] = df['text'].fillna("Replying to")

all_locations1 = []
flags = []

for text in tqdm(df['text'], desc="Extracting Locations", total=len(df)):
    locations = extract_locations(text)
    all_locations1.extend(locations)
    
    # Set the flag to 1 if locations were found, otherwise 0
    flags.append(1 if locations else 0)


Extracting Locations: 100%|██████████| 1011/1011 [08:55<00:00,  1.89it/s]


In [10]:

# Add the flag column to the DataFrame
df['flag1'] = flags
# Print or use the list of all locations
print(all_locations1)

['Tsunami', 'Tsunami', 'Tsunami', 'Those', 'Utilizzando', 'Noto', 'Giappone', 'Noto', 'Noto', 'Ishikawa', 'Japan', 'Tsunami', 'Noto', 'Noto', 'Ishikawa Prefecture', 'Noto', 'Ahora', 'Sacudio', 'Noto', 'Ishikawa', 'Wajima', 'Centro', 'Alerta', 'Tsunamis', 'Noto', 'Ishikawa', 'Kanto', 'Tokio', 'Kanazawa', 'Noto', 'Ishikawa', 'Noto', 'Japao', 'Noto', 'Ishikawa', 'Tsunami', 'Noto', 'Ishikawa', 'Japao', 'Terremoto', 'Nuevos', 'Noto', 'desplazado', 'Noto', 'Ishikawa', 'Noto', 'Giappone', 'Noto', 'Japan', 'Seki', 'Joetsu', 'Niigata Prefecture', 'Tsunami', 'Tsunami', 'Tsunami', 'Those', 'Noto', 'Japan', 'Anamizu', 'Japan', 'Noto', 'Japan', 'Noto', 'Tokio', 'Noto', 'Honshu', 'Kanazawa', 'Noto', 'Noto', 'Seki', 'Joetsu', 'Niigata', 'Tsunami', 'Tsunami', 'Tsunami', 'Tsunami', 'Noto', 'Ishikawa Prefecture', 'japan', 'Tsunami', 'Tsunami', 'Those', 'Noto', 'Ishikawa Prefecture', 'Tsunami', 'Kaiso', 'Noto', 'Parliamo', 'Giappone', 'Noto', 'Ishikawa Prefecture', 'Tsunami', 'Noto', 'Japan', 'Tsunami', 

In [11]:
count_ones = df['flag1'].sum()

In [12]:
count_ones

924

In [13]:
locations_df = pd.DataFrame(all_locations1, columns=['Location'])

In [14]:
locations_df

Unnamed: 0,Location
0,Tsunami
1,Tsunami
2,Tsunami
3,Those
4,Utilizzando
...,...
2586,Noto
2587,Japan
2588,Miyagi
2589,Noto


In [15]:
locations_df.to_csv('extracted_locations_japan.csv', index=False)

In [16]:
from geopy.geocoders import Nominatim

In [17]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [18]:
get_coordinates('raichur')

(16.054719400000003, 76.90036472411589)

In [19]:
locations_data = []

In [20]:
unique_locations = set(all_locations1)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations1.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|██████████| 391/391 [04:48<00:00,  1.35it/s]


In [21]:
locations_coordinate = pd.DataFrame(locations_data)

In [22]:
locations_coordinate

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,Those,10,52.507007,10.125575
1,Tomita,3,36.316161,139.528807
2,Terremoto,18,42.882648,-8.533842
3,Chuetsu,2,35.645037,139.801408
4,Japan Pro-,3,35.646771,139.676588
...,...,...,...,...
326,Zama,1,35.499205,139.420453
327,Instituto,1,38.376782,-0.419217
328,Kanagawa Pref,2,35.447354,139.641635
329,Loreto,1,-5.000000,-75.000000


In [23]:
locations_coordinate.to_csv('coordinate_japan.csv', index=False)

In [24]:
locations_coordinate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331 entries, 0 to 330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Location   331 non-null    object 
 1   Frequency  331 non-null    int64  
 2   Latitude   331 non-null    float64
 3   Longitude  331 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 10.5+ KB


In [46]:
import folium
from branca.colormap import LinearColormap
import pandas as pd
import numpy as np

# Sample data loading (replace this with your actual DataFrame)
# df = pd.DataFrame({...})

# Create a base map centered on an average location
m = folium.Map(location=[20, 0], zoom_start=2)

# Define a colormap for frequency values
colormap = LinearColormap(colors=['orange', 'red', 'brown', 'black','white','blue'], vmin=locations_coordinate['Frequency'].min(), vmax=locations_coordinate['Frequency'].max())

# Add markers to the map
for _, row in locations_coordinate.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=np.log(row['Frequency']+10)*5,  # Adjust size scaling factor as needed
        color=colormap(row['Frequency']),
        fill=True,
        fill_color=colormap(row['Frequency']),
        fill_opacity=0.6,
        popup=folium.Popup(f"{row['Location']}: {row['Frequency']}", parse_html=True)
    ).add_to(m)

# Add color legend
colormap.caption = 'Frequency'
colormap.add_to(m)

# Display map
m.save("world_map.html")
