In [1]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [2]:
csv_file = 'preprocessed_0f_japan.csv' 

In [3]:
df = pd.read_csv(csv_file)

In [4]:
df

Unnamed: 0,text,date,processed_text
0,\Japan earthquake: Nearly 250 missing as hope ...,Jan-07,japan earthquake nearly missing as hope for s...
1,Replying to @aviationbrk,Jan-07,replying to
2,ارتفاع حصيلة ضحايا زلزال اليابان إلى 326 قتيلا...,Jan-06,ارتفاع حصيلة ضحايا زلزال اليابان إلى قتيلا ومف...
3,Replying to @rawsalerts,Jan-06,replying to
4,Informative and Helpful article by the founder...,Jan-06,informative and helpful article by the founder...
...,...,...,...
488,M7.4 earthquake / 7 on Japanese shindo scale /...,Jan-01,earthquake on japanese shindo scale ishikawa ...
489,"Region: Near West Coast of Honshu, Japan Mag: ...",Jan-01,region near west coast of honshu japan mag utc...
490,"Preliminary: 6.7 earthquake, Near West Coast o...",Jan-01,preliminary earthquake near west coast of hons...
491,Very strong mag. 6.7 #earthquake - Near West C...,Jan-01,very strong mag earthquake near west coast of ...


In [5]:
def extract_locations(text):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [8]:
all_locations = []
for text in tqdm(df['processed_text'], desc="Extracting Locations", total=len(df)):
    locations = extract_locations(text)
    all_locations.extend(locations)

# Print or use the list of all locations
print(all_locations)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Extracting Locations: 100%|██████████████████████████████████████████████████████████| 493/493 [03:51<00:00,  2.13it/s]

['japan', 'أعلنت', 'japan', 'japan', 'japan', 'japan', 'japan', 'saudi arabia', 'japan korea', 'korea', 'manhattan', 'israel', 'miami', 'japan', 'japan', 'japan', 'china', 'indonesia', 'iran', 'japan', 'japan', 'japan', 'japan', 'japan', 'el terremoto', 'north korea', 'south korea', 'japan', 'korea', 'japan', 'north korea', 'china', 'china', 'china', 'japan', 'china', 'japan', 'japan', 'china', 'indonesia', 'iran', 'japan', 'japan', 'osaka', 'bangkok', 'korea', 'japan', 'iran', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'ukraine', 'gaza', 'the red sea', 'japan', 'south korea', 'iran', 'japan', 'japan', 'japan', 'أناميزو', 'japan', 'japan', 'japan', 'japan', 'china', 'indonesia', 'iran', 'japan', 'united states turkiye', 'japan', 'japan', 'japan', 'kitakyushu city', 'japan', 'japantsunami', 'japan', 'japan', 'kitakyushu shopping district', 'japan', 'japan', 'northeast', 'japan', 'japan', 'japan', 'จดหมายปร', 'japan', 'japan', 'nort




In [11]:
locations_df = pd.DataFrame(all_locations, columns=['Location'])

In [13]:
locations_df

Unnamed: 0,Location
0,japan
1,أعلنت
2,japan
3,japan
4,japan
...,...
758,west coast
759,japan
760,west coast
761,japan


In [14]:
locations_df.to_csv('extracted_locations_japan.csv', index=False)

In [15]:
from geopy.geocoders import Nominatim

In [16]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [17]:
get_coordinates('raichur')

(16.083333, 77.166667)

In [18]:
locations_data = []

In [19]:
unique_locations = set(all_locations)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|█████████████████████████████████████████████████████████████| 80/80 [01:43<00:00,  1.29s/it]


In [20]:
locations_coordinate = pd.DataFrame(locations_data)

In [21]:
locations_coordinate

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,rápida,1,-3.467559,-80.233213
1,mahadev,1,24.622499,82.173876
2,delhi,1,28.627393,77.171695
3,the united states,1,47.828163,-122.598365
4,united states turkiye,1,40.759698,-73.920210
...,...,...,...,...
62,china,9,35.000074,104.999927
63,الأولى,1,31.180487,31.200973
64,اپنی,1,43.084762,40.817049
65,southafrica,1,-26.169444,28.194085


In [22]:
locations_coordinate.to_csv('coordinate_japan.csv', index=False)