In [1]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [2]:
csv_file = 'preprocessed_0f_japan.csv' 

In [3]:
df = pd.read_csv(csv_file)

In [4]:
df

Unnamed: 0,text,date,processed_text
0,\Japan earthquake: Nearly 250 missing as hope ...,Jan-07,japan earthquake nearly missing as hope for s...
1,Replying to @aviationbrk,Jan-07,replying to
2,ارتفاع حصيلة ضحايا زلزال اليابان إلى 326 قتيلا...,Jan-06,ارتفاع حصيلة ضحايا زلزال اليابان إلى قتيلا ومف...
3,Replying to @rawsalerts,Jan-06,replying to
4,Informative and Helpful article by the founder...,Jan-06,informative and helpful article by the founder...
...,...,...,...
488,M7.4 earthquake / 7 on Japanese shindo scale /...,Jan-01,earthquake on japanese shindo scale ishikawa ...
489,"Region: Near West Coast of Honshu, Japan Mag: ...",Jan-01,region near west coast of honshu japan mag utc...
490,"Preliminary: 6.7 earthquake, Near West Coast o...",Jan-01,preliminary earthquake near west coast of hons...
491,Very strong mag. 6.7 #earthquake - Near West C...,Jan-01,very strong mag earthquake near west coast of ...


In [5]:
def extract_locations(text):
    nlp = spacy.load("0fner_model_custom_3")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [6]:
all_locations1 = []
flags=[]
for text in tqdm(df['processed_text'], desc="Extracting Locations", total=len(df)):
    locations = extract_locations(text)
    all_locations1.extend(locations)
    
    # Set the flag to 1 if locations were found, otherwise 0
    if locations:
        flags.append(1)
    else:
        flags.append(0)

# Add the flag column to the DataFrame
df['flag1'] = flags
# Print or use the list of all locations
print(all_locations1)

Extracting Locations:   0%|          | 0/493 [00:00<?, ?it/s]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Extracting Locations: 100%|██████████| 493/493 [04:15<00:00,  1.93it/s]

['japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'gaza', 'japan', 'japan jan', 'japan', 'japan', 'japan', 'japan noto peninsula', 'japan', 'japan', 'japan hanaka airport harvard', 'japanearthquake', 'japan', 'japan tokyo', 'japan', 'japan tokyo', 'japan south korean opposition leader stabbed russian missile strikes', 'tokyo', 'japan', 'tokyo', 'tokyo', 'مساء_الخير_والسعاده زد_رصيدك الاسعار_الجديدة', 'البحر_الأحمر العام_الجديد', 'japan', 'tokyo', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'asia', 'japan japanearthquake', 'japan', 'japan', 'اليمن_ينتصر_لفلسطين', 'jesusiscomingsoon jesusislord jesussaves', 'japan', 'japan', 'japan', 'japan', 'japan', 'japan', 'kanazawa', 'kanazawa', 'japan', 'japan', 'japan', 'japan', 'japan japan', 'japan', 'japan', 'japan japan', 'japan', 'japan', 'kanazawa', 'gaza', 'japan suffering', 'japan', 'japan', 'kanazawa', 'japan', 'japan', 'japan', 'japan', 'japan japan japon earthquake japan', 'japan', 'japan', 'japan', 'jap




In [7]:
count_ones = df['flag1'].sum()

In [8]:
count_ones

95

In [9]:
locations_df = pd.DataFrame(all_locations1, columns=['Location'])

In [10]:
locations_df

Unnamed: 0,Location
0,japan
1,japan
2,japan
3,japan
4,japan
...,...
100,japan
101,japan
102,japan
103,japan


In [11]:
locations_df.to_csv('extracted_locations_japan.csv', index=False)

In [12]:
from geopy.geocoders import Nominatim

In [13]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [17]:
get_coordinates('raichur')

(16.083333, 77.166667)

In [14]:
locations_data = []

In [15]:
unique_locations = set(all_locations)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


NameError: name 'all_locations' is not defined

In [20]:
locations_coordinate = pd.DataFrame(locations_data)

In [21]:
locations_coordinate

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,rápida,1,-3.467559,-80.233213
1,mahadev,1,24.622499,82.173876
2,delhi,1,28.627393,77.171695
3,the united states,1,47.828163,-122.598365
4,united states turkiye,1,40.759698,-73.920210
...,...,...,...,...
62,china,9,35.000074,104.999927
63,الأولى,1,31.180487,31.200973
64,اپنی,1,43.084762,40.817049
65,southafrica,1,-26.169444,28.194085


In [22]:
locations_coordinate.to_csv('coordinate_japan.csv', index=False)