In [2]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [9]:
csv_file = 'preprocessed_0f_mexico.csv' 

In [10]:
df = pd.read_csv(csv_file)

In [11]:
df

Unnamed: 0,text,date,category,processed_text
0,RT @FCBarcelona: Our solidarity with the victi...,20_9_2017,not_humanitarian,rt our solidarity with the victims of the eart...
1,Mexico earthquake: Many children killed at pri...,20_9_2017,injured_or_dead_people,mexico earthquake many children killed at prim...
2,Obamaâ€™s Response To The Earthquake In #Mexic...,20_9_2017,not_humanitarian,obamaâ response to the earthquake in mexicocit...
3,RT @AmichaiStein1: #BREAKING: Israel search &a...,20_9_2017,rescue_volunteering_or_donation_effort,rt breaking israel search amp rescue mission h...
4,RT @AmichaiStein1: #BREAKING: Israel search &a...,20_9_2017,rescue_volunteering_or_donation_effort,rt breaking israel search amp rescue mission h...
...,...,...,...,...
1375,Carlos santana donates $100k to mexico earthqu...,06_10_2017,rescue_volunteering_or_donation_effort,carlos santana donates to mexico earthquake re...
1376,Seth troxler donates 10k to earthquake relief ...,06_10_2017,rescue_volunteering_or_donation_effort,seth troxler donates to earthquake relief effo...
1377,Entercom/San Francisco Stations Raise Funds Fo...,06_10_2017,rescue_volunteering_or_donation_effort,entercom san francisco stations raise funds fo...
1378,Mexico Earthquakes | International Medical Cor...,06_10_2017,other_relevant_information,mexico earthquakes international medical corps


In [12]:
def extract_locations(text):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [13]:
all_locations1 = []
for text in tqdm(df['processed_text'], desc="Extracting Locations", total=len(df)):
    locations = extract_locations(text)
    all_locations1.extend(locations)

# Print or use the list of all locations
print(all_locations1)

Extracting Locations: 100%|██████████| 1380/1380 [23:43<00:00,  1.03s/it]

['mexico', 'mexico', 'israel', 'mexico', 'israel', 'mexico', 'israel', 'mexico', 'mexico', 'mexico city', 'mexico', 'mexico', 'mexicoâ city', 'rt mexico city airport', 'mexico city', 'mexico city', 'mexico', 'mexico', 'mexico city', 'mexico city', 'mexico city', 'mexico', 'mexico city', 'mexico', 'mexico', 'mexico', 'rt mexico', 'mexico', 'mexico', 'aá¼ÿ', 'mexico', 'mexico', 'mexico', 'mexico', 'new zealand', 'mexico city', 'mexico city', 'rt mexico', 'central mexico', 'central mexico', 'central mexico', 'central mexico', 'mexico', 'mexico', 'mexico city', 'mexico', 'mexico', 'mexico', 'dona', 'mexico city', 'rt mexico city', 'rt mexico city', 'rt mexico city', 'mexico', 'mexico', 'mexico', 'mexico city', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico', 'mexico city', 'mexico', 'mexico', 'mexico city', 'mexico', 'jj_worldtour mexico', 'mexico', 'jj_worldtour mexico', 'mexico', 'jj_worldtour mexico', 'mexico', 'jj_worldtour mexico', 'mexico', 'm




In [15]:
locations_df1 = pd.DataFrame(all_locations1, columns=['Location'])

In [16]:
locations_df1

Unnamed: 0,Location
0,mexico
1,mexico
2,israel
3,mexico
4,israel
...,...
1234,mexico
1235,san francisco
1236,mexico
1237,mexico


In [17]:
locations_df1.to_csv('extracted_locations_mexico.csv', index=False)

In [18]:
from geopy.geocoders import Nominatim

In [19]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [20]:
locations_data = []

In [21]:
unique_locations = set(all_locations1)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations1.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|██████████| 122/122 [02:08<00:00,  1.05s/it]


In [22]:
locations_coordinate1 = pd.DataFrame(locations_data)

In [23]:
locations_coordinate1

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,kuwait,1,29.379653,47.973417
1,jojutla mexico,2,18.600025,-99.229972
2,new zealand,1,-41.500083,172.834408
3,us,1,39.783730,-100.445882
4,deafâ,1,52.748620,-8.029006
...,...,...,...,...
93,benito,1,36.821616,-120.436283
94,costa rica,1,10.273563,-84.073910
95,springfield,2,39.799017,-89.643957
96,america,1,39.783730,-100.445882


In [24]:
locations_coordinate1.to_csv('coordinate_mexico.csv', index=False)