In [19]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [20]:
csv_file = 'preprocessed_0f_iraq_iran.csv' 

In [21]:
df1 = pd.read_csv(csv_file)

In [22]:
df1

Unnamed: 0,text,date,category,processed_text
0,RT @PressTV: UPDATE: Death toll from Iran’s qu...,13_11_2017,injured_or_dead_people,rt update death toll from iran quake rises to ...
1,RT @CAFOD: We pray for all those affected by t...,13_11_2017,not_humanitarian,rt we pray for all those affected by the earth...
2,RT @ReutersWorld: JUST IN: Death toll reaches ...,13_11_2017,injured_or_dead_people,rt just in death toll reaches in iran earthqua...
3,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,13_11_2017,other_relevant_information,rt magnitude earthquake struck northern iraq i...
4,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,13_11_2017,other_relevant_information,rt magnitude earthquake struck northern iraq i...
...,...,...,...,...
592,Damage proxy maps of 5 cities near the epicent...,18_11_2017,infrastructure_and_utility_damage,damage proxy maps of cities near the epicenter...
593,#Iran Daily: US Treasury Blocks Donations for ...,18_11_2017,rescue_volunteering_or_donation_effort,iran daily us treasury blocks donations for e...
594,#Kurdish children's situation after the #earth...,18_11_2017,affected_individuals,kurdish children situation after the earthqua...
595,#IRAN'S #EARTHQUAKE EXPOSES #POLITICAL RIFTS A...,18_11_2017,other_relevant_information,iran earthquake exposes political rifts and i...


In [23]:
def extract_locations(text):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [24]:
all_locations1 = []
for text in tqdm(df1['processed_text'], desc="Extracting Locations", total=len(df1)):
    locations = extract_locations(text)
    all_locations1.extend(locations)

# Print or use the list of all locations
print(all_locations1)

Extracting Locations: 100%|██████████████████████████████████████████████████████████| 597/597 [05:24<00:00,  1.84it/s]

['iran', 'iraq', 'iran', 'iran', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iraq', 'iran', 'iraq', 'iran', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'turkey', 'iraq', 'iran', 'iraq', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iran', 'iraq', 'iran', 'iraq', 'uae', 'iraq', 'iraq', 'iran', 'iran', 'iraq', 'iraq', 'iran', 'najaf', 'iraq', 'iran', 'baghdad', 'iran', 'iraq', 'iran', 'iraq', 'iraq', 'iran', 'iran', 'iran', 'iraq', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iraq', 'pakistan', 'iran', 'iraq', 'kuwait', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'oott', 'iraq', 'halabja', 'iraq', 'iran', 'iraq', 'iraq', 'iran', 'iran', 'iraq', 'iraq', 'iran', 'bhuj', 'haiti', 'iraq', 'iraq', 'iraq', 'kurdistan', 'kurdistan', 'kurdistan', 'kurdistan', 'iran', 'iran', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iran', 'iraq', 'iraq', 'iran', 'the middle east', '




In [25]:
locations_df1 = pd.DataFrame(all_locations1, columns=['Location'])

In [26]:
locations_df1

Unnamed: 0,Location
0,iran
1,iraq
2,iran
3,iran
4,iraq
...,...
827,iran
828,iran
829,iran
830,iran


In [27]:
locations_df1.to_csv('extracted_locations_iraq_iran.csv', index=False)

In [28]:
from geopy.geocoders import Nominatim

In [29]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [30]:
get_coordinates('raichur')

(16.083333, 77.166667)

In [31]:
locations_data = []

In [32]:
unique_locations = set(all_locations1)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations1.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|█████████████████████████████████████████████████████████████| 41/41 [00:44<00:00,  1.10s/it]


In [33]:
locations_coordinate1 = pd.DataFrame(locations_data)

In [34]:
locations_coordinate1

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,smart city,1,47.079588,15.410957
1,turkey,3,38.959759,34.924965
2,washington,1,38.895037,-77.036543
3,kermanshah province,6,34.486732,46.808727
4,japan,1,36.574844,139.239418
5,baghdad,1,33.30617,44.387221
6,halabja,4,35.179206,45.987368
7,najaf,1,32.001023,44.329993
8,israel,11,30.812425,34.859476
9,the middle east,1,35.869605,-84.121049


In [35]:
locations_coordinate1.to_csv('coordinate_iraq_iran.csv', index=False)