In [9]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [10]:
csv_file = 'preprocessed_0f_iraq_iran.csv' 

In [23]:
df = pd.read_csv(csv_file)

In [24]:
df

Unnamed: 0,text,date,category,processed_text
0,RT @PressTV: UPDATE: Death toll from Iran’s qu...,13_11_2017,injured_or_dead_people,rt update death toll from iran quake rises to ...
1,RT @CAFOD: We pray for all those affected by t...,13_11_2017,not_humanitarian,rt we pray for all those affected by the earth...
2,RT @ReutersWorld: JUST IN: Death toll reaches ...,13_11_2017,injured_or_dead_people,rt just in death toll reaches in iran earthqua...
3,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,13_11_2017,other_relevant_information,rt magnitude earthquake struck northern iraq i...
4,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,13_11_2017,other_relevant_information,rt magnitude earthquake struck northern iraq i...
...,...,...,...,...
592,Damage proxy maps of 5 cities near the epicent...,18_11_2017,infrastructure_and_utility_damage,damage proxy maps of cities near the epicenter...
593,#Iran Daily: US Treasury Blocks Donations for ...,18_11_2017,rescue_volunteering_or_donation_effort,iran daily us treasury blocks donations for e...
594,#Kurdish children's situation after the #earth...,18_11_2017,affected_individuals,kurdish children situation after the earthqua...
595,#IRAN'S #EARTHQUAKE EXPOSES #POLITICAL RIFTS A...,18_11_2017,other_relevant_information,iran earthquake exposes political rifts and i...


In [27]:
def extract_locations(text):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [29]:
data = {
    'processed_text': [
        "I live in iraq. morocco is a beautiful city.",
        "I visited tehran last year. It's a nice place.",
        "There's a conference in iraq and iran next month.",
        "I'm planning to move to iraq iraq soon."
    ]
}

# Create DataFrame
df1 = pd.DataFrame(data)

In [32]:
all_locations1 = []
for text in tqdm(df1['processed_text'], desc="Extracting Locations", total=len(df1)):
    locations = extract_locations(text)
    all_locations1.extend(locations)

# Print or use the list of all locations
print(all_locations1)

Extracting Locations: 100%|██████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.02it/s]

['iraq', 'morocco', 'iraq', 'iran', 'iraq', 'iraq']





In [33]:
locations_df1 = pd.DataFrame(all_locations1, columns=['Location'])

In [34]:
locations_df1

Unnamed: 0,Location
0,iraq
1,morocco
2,iraq
3,iran
4,iraq
5,iraq


In [35]:
locations_df1.to_csv('extracted_locations1.csv', index=False)

In [36]:
from geopy.geocoders import Nominatim

In [37]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [40]:
get_coordinates('raichur')

(16.083333, 77.166667)

In [42]:
locations_data = []

In [43]:
unique_locations = set(all_locations1)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations1.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|███████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.32s/it]


In [44]:
locations_coordinate1 = pd.DataFrame(locations_data)

In [45]:
locations_coordinate1

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,iran,1,32.647531,54.564352
1,morocco,1,28.334772,-10.371338
2,iraq,4,33.095579,44.174977


In [46]:
locations_coordinate1.to_csv('locations_coordinate1.csv', index=False)

In [28]:
# locations_series = df['processed_text'].apply(extract_locations)
locations_series = []
for text in tqdm(df['processed_text'], desc="Extracting Locations", total=len(df)):
    locations_series.append(extract_locations(text))
# Concatenate all the lists into a single list
all_locations = [location for locations_list in locations_series for location in locations_list]

# Optional: If you want to remove duplicates from the list
all_locations = list(set(all_locations))

# Print or use the list of all locations
print(all_locations)

Extracting Locations: 100%|██████████████████████████████████████████████████████████| 597/597 [05:37<00:00,  1.77it/s]

['mexico', 'oott', 'smart city', 'baghdad', 'us', 'turkey', 'washington', 'dubai', 'iran city', 'tehran', 'uae', 'italy', 'al miqdadiyah', 'shirwana', 'morocco', 'najaf', 'kamran', 'kuwait', 'toronto', 'middle east', 'karzin county', 'kabul', 'bhuj', 'russia', 'buhari', 'kurdistan', 'kermanshah province', 'algeria', 'emsr', 'azerbaijan', 'the middle east', 'costa rica', 'halabja', 'israel', 'pakistan', 'china', 'iran', 'iraq', 'haiti', 'persia', 'japan']





In [None]:
location_df = pd.DataFrame({'Location': all_locations})

# Save the DataFrame to a file (e.g., CSV)
location_df.to_csv('locations.csv', index=False)  # Change the file format and name as needed