In [1]:
import pandas as pd
import spacy
from tqdm import tqdm
from spacy.matcher import PhraseMatcher

In [2]:
csv_file = 'preprocessed_0f_turkey.csv' 

In [3]:
df = pd.read_csv(csv_file)

In [4]:
df

Unnamed: 0,text,date,processed_text
0,Hey Turkish users are getting a rate limit ...,2023-02-06 13:04:30+00:00,hey turkish users are getting rate limit excee...
1,May Allah have mercy on all those who passed a...,2023-02-06 13:04:30+00:00,may allah have mercy on all those who passed a...
2,We Stand with Turkey,2023-02-06 13:04:31+00:00,we stand with turkey
3,🔔 ( M3.9 occurred 21 km E of ( 37 min ago (lo...,2023-02-06 13:04:33+00:00,occurred km of min ago local time smile more ...
4,🚨 Former Newcastle United star Christian Atsu ...,2023-02-06 13:04:33+00:00,former newcastle united star christian atsu i...
...,...,...,...
36772,Massive thanks to you and the entire team fo...,2023-02-10 23:57:03+00:00,massive thanks to you and the entire team for...
36773,"PLEASE donate to AHBAP charity, the only chari...",2023-02-10 23:58:12+00:00,please donate to ahbap charity the only charit...
36774,"🙏😭😭😭I thought my heart would burst from grief,...",2023-02-10 23:58:20+00:00,thought my heart would burst from grief tears...
36775,Alhamdulilllah they are safe and sound ❤️ ...,2023-02-10 23:58:21+00:00,alhamdulillah they are safe and sound


In [5]:
def extract_locations(text):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # Add custom location names to the matcher
    location_patterns = []
    for loc in location_patterns:
        matcher.add("LOCATION", None, nlp(loc))

    doc = nlp(text)
    locations = []
    
    # Extract locations using NER
    for ent in doc.ents:
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            locations.append(ent.text)

    # Extract additional locations using the custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        locations.append(span.text)
    
    return locations

In [6]:
all_locations1 = []
flags=[]
for text in tqdm(df['processed_text'], desc="Extracting Locations", total=len(df)):
    locations = extract_locations(text)
    all_locations1.extend(locations)
    
    # Set the flag to 1 if locations were found, otherwise 0
    if locations:
        flags.append(1)
    else:
        flags.append(0)

# Add the flag column to the DataFrame
df['flag1'] = flags
# Print or use the list of all locations
print(all_locations1)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Extracting Locations:   1%|          | 245/36777 [02:09<5:21:07,  1.90it/s]


KeyboardInterrupt: 

In [12]:
locations_df1 = pd.DataFrame(all_locations1, columns=['Location'])

In [13]:
locations_df1

Unnamed: 0,Location
0,turkey
1,oha
2,turkey
3,israel
4,turkey
...,...
2619,syria
2620,turkey
2621,syria
2622,turkey


In [14]:
locations_df1.to_csv('extracted_locations_turkey.csv', index=False)

In [15]:
from geopy.geocoders import Nominatim

In [16]:
def get_coordinates(location):
    geolocator = Nominatim(user_agent="location_finder")
    try:
        location_info = geolocator.geocode(location)
        if location_info:
            return location_info.latitude, location_info.longitude
        else:
            return None, None
    except:
        return None, None

In [17]:
locations_data = []

In [18]:
unique_locations = set(all_locations1)
for location in tqdm(unique_locations, desc="Geocoding Locations"):
    frequency = all_locations1.count(location)
    latitude, longitude = get_coordinates(location)
    if latitude is not None and longitude is not None:
        locations_data.append({'Location': location, 'Frequency': frequency, 'Latitude': latitude, 'Longitude': longitude})


Geocoding Locations: 100%|███████████████████████████████████████████████████████████| 174/174 [02:36<00:00,  1.11it/s]


In [19]:
locations_coordinate1 = pd.DataFrame(locations_data)

In [20]:
locations_coordinate1

Unnamed: 0,Location,Frequency,Latitude,Longitude
0,baghdad,1,33.306170,44.387221
1,the middle east,5,35.869605,-84.121049
2,middle east,7,39.301416,-76.588848
3,al arabiya,1,17.015257,54.093151
4,california,4,36.701463,-118.755997
...,...,...,...,...
142,japan,5,36.574844,139.239418
143,bhai,1,32.057482,76.275716
144,rojava,1,24.168196,-13.892143
145,canada,4,61.066692,-107.991707


In [22]:
locations_coordinate1.to_csv('coordinate_turkey.csv', index=False)