In [1]:
import pandas as pd
import re

full_data = pd.read_csv('./archive/tweets.csv', low_memory=False)

only_english = full_data[full_data['language'] == 'en']

# Remove the source and isVerified column

only_english = only_english.drop(['source'], axis=1)
only_english = only_english.drop(['isVerified'], axis=1)
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
    text = re.sub(r'#', '', text) # remove hashtags
    text = re.sub(r'\s+', ' ', text) # remove extra whitespace
    return text

only_english['content'] = only_english['content'].apply(clean_text)

# Transform the date column to datetime format
only_english['date'] = pd.to_datetime(only_english['date'])
only_english['date'] = only_english['date'].dt.strftime('%Y-%m-%d %H:%M:%S') # stolen from main file


# print the number of rows and columns
print(only_english.shape)
only_english.head()



(189626, 9)


Unnamed: 0,date,content,hashtags,like_count,rt_count,followers_count,language,coordinates,place
1,2023-02-21 03:29:07,New search &amp; rescue work is in progress in...,"['Hatay', 'earthquakes', 'Türkiye', 'TurkiyeQu...",1.0,0.0,5697.0,en,,
2,2023-02-21 03:29:04,Can't imagine those who still haven't recovere...,"['Turkey', 'earthquake', 'turkeyearthquake2023...",0.0,0.0,1.0,en,,
3,2023-02-21 03:28:06,its a highkey sign for all of us to ponder ove...,"['turkeyearthquake2023', 'earthquake', 'Syria']",0.0,0.0,3.0,en,,
5,2023-02-21 03:27:27,"See how strong was the Earthquake of Feb 20, 2...","['Earthquake', 'Hatay', 'Turkey', 'turkeyearth...",0.0,0.0,21836.0,en,,
6,2023-02-21 03:27:11,More difficult news today on top of struggles ...,"['Türkiye', 'Syria', 'earthquake', 'Canadians']",1.0,0.0,675.0,en,,


In [43]:
from geotext import GeoText

def dest_text(text):
    places = GeoText(text).cities
    if places:
        return places[0]
    else:
        return None

only_english['city_mention'] = only_english['content'].apply(dest_text)
only_english.dropna(subset=['city_mention'], inplace=True)

only_english.head()

only_english.to_csv('./archive/only_english.csv', index=False)

In [44]:
from tqdm import tqdm
from geopy.geocoders import Nominatim #for geocoding


data = pd.read_csv("./archive/only_english.csv")

# Create a new column with the distance from location after finding *km or *miles in text.
data['distance'] = data['content'].apply(lambda x: re.findall(r'\d+km|\d+miles', x))

# Extract the number from the distance column and convert it to float.
data['distance'] = data['distance'].apply(lambda x: float(re.findall(r'\d+', x[0])[0]) if len(x) > 0 else None)

# Create a geolocator object with a custom user_agent
geolocator = Nominatim(user_agent="my-custom-user-agent")

# Define a function to get the coordinates of a location
def get_coordinates(row):
    try:
        # Use geolocator to get the location's coordinates
        location = geolocator.geocode(row['city_mention'])
        return pd.Series({'latitude': location.latitude, 'longitude': location.longitude})
    except:
        return pd.Series({'latitude': None, 'longitude': None})

# Remove duplicate city names within the array
seen_cities = set()
data['city_mention'] = data['city_mention'].apply(lambda x: x if x not in seen_cities else None if None else seen_cities.add(x) or x)

# Apply the get_coordinates function to the city column to create a new coordinates column
tqdm.pandas(desc="Geocoding progress")
data[['latitude', 'longitude']] = data.progress_apply(lambda row: get_coordinates(row), axis=1)

# Print the amount of tweets with location and distance.
print(f"Amount of tweets with coordinates: {len(data[data['latitude'].notnull()])}")
print(f"Amount of tweets with distance: {len(data[data['distance'].notnull()])}")

# save the data to a new csv file called "full_data.csv"
data.to_csv("./archive/english_data_with_locations.csv", index=False)

Geocoding progress: 100%|██████████| 17942/17942 [2:29:31<00:00,  2.00it/s]  


Amount of tweets with coordinates: 17937
Amount of tweets with distance: 348


In [4]:
# Combine files translated dataset 1 and 2 

data1 = pd.read_csv("translated_dataset3.csv")
data2 = pd.read_csv("translated_dataset3_2.csv")

full_data = pd.concat([data1, data2], ignore_index=True)
full_data.to_csv("full_data_with_english.csv", index=False)
