In [35]:
#pip install geopy

In [1]:
import pandas as pd
import os

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

In [10]:
# List of input CSV file paths

input_files = [
    r"output_files\a_son_at_the_front_gpe.csv",
    r"output_files\summer_gpe.csv",
    r"output_files\the_age_of_innocence_gpe.csv",
    r"output_files\the_children_gpe.csv", 
    r"output_files\the_custom_of_the_country_gpe.csv",
    r"output_files\the_fruit_of_the_tree_gpe.csv",
    r"output_files\the_glimpses_of_the_moon_gpe.csv",
    r"output_files\the_house_of_mirth_gpe.csv",
    r"output_files\the_mother_recompense_gpe.csv",
    r"output_files\the_reef_gpe.csv", 
    r"output_files\the_valley_of_decision_gpe.csv",
    r"output_files\twilight_sleep_gpe.csv"
]

In [11]:
pwd

'C:\\Users\\usuario\\ELENA\\it-training uzh\\it-training uzh\\Python Data Analytics Essentials\\Edith Warton\\entities\\GPE'

In [12]:
output_dir = "filtered_gpe"
os.makedirs(output_dir, exist_ok=True)

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    df_b.to_csv(output_path, index=False)

So now we have all our GPEs saved in the directory "filtered gpe". Now let's combine all of them into a unique csv dataframe

In [13]:
all_dfs = []

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    all_dfs.append(df_b)

# Concatenate all dataframes into one
combined_df = pd.concat(all_dfs, ignore_index=True)

In [14]:
combined_df

Unnamed: 0,text,Titles
0,Montmartre,a_son_at_the_front
1,America,a_son_at_the_front
2,England,a_son_at_the_front
3,London,a_son_at_the_front
4,Paris,a_son_at_the_front
...,...,...
953,Ceylon,twilight_sleep
954,India,twilight_sleep
955,Egypt,twilight_sleep
956,Cairo,twilight_sleep


Now let's substitute those titles for ones that look better

In [15]:
# Example mapping dictionary
title_map = {
    "a_son_at_the_front" : "A Son at the Front", 
    "summer" : "Summer", 
    "the_age_of_innocence" : "The Age of Innocence",
    "the_children" : "The Children",
    "the_custom_of_the_country" :  "The Custom of the Country",
    "the_fruit_of_the_tree" : "The Fruit of the Tree",
    "the_glimpses_of_the_moon" : "The Glimpses of the Moon",
    "the_house_of_mirth" : "The House of Mirth",
    "the_mother_recompense" : "The Mother Recompense",
    "the_reef" : "The Reef",
    "the_valley_of_decision" : "The Valley of Decision",
    "twilight_sleep" : "Twilight Sleep"
}

# Replace the titles using .replace with the mapping
combined_df["Titles"] = combined_df["Titles"].replace(title_map)


In [16]:
combined_df

Unnamed: 0,text,Titles
0,Montmartre,A Son at the Front
1,America,A Son at the Front
2,England,A Son at the Front
3,London,A Son at the Front
4,Paris,A Son at the Front
...,...,...
953,Ceylon,Twilight Sleep
954,India,Twilight Sleep
955,Egypt,Twilight Sleep
956,Cairo,Twilight Sleep


We are going to use the Python Library "Geopy" (https://geopy.readthedocs.io/en/stable/) to add the coordinates (LAT/LON) of the text column.

In [18]:
geolocator = Nominatim(user_agent="my_geo_app")

We can also see that geopy outputs the coordinates of the capital of "Great Britain" (London), if the name of a country is provided. That is great, as we will have many examples of that in our full data analysis. 

Let's now produce a csv dataframe with all the lattitudes and longitudes of our datafarme.

In [21]:
final_locations = combined_df["text"].to_list()

In [22]:
# Create empty lists to store results
latitudes = []
longitudes = []

for loc in final_locations:
    try:
        location = geolocator.geocode(loc)
        if location:
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)
        else:
            latitudes.append(None)
            longitudes.append(None)
        print(f"Processed {loc}: lat={latitudes[-1]}, lon={longitudes[-1]}")
    except Exception as e:
        latitudes.append(None)
        longitudes.append(None)
        print(f"Error on {loc}: {e}")
    time.sleep(1)

Processed Montmartre: lat=48.8854619, lon=2.3391535
Processed America: lat=39.7837304, lon=-100.445882
Processed England: lat=52.5310214, lon=-1.2649062
Processed London: lat=51.5074456, lon=-0.1277653
Processed Paris: lat=48.8588897, lon=2.320041
Processed Deauville: lat=49.3595018, lon=0.0746638
Processed Aix: lat=50.4912838, lon=3.2907097
Processed Berlin: lat=52.5173885, lon=13.3951309
Processed St. Moritz: lat=46.4978958, lon=9.8392428
Processed Sicily: lat=37.587794, lon=14.155048
Processed North Africa: lat=41.362599, lon=2.159665
Processed Lille: lat=50.6365654, lon=3.0635282
Processed French: lat=46.603354, lon=1.8883335
Processed Africa: lat=11.5024338, lon=17.7578122
Processed France: lat=46.603354, lon=1.8883335
Processed Sedan: lat=49.7033759, lon=4.9433409
Processed Palermo: lat=38.1112268, lon=13.3524434
Processed New York: lat=40.7127281, lon=-74.0060152
Processed China: lat=35.000074, lon=104.999927
Processed 1870: lat=14.6003268, lon=121.1640697
Processed Madrid: lat=

Processed Newlands: lat=-33.9742832, lon=18.4572951
Processed Lausanne: lat=46.5218269, lon=6.6327025
Processed Luyden: lat=None, lon=None
Processed Saratoga: lat=43.0833231, lon=-73.8712155
Processed Washington Square: lat=39.9470379, lon=-75.1523257
Processed Manhattan: lat=40.7896239, lon=-73.9598939
Processed Chippendale: lat=-33.8863291, lon=151.1998211
Processed Baltimore: lat=39.2908816, lon=-76.610759
Processed Maryland: lat=39.5162401, lon=-76.9382069
Processed St. Austrey: lat=52.6566941, lon=-1.5644977
Processed Trevenna: lat=-25.7491891, lon=28.2010395
Processed Cornwall: lat=50.416667, lon=-4.75
Processed Gloucestershire: lat=51.7643786, lon=-2.1880662
Processed Skuytercliff: lat=None, lon=None
Processed Patroon: lat=31.6229546, lon=-93.9810245
Processed Russia: lat=64.6863136, lon=97.7453061
Processed Italy: lat=42.6384261, lon=12.674297
Processed Manzoni: lat=41.8901796, lon=12.5068166
Processed Florence: lat=43.7697955, lon=11.2556404
Processed Cowes: lat=50.7633176, lo

Processed Dinard: lat=48.6320379, lon=-2.0580178
Processed Versailles: lat=48.8035403, lon=2.1266886
Processed Rio: lat=-22.9110137, lon=-43.2093727
Processed Liverpool: lat=53.4071991, lon=-2.99168
Processed Bordeaux: lat=44.841225, lon=-0.5800364
Processed Cibour: lat=None, lon=None
Processed Hendaye: lat=43.3641518, lon=-1.7616499
Processed Chamonix: lat=45.9246705, lon=6.8727506
Processed Apex City: lat=-26.2141266, lon=28.3302017
Processed New York: lat=40.7127281, lon=-74.0060152
Processed Apex: lat=35.7325352, lon=-78.8505516
Processed Atalanta: lat=-27.421916, lon=-49.778863
Processed England: lat=52.5310214, lon=-1.2649062
Processed York: lat=53.9656579, lon=-1.0743052
Processed Cracow: lat=50.0469432, lon=19.9971534
Processed Georgia: lat=32.3293809, lon=-83.1137366
Processed Nebraska: lat=41.7370229, lon=-99.5873816
Processed Berlin: lat=52.5173885, lon=13.3951309
Processed Cavaleeria: lat=None, lon=None
Processed Europe: lat=51.0, lon=10.0
Processed California: lat=36.70146

Processed Lakewood: lat=39.7085736, lon=-105.0846694
Processed Quebec: lat=52.4760892, lon=-71.8258668
Processed Montreal: lat=45.5031824, lon=-73.5698065
Processed Saranac: lat=44.651559, lon=-73.743668
Processed Hanaford----: lat=37.9571937, lon=-88.8405465
Processed Michigan: lat=43.6211955, lon=-84.6824346
Processed Chicago: lat=41.8755616, lon=-87.6244212
Processed Versailles: lat=48.8035403, lon=2.1266886
Processed Monte Carlo: lat=43.7402961, lon=7.426559
Processed Paris: lat=48.8588897, lon=2.320041
Processed Como: lat=45.9394759, lon=9.1494101
Processed Newport: lat=41.4899827, lon=-71.3137707
Processed California: lat=36.7014631, lon=-118.755997
Processed Canada: lat=61.0666922, lon=-107.991707
Processed Florida: lat=27.7567667, lon=-81.4639835
Processed New York: lat=40.7127281, lon=-74.0060152
Processed New Hampshire: lat=43.4849133, lon=-71.6553992
Processed India: lat=22.3511148, lon=78.6677428
Processed Venice: lat=45.4371908, lon=12.3345898
Processed Milan: lat=45.46419

Processed Quebec: lat=52.4760892, lon=-71.8258668
Processed France: lat=46.603354, lon=1.8883335
Processed Brooklyn: lat=40.6526006, lon=-73.9497211
Processed Bridgeport: lat=41.1792695, lon=-73.1887863
Processed San Francisco: lat=37.7792588, lon=-122.4193286
Processed Philadelphia: lat=39.9527237, lon=-75.1635262
Processed Albany: lat=41.000028, lon=19.9999619
Processed Taormina: lat=37.8512218, lon=15.2830191
Processed Undergrounds: lat=51.1330306, lon=23.4712833
Processed Elevateds: lat=None, lon=None
Processed Paris: lat=48.8534951, lon=2.3483915
Processed Spain: lat=39.3260685, lon=-4.8379791
Processed Italy: lat=42.6384261, lon=12.674297
Processed Rome: lat=41.8933203, lon=12.4829321
Processed Bethesda: lat=38.9812726, lon=-77.1233587
Processed New _ York: lat=40.7127281, lon=-74.0060152
Processed Meridia: lat=36.1510793, lon=-95.9886215
Processed Europe: lat=51.0, lon=10.0
Processed Brooklyn Bridge: lat=40.7062175, lon=-73.9970208
Processed Lilla: lat=50.6365654, lon=3.0635282


Processed Padua: lat=45.4077172, lon=11.8734455
Processed Syria: lat=34.6401861, lon=39.0494106
Processed Giorgione: lat=43.9935489, lon=11.9599467
Processed Ucalegon: lat=None, lon=None
Processed Heiligenstern: lat=None, lon=None
Processed Innsbruck: lat=47.2654296, lon=11.3927685
Processed Pomerania: lat=53.7735064, lon=12.5755471
Processed Ratisbon: lat=49.0195333, lon=12.0974869
Processed Ravenna: lat=44.3640607, lon=12.0590095
Processed Posilipo: lat=34.4204007, lon=-119.6254464
Processed Capri: lat=40.5488429, lon=14.2283708
Processed Vesuvius: lat=40.8213963, lon=14.4261967
Processed Pompeii: lat=40.7517375, lon=14.4905636
Processed Gamba: lat=28.275194, lon=88.5133005
Processed Procida: lat=40.7649991, lon=14.0239063
Processed Sorrento: lat=40.624906, lon=14.374836
Processed Germany: lat=51.1638175, lon=10.4478313
Processed Venetia: lat=45.4371908, lon=12.3345898
Processed America: lat=39.7837304, lon=-100.445882
Processed Camaldoli: lat=40.6526226, lon=15.1751616
Processed Gar

In [23]:
print(len(latitudes))
print(len(longitudes))
print(len(final_locations))

958
958
958


In [24]:
# Optionally, add latitudes and longitudes back to a DataFrame column for further use
df = pd.DataFrame({'Location': final_locations, 'Latitude': latitudes, 'Longitude': longitudes})

In [25]:
df

Unnamed: 0,Location,Latitude,Longitude
0,Montmartre,48.885462,2.339154
1,America,39.783730,-100.445882
2,England,52.531021,-1.264906
3,London,51.507446,-0.127765
4,Paris,48.858890,2.320041
...,...,...,...
953,Ceylon,43.533567,-94.631646
954,India,22.351115,78.667743
955,Egypt,26.254049,29.267547
956,Cairo,30.044388,31.235726


In [26]:
titles = combined_df["Titles"].to_list()

In [27]:
df["Titles"] = titles

In [28]:
df

Unnamed: 0,Location,Latitude,Longitude,Titles
0,Montmartre,48.885462,2.339154,A Son at the Front
1,America,39.783730,-100.445882,A Son at the Front
2,England,52.531021,-1.264906,A Son at the Front
3,London,51.507446,-0.127765,A Son at the Front
4,Paris,48.858890,2.320041,A Son at the Front
...,...,...,...,...
953,Ceylon,43.533567,-94.631646,Twilight Sleep
954,India,22.351115,78.667743,Twilight Sleep
955,Egypt,26.254049,29.267547,Twilight Sleep
956,Cairo,30.044388,31.235726,Twilight Sleep


In [29]:
df.to_csv("wharton_coordinates.csv")