In [1]:
#pip install geopy

In [2]:
import pandas as pd
import os

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

In [11]:
# List of input CSV file paths

input_files = [
    r"output_files\a_dark_nights_work_gpe.csv",
    r"output_files\cranford_gpe.csv",
    r"output_files\mary_barton_gpe.csv",
    r"output_files\my_lady_ludlow_gpe.csv", 
    r"output_files\north_and_south_gpe.csv",
    r"output_files\ruth_gpe.csv",
    r"output_files\sylvia_lovers_gpe.csv",
    r"output_files\wives_and_daughters_gpe.csv"
]

In [12]:
pwd

'C:\\Users\\usuario\\ELENA\\it-training uzh\\it-training uzh\\Python Data Analytics Essentials\\Elizabeth Gaskell\\Entities\\GPE'

In [13]:
output_dir = "filtered_gpe"
os.makedirs(output_dir, exist_ok=True)

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    df_b.to_csv(output_path, index=False)

So now we have all our GPEs saved in the directory "filtered gpe". Now let's combine all of them into a unique csv dataframe

In [14]:
all_dfs = []

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    all_dfs.append(df_b)

# Concatenate all dataframes into one
combined_df = pd.concat(all_dfs, ignore_index=True)

In [15]:
combined_df

Unnamed: 0,text,Titles
0,Hamley,a_dark_nights_work
1,Eton,a_dark_nights_work
2,London,a_dark_nights_work
3,#¿NOMBRE?,a_dark_nights_work
4,Paris,a_dark_nights_work
...,...,...
510,Atherstone,wives_and_daughters
511,Carthage,wives_and_daughters
512,Edinburgh,wives_and_daughters
513,Sussex Place,wives_and_daughters


Now let's substitute those titles for ones that look better

In [16]:
# Example mapping dictionary
title_map = {
    "a_dark_nights_work" : "A Dark Nights Work", 
    "cranford_gpe" : "Cranford",
    "mary_barton" : "Mary Barton",
    "my_lady_ludlow" : "My Lady Ludlow",
    "north_and_south" : "North and South", 
    "ruth" : "Ruth", 
    "sylvia_lovers" : "Sylvia Lovers",
    "wives_and_daughters" : "Wives and Daughters"    
}

# Replace the titles using .replace with the mapping
combined_df["Titles"] = combined_df["Titles"].replace(title_map)


In [17]:
combined_df

Unnamed: 0,text,Titles
0,Hamley,A Dark Nights Work
1,Eton,A Dark Nights Work
2,London,A Dark Nights Work
3,#¿NOMBRE?,A Dark Nights Work
4,Paris,A Dark Nights Work
...,...,...
510,Atherstone,Wives and Daughters
511,Carthage,Wives and Daughters
512,Edinburgh,Wives and Daughters
513,Sussex Place,Wives and Daughters


We are going to use the Python Library "Geopy" (https://geopy.readthedocs.io/en/stable/) to add the coordinates (LAT/LON) of the text column.

In [18]:
geolocator = Nominatim(user_agent="my_geo_app")

We can also see that geopy outputs the coordinates of the capital of "Great Britain" (London), if the name of a country is provided. That is great, as we will have many examples of that in our full data analysis. 

Let's now produce a csv dataframe with all the lattitudes and longitudes of our datafarme.

In [19]:
final_locations = combined_df["text"].to_list()

In [20]:
# Create empty lists to store results
latitudes = []
longitudes = []

for loc in final_locations:
    try:
        location = geolocator.geocode(loc)
        if location:
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)
        else:
            latitudes.append(None)
            longitudes.append(None)
        print(f"Processed {loc}: lat={latitudes[-1]}, lon={longitudes[-1]}")
    except Exception as e:
        latitudes.append(None)
        longitudes.append(None)
        print(f"Error on {loc}: {e}")
    time.sleep(1)

Processed Hamley: lat=-34.0817399, lon=137.599391
Processed Eton: lat=51.4885381, lon=-0.6091736
Processed London: lat=51.5074456, lon=-0.1277653
Processed #¿NOMBRE?: lat=9.5821089, lon=-79.4688881
Processed Paris: lat=48.8588897, lon=2.320041
Processed Rome: lat=41.8933203, lon=12.4829321
Processed England: lat=52.5310214, lon=-1.2649062
Processed Scotland: lat=56.7861112, lon=-4.1140518
Processed South Wales: lat=51.6808582, lon=-3.1305139
Processed Cambridge: lat=52.2055314, lon=0.1186637
Processed Ellinor: lat=38.392241, lon=-96.4319914
Processed Shropshire: lat=52.6523394, lon=-2.6435641
Processed Hartwell: lat=34.3528825, lon=-82.932087
Processed Yorkshire: lat=53.9825271, lon=-1.38525
Processed America: lat=39.7837304, lon=-100.445882
Processed Liverpool: lat=53.4071991, lon=-2.99168
Processed Easter: lat=34.6456223, lon=-102.3968676
Processed Hinton: lat=53.3990512, lon=-117.5886772
Processed August: lat=37.979919, lon=-121.2621572
Processed Westley: lat=37.5504905, lon=-121.21

Processed Canada: lat=61.0666922, lon=-107.991707
Processed at Toronto , in Canada: lat=47.5954626, lon=-52.7124159
Processed Toronto: lat=43.6534817, lon=-79.3839347
Processed Merica: lat=39.7837304, lon=-100.445882
Processed Scotland: lat=56.7861112, lon=-4.1140518
Processed London: lat=51.5074456, lon=-0.1277653
Processed Connington: lat=31.5722356, lon=-106.2168014
Processed Cavistock: lat=None, lon=None
Processed Hanbury Court: lat=-37.9875809, lon=145.1568973
Processed England: lat=52.5310214, lon=-1.2649062
Processed called,â€”then: lat=None, lon=None
Processed Shrewsbury: lat=52.707755, lon=-2.7540658
Processed Oakfield: lat=46.0989408, lon=-68.1500246
Processed Hanbury: lat=52.2688447, lon=-2.0527598
Processed Weymouth: lat=50.6096257, lon=-2.4543424
Processed Oxford: lat=51.7520131, lon=-1.2578499
Processed Germany: lat=51.1638175, lon=10.4478313
Processed Hathaway: lat=38.8578662, lon=-81.0831674
Processed Paris: lat=48.8534951, lon=2.3483915
Processed Versailles: lat=48.803

Processed Scarborough: lat=54.2820009, lon=-0.4011868
Processed Heaven: lat=51.507865, lon=-0.1245597
Processed Scaurside - hill: lat=None, lon=None
Processed Helmsby: lat=39.2641309, lon=-76.7616005
Processed Lincolnshire: lat=53.1823034, lon=-0.2031209
Processed Baden: lat=48.0076713, lon=16.2343693
Processed Scotland: lat=56.7861112, lon=-4.1140518
Processed Glasgow: lat=55.861155, lon=-4.2501687
Processed England: lat=52.5310214, lon=-1.2649062
Processed Monkshaven: lat=None, lon=None
Processed Europe: lat=51.0, lon=10.0
Processed France: lat=46.603354, lon=1.8883335
Processed Portsmouth: lat=50.800031, lon=-1.0906023
Processed Plymouth: lat=50.3714122, lon=-4.1424451
Processed York: lat=53.9656579, lon=-1.0743052
Processed GREENLAND: lat=77.6192349, lon=-42.8125967
Processed Yorkshire: lat=53.9825271, lon=-1.38525
Processed Amerikay: lat=-21.0002179, lon=-61.0006565
Processed his Hollands - and - water: lat=None, lon=None
Processed Haytersbank: lat=None, lon=None
Processed Sheba: 

Processed Africa: lat=11.5024338, lon=17.7578122
Processed Henwick: lat=52.2059209, lon=-2.2451216
Processed Stratford: lat=52.1927803, lon=-1.70634
Processed Arracuoba: lat=None, lon=None
Processed Worcester: lat=42.2625621, lon=-71.8018877
Processed Mdme: lat=14.7030829, lon=-17.4470745
Processed London Kirkpatricks: lat=None, lon=None
Processed Switzerland: lat=46.7985624, lon=8.2319736
Processed Bishopsfield: lat=52.4044203, lon=-8.4719012
Processed BAY: lat=30.2481693, lon=-85.6593633
Processed November: lat=50.8766916, lon=7.1559314
Processed Feversham: lat=44.3400541, lon=-80.3760922
Processed Harrow: lat=51.5968272, lon=-0.3373046
Processed Cape Town: lat=-33.9288301, lon=18.4172197
Processed ADIEUX: lat=48.3743343, lon=2.6621678
Processed Birmingham: lat=52.4796992, lon=-1.9026911
Processed Atherstone: lat=52.5772439, lon=-1.5433954
Processed Carthage: lat=32.1570412, lon=-94.338114
Processed Edinburgh: lat=55.9533456, lon=-3.1883749
Processed Sussex Place: lat=35.863248, lon=

In [21]:
print(len(latitudes))
print(len(longitudes))
print(len(final_locations))

515
515
515


In [22]:
# Optionally, add latitudes and longitudes back to a DataFrame column for further use
df = pd.DataFrame({'Location': final_locations, 'Latitude': latitudes, 'Longitude': longitudes})

In [23]:
df

Unnamed: 0,Location,Latitude,Longitude
0,Hamley,-34.081740,137.599391
1,Eton,51.488538,-0.609174
2,London,51.507446,-0.127765
3,#¿NOMBRE?,9.582109,-79.468888
4,Paris,48.858890,2.320041
...,...,...,...
510,Atherstone,52.577244,-1.543395
511,Carthage,32.157041,-94.338114
512,Edinburgh,55.953346,-3.188375
513,Sussex Place,35.863248,-78.607858


In [24]:
titles = combined_df["Titles"].to_list()

In [25]:
df["Titles"] = titles

In [26]:
df

Unnamed: 0,Location,Latitude,Longitude,Titles
0,Hamley,-34.081740,137.599391,A Dark Nights Work
1,Eton,51.488538,-0.609174,A Dark Nights Work
2,London,51.507446,-0.127765,A Dark Nights Work
3,#¿NOMBRE?,9.582109,-79.468888,A Dark Nights Work
4,Paris,48.858890,2.320041,A Dark Nights Work
...,...,...,...,...
510,Atherstone,52.577244,-1.543395,Wives and Daughters
511,Carthage,32.157041,-94.338114,Wives and Daughters
512,Edinburgh,55.953346,-3.188375,Wives and Daughters
513,Sussex Place,35.863248,-78.607858,Wives and Daughters


In [28]:
df.to_csv("gaskell_coordinates.csv")