In [35]:
#pip install geopy

In [1]:
import pandas as pd
import os

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

In [6]:
# List of input CSV file paths

input_files = [
    r"output_files\a_connecticut_yankee_gpe.csv",
    r"output_files\a_horse_tale_gpe.csv",
    r"output_files\Personal_Recollections_of_Joan_of_Arc_gpe.csv",
    r"output_files\The_Adventures_of_Tom_Sawyer_gpe.csv", 
    r"output_files\The_American_Claimant_NER_gpe.csv",
    r"output_files\The_Gilded_Age_gpe.csv",
    r"output_files\The_Mysterious_Stranger_gpe.csv",
    r"output_files\The_Prince_and_the_Pauper_gpe.csv",
    r"output_files\The_Tragedy_of_Pudd’nhead_Wilson_gpe.csv",
    r"output_files\Tom_Sawyer_Abroad_gpe.csv", 
    r"output_files\Tom_Sawyer_Detective_gpe.csv"
]

In [8]:
output_dir = "filtered_gpe"
os.makedirs(output_dir, exist_ok=True)

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    df_b.to_csv(output_path, index=False)

So now we have all our GPEs saved in the directory "filtered gpe". Now let's combine all of them into a unique csv dataframe

In [9]:
all_dfs = []

for file_path in input_files:
    df = pd.read_csv(file_path)
    df_b = df.drop(["COREF", "start_token", "end_token", "prop", "cat"], axis=1)
    title = os.path.basename(file_path).replace("_gpe.csv", "")
    df_b["Titles"] = [title] * len(df_b)
    all_dfs.append(df_b)

# Concatenate all dataframes into one
combined_df = pd.concat(all_dfs, ignore_index=True)

In [11]:
combined_df

Unnamed: 0,text,Titles
0,England,a_connecticut_yankee
1,HARTFORD,a_connecticut_yankee
2,Whitsunday,a_connecticut_yankee
3,Camelot,a_connecticut_yankee
4,Arkansas,a_connecticut_yankee
...,...,...
570,Louisiana,Tom_Sawyer_Detective
571,St. Louis,Tom_Sawyer_Detective
572,Elexandria,Tom_Sawyer_Detective
573,Iowa,Tom_Sawyer_Detective


Now let's substitute those titles for ones that look better

In [15]:
# Example mapping dictionary
title_map = {
    "a_connecticut_yankee" : "A Connecticut Yankee",
    "a_horse_tale" : "A Horse Tale",
    "Personal_Recollections_of_Joan_of_Arc" : "Personal Recollections of Joan of Arc",
    "The_Adventures_of_Tom_Sawyer" : "The Adventures of Tom Sawyer",
    "The_American_Claimant" : "The American Claimant",
    "The_Gilded_Age" : "The Gilded Age", 
    "The_Mysterious_Stranger" : "The Mysterious Stranger",
    "The_Prince_and_the_Pauper" : "The Prince and the Pauper",
    "The_Tragedy_of_Pudd’nhead_Wilson" :  "The Tragedy of Pudd’nhead Wilson",
    "Tom_Sawyer_Abroad" : "Tom Sawyer Abroad", 
    "Tom_Sawyer_Detective" : "Tom Sawyer Detective"
}

# Replace the titles using .replace with the mapping
combined_df["Titles"] = combined_df["Titles"].replace(title_map)


In [16]:
combined_df

Unnamed: 0,text,Titles
0,England,A Connecticut Yankee
1,HARTFORD,A Connecticut Yankee
2,Whitsunday,A Connecticut Yankee
3,Camelot,A Connecticut Yankee
4,Arkansas,A Connecticut Yankee
...,...,...
570,Louisiana,Tom Sawyer Detective
571,St. Louis,Tom Sawyer Detective
572,Elexandria,Tom Sawyer Detective
573,Iowa,Tom Sawyer Detective


We are going to use the Python Library "Geopy" (https://geopy.readthedocs.io/en/stable/) to add the coordinates (LAT/LON) of the text column.

In [17]:
geolocator = Nominatim(user_agent="my_geo_app")

We can also see that geopy outputs the coordinates of the capital of "Great Britain" (London), if the name of a country is provided. That is great, as we will have many examples of that in our full data analysis. 

Let's now produce a csv dataframe with all the lattitudes and longitudes of our datafarme.

In [18]:
final_locations = combined_df["text"].to_list()

In [19]:
# Create empty lists to store results
latitudes = []
longitudes = []

for loc in final_locations:
    try:
        location = geolocator.geocode(loc)
        if location:
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)
        else:
            latitudes.append(None)
            longitudes.append(None)
        print(f"Processed {loc}: lat={latitudes[-1]}, lon={longitudes[-1]}")
    except Exception as e:
        latitudes.append(None)
        longitudes.append(None)
        print(f"Error on {loc}: {e}")
    time.sleep(1)

Processed England: lat=52.5310214, lon=-1.2649062
Processed HARTFORD: lat=41.764582, lon=-72.6908547
Processed Whitsunday: lat=-20.6939435, lon=147.7217804
Processed Camelot: lat=-33.9583333, lon=18.6666667
Processed Arkansas: lat=35.2048883, lon=-92.4479108
Processed Carlion: lat=35.8836337, lon=-78.9281394
Error on Egypt: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Egypt&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Processed India: lat=22.3511148, lon=78.6677428
Processed East Hartford: lat=41.767914, lon=-72.644512
Processed Land: lat=60.2340258, lon=9.8743057
Processed London: lat=51.5074456, lon=-0.1277653
Processed France: lat=46.603354, lon=1.8883335
Processed America: lat=39.7837304, lon=-100.445882
Processed Ireland: lat=52.865196, lon=-7.9794599
Processed Scotland: lat=56.7861112, lon=-4.1140518
Processed Go

Processed Dinant: lat=50.2591813, lon=4.9130588
Processed HARTFORD: lat=41.764582, lon=-72.6908547
Processed St. Petersburg: lat=59.938732, lon=30.316229
Processed England: lat=52.5310214, lon=-1.2649062
Processed Constantinople: lat=41.006381, lon=28.9758715
Processed America: lat=39.7837304, lon=-100.445882
Processed the United States: lat=47.8291375, lon=-122.5970742
Processed Rubbage: lat=50.9029697, lon=-1.5524802
Processed Coonville: lat=39.3984029, lon=-82.3393186
Processed Guisborne: lat=36.6930692, lon=-76.2575166
Processed Nottingham: lat=52.9534193, lon=-1.1496461
Processed Sherwood Forest: lat=53.2078695, lon=-1.0824054
Processed Hellum: lat=53.2386461, lon=6.8384474
Processed Port: lat=46.1635975, lon=5.5710285
Processed Glory: lat=31.3679749, lon=-83.1434854
Processed ALABAMA: lat=33.2588817, lon=-86.8295337
Processed Coosa: lat=32.9181762, lon=-86.2327497
Processed St. Louis: lat=38.6280278, lon=-90.1910154
Processed Europe: lat=51.0, lon=10.0
Processed Texas: lat=31.263

Processed Alton: lat=38.8908583, lon=-90.1843091
Processed Camden: lat=39.9448402, lon=-75.1198911
Processed Amboy: lat=43.887871, lon=-94.1585259
Processed Naples: lat=40.8358846, lon=14.2487679
Processed Greece: lat=38.9953683, lon=21.9877132
Processed Philadelphian: lat=39.9665585, lon=-75.177895
Processed Germantown: lat=39.1712341, lon=-77.2655648
Processed Tunkhannock: lat=41.5385159, lon=-75.946844
Processed Harrisburg: lat=40.2663107, lon=-76.8861122
Processed Magnolia: lat=30.2095594, lon=-95.7507897
Processed Louis: lat=0.4105973, lon=9.4328279
Processed Massachusetts: lat=42.3788774, lon=-72.032366
Error on Stone 's Landing: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Stone+%27s+Landing&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Processed Goose Run: lat=40.5410456, lon=-80.6366493
Processed Columbus: lat=

Processed Europe: lat=51.0, lon=10.0
Processed Yonder: lat=51.5583678, lon=5.0589003
Processed Windsor: lat=42.2858536, lon=-82.9780695
Processed Heathennesse: lat=None, lon=None
Processed Christian England: lat=42.3366323, lon=-71.1449366
Processed Scone: lat=56.4204776, lon=-3.39916
Processed Kingsale: lat=36.6956786, lon=-76.8125708
Processed Flanders: lat=51.0962462, lon=4.1786291
Processed Hamburg: lat=53.550341, lon=10.000654
Processed Huntingdon: lat=40.3446332, lon=-78.0281185
Processed Connecticut: lat=41.6500201, lon=-72.7342163
Processed New Haven: lat=41.3082138, lon=-72.9250518
Processed Papistry.--J. Heneage Jesse â€™s London: lat=None, lon=None
Processed London.--Ibid: lat=None, lon=None
Processed Anacreon: lat=48.8385103, lon=-1.6006345
Processed America: lat=39.7837304, lon=-100.445882
Processed southwest Missouri: lat=39.2974026, lon=-94.719359
Processed Florence: lat=43.7697955, lon=11.2556404
Processed Settignano: lat=43.7829916, lon=11.3208169
Processed Dawson 's L

In [20]:
print(len(latitudes))
print(len(longitudes))
print(len(final_locations))

575
575
575


In [21]:
# Optionally, add latitudes and longitudes back to a DataFrame column for further use
df = pd.DataFrame({'Location': final_locations, 'Latitude': latitudes, 'Longitude': longitudes})

In [22]:
df

Unnamed: 0,Location,Latitude,Longitude
0,England,52.531021,-1.264906
1,HARTFORD,41.764582,-72.690855
2,Whitsunday,-20.693943,147.721780
3,Camelot,-33.958333,18.666667
4,Arkansas,35.204888,-92.447911
...,...,...,...
570,Louisiana,30.870388,-92.007126
571,St. Louis,38.628028,-90.191015
572,Elexandria,,
573,Iowa,41.921673,-93.312270


In [23]:
titles = combined_df["Titles"].to_list()

In [24]:
df["Titles"] = titles

In [25]:
df

Unnamed: 0,Location,Latitude,Longitude,Titles
0,England,52.531021,-1.264906,A Connecticut Yankee
1,HARTFORD,41.764582,-72.690855,A Connecticut Yankee
2,Whitsunday,-20.693943,147.721780,A Connecticut Yankee
3,Camelot,-33.958333,18.666667,A Connecticut Yankee
4,Arkansas,35.204888,-92.447911,A Connecticut Yankee
...,...,...,...,...
570,Louisiana,30.870388,-92.007126,Tom Sawyer Detective
571,St. Louis,38.628028,-90.191015,Tom Sawyer Detective
572,Elexandria,,,Tom Sawyer Detective
573,Iowa,41.921673,-93.312270,Tom Sawyer Detective


In [27]:
df.to_csv("twain_coordinates.csv")