Now that we have seen how to extract location (GPE) entities from a single book, let's do the same for all the books!

In [18]:
import pandas as pd 
import os

In [19]:
# Folder holding the per-book entity tables, and the file name of each table.
# The paths are assembled with os.path.join so that the folder separator is
# the one used by the operating system running the notebook; a hard-coded "\"
# is only treated as a separator on Windows.
input_dir = "Files"

input_files = [
    os.path.join(input_dir, file_name)
    for file_name in (
        "a_connecticut_yankee.entities.csv",
        "a_horse_tale.entities.csv",
        "Personal_Recollections_of_Joan_of_Arc.entities.csv",
        "The_Adventures_of_Tom_Sawyer.entities.csv",
        "The_American_Claimant_NER.entities.csv",
        "The_Gilded_Age.entities.csv",
        "The_Mysterious_Stranger.entities.csv",
        "The_Prince_and_the_Pauper.entities.csv",
        "The_Tragedy_of_Pudd’nhead_Wilson.entities.csv",
        "Tom_Sawyer_Abroad.entities.csv",
        "Tom_Sawyer_Detective.entities.csv",
    )
]

In [20]:
# Folder that receives one filtered CSV per book.
output_dir = "output_files"
# exist_ok=True keeps a re-run of this cell from failing when the folder is
# already present.
os.makedirs(name=output_dir, exist_ok=True)

In [22]:
def _unique_proper_gpe(entities):
    """Reduce an entities table to its unique proper-name GPE rows.

    Parameters
    ----------
    entities : pandas.DataFrame
        Table containing at least the columns 'cat', 'text', 'prop' and 'COREF'.

    Returns
    -------
    pandas.DataFrame
        Rows obtained by applying, in this order: keep 'cat' == 'GPE', keep the
        first row per 'text' value, keep 'prop' == 'PROP', keep the first row
        per 'COREF' value.
    """
    gpe_rows = entities[entities["cat"] == "GPE"]

    # The 'text' de-duplication runs before the 'PROP' filter; keeping this
    # order keeps the per-book counts unchanged.
    # NOTE(review): with this order, a spelling whose first occurrence is not
    # tagged 'PROP' is dropped entirely, even if a later occurrence is --
    # confirm that this is intended.
    first_per_text = gpe_rows.drop_duplicates(subset=['text'])
    proper_names = first_per_text[first_per_text["prop"] == "PROP"]

    # Rows sharing a 'COREF' value are collapsed to the first one.
    return proper_names.drop_duplicates(subset=['COREF'])


# Run the extraction on every book and write one result file per book.
for file_path in input_files:
    # The entity tables are semicolon-separated.
    data = pd.read_csv(file_path, delimiter=';')

    unique_gpe_name = _unique_proper_gpe(data)

    # "<book>.entities.csv" becomes "<book>_gpe.csv" inside the output folder.
    base_name = os.path.basename(file_path)
    output_name = base_name.replace(".entities.csv", "_gpe.csv")
    unique_gpe_name.to_csv(os.path.join(output_dir, output_name), index=False)

    # Report how many unique locations were kept for this book.
    print(f"{output_name}: {len(unique_gpe_name)} unique locations")

a_connecticut_yankee_gpe.csv: 62 unique locations
a_horse_tale_gpe.csv: 26 unique locations
Personal_Recollections_of_Joan_of_Arc_gpe.csv: 69 unique locations
The_Adventures_of_Tom_Sawyer_gpe.csv: 22 unique locations
The_American_Claimant_NER_gpe.csv: 57 unique locations
The_Gilded_Age_gpe.csv: 187 unique locations
The_Mysterious_Stranger_gpe.csv: 14 unique locations
The_Prince_and_the_Pauper_gpe.csv: 47 unique locations
The_Tragedy_of_Pudd’nhead_Wilson_gpe.csv: 43 unique locations
Tom_Sawyer_Abroad_gpe.csv: 40 unique locations
Tom_Sawyer_Detective_gpe.csv: 8 unique locations


And now let's build one last DataFrame with the number of unique locations per book and save it as a CSV!

In [23]:
# Display titles, listed in the same order as the input files were processed.
book = [
    "A Connecticut Yankee",
    "A Horse Tale",
    "Personal Recollections of Joan of Arc",
    "The Adventures of Tom Sawyer",
    "The American Claimant",
    "The Gilded Age",
    "The Mysterious Stranger",
    "The Prince and the Pauper",
    "The Tragedy of Pudd’nhead Wilson",
    "Tom Sawyer Abroad",
    "Tom Sawyer Detective",
]

In [24]:
# Counts copied by hand from the printed output of the extraction loop,
# in the same order as the titles in `book`.
number_of_locations = [
    62,   # A Connecticut Yankee
    26,   # A Horse Tale
    69,   # Personal Recollections of Joan of Arc
    22,   # The Adventures of Tom Sawyer
    57,   # The American Claimant
    187,  # The Gilded Age
    14,   # The Mysterious Stranger
    47,   # The Prince and the Pauper
    43,   # The Tragedy of Pudd’nhead Wilson
    40,   # Tom Sawyer Abroad
    8,    # Tom Sawyer Detective
]

In [25]:
# Sanity check: the title list should hold one entry per processed book.
len(book)

11

In [26]:
# Sanity check: this must equal len(book); pandas refuses to build a DataFrame
# from columns of different lengths.
len(number_of_locations)

11

In [27]:
# Pair every title with its location count; the dict keys become the column
# headers of the resulting table.
unique_number_of_locations = pd.DataFrame(
    data={
        "Titles": book,
        "Number of Locations": number_of_locations,
    }
)

In [28]:
# Show the assembled table as the cell output.
unique_number_of_locations

Unnamed: 0,Titles,Number of Locations
0,A Connecticut Yankee,62
1,A Horse Tale,26
2,Personal Recollections of Joan of Arc,69
3,The Adventures of Tom Sawyer,22
4,The American Claimant,57
5,The Gilded Age,187
6,The Mysterious Stranger,14
7,The Prince and the Pauper,47
8,The Tragedy of Pudd’nhead Wilson,43
9,Tom Sawyer Abroad,40


In [29]:
# Save the summary table. index=False leaves out the positional row index,
# which would otherwise be written as an extra, unnamed first column (read back
# by pandas as "Unnamed: 0"); this also matches how the per-book files are saved.
unique_number_of_locations.to_csv("twain_locations.csv", index=False)