Now that we have understood how to extract location (GPE) entities from one book, let's do the same thing for all the books!

In [20]:
import pandas as pd 
import os

In [21]:
# List of input CSV file paths
#
# The paths are assembled with os.path.join instead of being written with a
# hard-coded "\" separator: a literal backslash is only a path separator on
# Windows, so the previous raw strings could not be opened (nor split by
# os.path.basename) on Linux or macOS. On Windows the resulting strings are
# unchanged.
input_dir = "Files"

entity_file_names = [
    "a_christmas_carol.entities.csv",
    "a_tale_of_two_cities.entities.csv",
    "barnaby_rudge.entities.csv",
    "bleak_house.entities.csv",
    "david_copperfield.entities.csv",
    "Dombey_and_son.entities.csv",
    "great_expectations.entities.csv",
    "hard_times.entities.csv",
    "little_dorrit.entities.csv",
    "martin_chuzzlewit.entities.csv",
    "nicholas_nickleby.entities.csv",
    "old_curiosity.entities.csv",
    "oliver_twist.entities.csv",
    "our_mutual_friend.entities.csv",
    "the_battle_of_life.entities.csv",
    "the_chimes.entities.csv",
    "the_cricket_on_the_heart.entities.csv",
    "the_haunted_man.entities.csv",
    "the_pickwick_papers.entities.csv",
]

# One path per book, in the same order as the file names above.
input_files = [os.path.join(input_dir, file_name) for file_name in entity_file_names]

In [22]:
# Folder that receives the generated CSV files.
# It is created on the first run and silently reused on later runs.
output_dir = "output_files"
os.makedirs(name=output_dir, exist_ok=True)

In [23]:
# Process each file and save results
def extract_unique_gpe(entities):
    """Return the unique proper-name GPE rows of one entities table.

    Parameters
    ----------
    entities : pandas.DataFrame
        Table read from one ``*.entities.csv`` file. The columns ``cat``,
        ``text``, ``prop`` and ``COREF`` are read here.

    Returns
    -------
    pandas.DataFrame
        The rows of ``entities`` that survive the four steps below, with all
        original columns and the original index labels.

    Notes
    -----
    Order matters: text de-duplication keeps the first GPE row per ``text``
    value, and only afterwards is the ``prop == "PROP"`` filter applied.
    """
    # Keep only rows whose category is GPE (reported below as locations).
    gpe_rows = entities[entities["cat"] == "GPE"]

    # One row per distinct surface text (first occurrence wins).
    distinct_texts = gpe_rows.drop_duplicates(subset=['text'])

    # Keep only the rows marked as PROP in the 'prop' column.
    proper_names = distinct_texts[distinct_texts["prop"] == "PROP"]

    # One row per COREF id, so differently spelled mentions that share an id
    # are counted once.
    return proper_names.drop_duplicates(subset=['COREF'])


for file_path in input_files:
    # Read the CSV file (semicolon-separated)
    data = pd.read_csv(file_path, delimiter=';')

    # Reduce the table to its unique GPE names
    unique_gpe_name = extract_unique_gpe(data)

    # Create output filename based on input file
    base_name = os.path.basename(file_path)
    output_name = base_name.replace(".entities.csv", "_gpe.csv")

    # Save the unique GPE names to CSV in the output folder
    unique_gpe_name.to_csv(os.path.join(output_dir, output_name), index=False)

    # Report the number of unique locations found in this book
    print(f"{output_name}: {len(unique_gpe_name)} unique locations")

a_christmas_carol_gpe.csv: 7 unique locations
a_tale_of_two_cities_gpe.csv: 59 unique locations
barnaby_rudge_gpe.csv: 72 unique locations
bleak_house_gpe.csv: 122 unique locations
david_copperfield_gpe.csv: 113 unique locations
Dombey_and_son_gpe.csv: 98 unique locations
great_expectations_gpe.csv: 75 unique locations
hard_times_gpe.csv: 34 unique locations
little_dorrit_gpe.csv: 127 unique locations
martin_chuzzlewit_gpe.csv: 88 unique locations
nicholas_nickleby_gpe.csv: 93 unique locations
old_curiosity_gpe.csv: 34 unique locations
oliver_twist_gpe.csv: 53 unique locations
our_mutual_friend_gpe.csv: 121 unique locations
the_battle_of_life_gpe.csv: 8 unique locations
the_chimes_gpe.csv: 5 unique locations
the_cricket_on_the_heart_gpe.csv: 6 unique locations
the_haunted_man_gpe.csv: 11 unique locations
the_pickwick_papers_gpe.csv: 101 unique locations


And now let's build one last DataFrame, to be saved as a CSV file, with the number of unique locations in each book!

In [24]:
# Display titles, one per book, in the same order in which the files were processed.
book = [
    "A Christmas Carol",
    "A Tale of Two Cities",
    "Barnaby Rudge",
    "Bleak House",
    "David Copperfield",
    "Dombey and Son",
    "Great Expectations",
    "Hard Times",
    "Little Dorrit",
    "Martin Chuzzlewit",
    "Nicholas Nickleby",
    "Old Curiosity",
    "Oliver Twist",
    "Our Mutual Friend",
    "The Battle of Life",
    "The Chimes",
    "The Cricket",
    "The Haunted Man",
    "The Pickwick Papers",
]

In [25]:
# Unique-location counts, copied from the per-book report printed by the
# processing loop; the order must match the list of titles.
number_of_locations = [
    7,    # a_christmas_carol
    59,   # a_tale_of_two_cities
    72,   # barnaby_rudge
    122,  # bleak_house
    113,  # david_copperfield
    98,   # Dombey_and_son
    75,   # great_expectations
    34,   # hard_times
    127,  # little_dorrit
    88,   # martin_chuzzlewit
    93,   # nicholas_nickleby
    34,   # old_curiosity
    53,   # oliver_twist
    121,  # our_mutual_friend
    8,    # the_battle_of_life
    5,    # the_chimes
    6,    # the_cricket_on_the_heart
    11,   # the_haunted_man
    101,  # the_pickwick_papers
]

In [26]:
# Sanity check: how many titles were listed (one is expected per processed file)
len(book)

19

In [27]:
# Sanity check: how many counts were listed (must equal the number of titles)
len(number_of_locations)

19

In [28]:
# Pair every title with its count: one row per book, two columns.
summary_columns = {
    "Titles": book,
    "Number of Locations": number_of_locations,
}
unique_number_of_locations = pd.DataFrame(data=summary_columns)

In [29]:
# Show the summary table (titles and their location counts) as the cell output
unique_number_of_locations

Unnamed: 0,Titles,Number of Locations
0,A Christmas Carol,7
1,A Tale of Two Cities,59
2,Barnaby Rudge,72
3,Bleak House,122
4,David Copperfield,113
5,Dombey and Son,98
6,Great Expectations,75
7,Hard Times,34
8,Little Dorrit,127
9,Martin Chuzzlewit,88


In [30]:
# Save the summary table. index=False keeps the automatic 0..n-1 row index out
# of the file; otherwise it is written as a header-less first column that
# comes back as "Unnamed: 0" when the CSV is reloaded. This also matches how
# the per-book files are written.
unique_number_of_locations.to_csv("dickens_locations.csv", index=False)