Now that we understand how to extract person entities from a single book, let's do the same for all the books!

In [32]:
import pandas as pd 
import os

In [33]:
# List of input CSV file paths (one entity table per book).
#
# The paths are assembled with os.path.join rather than hard-coded backslashes:
# on Windows the resulting strings are the same as before, while on Linux/macOS
# a backslash is an ordinary filename character, so a literal "Files\..." path
# would not point inside the Files directory and os.path.basename could not
# strip the directory prefix when the output names are derived later on.
#
# NOTE(review): "the_cricket_on_the_heart" looks like a typo for "hearth", but
# the name has to match the file on disk, so it is left unchanged - confirm.

input_files = [
    os.path.join("Files", file_name)
    for file_name in (
        "a_christmas_carol.entities.csv",
        "a_tale_of_two_cities.entities.csv",
        "barnaby_rudge.entities.csv",
        "bleak_house.entities.csv",
        "david_copperfield.entities.csv",
        "Dombey_and_son.entities.csv",
        "great_expectations.entities.csv",
        "hard_times.entities.csv",
        "little_dorrit.entities.csv",
        "martin_chuzzlewit.entities.csv",
        "nicholas_nickleby.entities.csv",
        "old_curiosity.entities.csv",
        "oliver_twist.entities.csv",
        "our_mutual_friend.entities.csv",
        "the_battle_of_life.entities.csv",
        "the_chimes.entities.csv",
        "the_cricket_on_the_heart.entities.csv",
        "the_haunted_man.entities.csv",
        "the_pickwick_papers.entities.csv",
    )
]

In [34]:
# Folder that receives the per-book result files; it is created when missing
# and an already existing folder is accepted as-is.
output_dir = "output_files"
os.makedirs(name=output_dir, exist_ok=True)

In [35]:
# Build one "<book>_person.csv" per input file and report how many rows it holds
for file_path in input_files:
    # Load the semicolon-separated entity table for this book
    data = pd.read_csv(file_path, sep=';')

    # Same four steps as a single pipeline; each drop_duplicates keeps the
    # first row it encounters for a given value.
    unique_personal_name = (
        data
        .loc[lambda frame: frame["cat"] == "PER"]    # rows whose 'cat' is 'PER'
        .drop_duplicates(subset=["text"])             # one row per distinct 'text'
        .loc[lambda frame: frame["prop"] == "PROP"]  # rows whose 'prop' is 'PROP'
        .drop_duplicates(subset=["COREF"])            # one row per distinct 'COREF'
    )

    # "<name>.entities.csv" becomes "<name>_person.csv" inside the output folder
    base_name = os.path.basename(file_path)
    output_name = base_name.replace(".entities.csv", "_person.csv")
    output_path = os.path.join(output_dir, output_name)

    # Write the result without the DataFrame index
    unique_personal_name.to_csv(output_path, index=False)

    # Optional: report the number of rows kept for this book
    print(f"{output_name}: {len(unique_personal_name)} unique characters")

a_christmas_carol_person.csv: 68 unique characters
a_tale_of_two_cities_person.csv: 160 unique characters
barnaby_rudge_person.csv: 246 unique characters
bleak_house_person.csv: 444 unique characters
david_copperfield_person.csv: 442 unique characters
Dombey_and_son_person.csv: 481 unique characters
great_expectations_person.csv: 278 unique characters
hard_times_person.csv: 162 unique characters
little_dorrit_person.csv: 467 unique characters
martin_chuzzlewit_person.csv: 476 unique characters
nicholas_nickleby_person.csv: 422 unique characters
old_curiosity_person.csv: 237 unique characters
oliver_twist_person.csv: 172 unique characters
our_mutual_friend_person.csv: 574 unique characters
the_battle_of_life_person.csv: 69 unique characters
the_chimes_person.csv: 63 unique characters
the_cricket_on_the_heart_person.csv: 64 unique characters
the_haunted_man_person.csv: 63 unique characters
the_pickwick_papers_person.csv: 525 unique characters


Finally, let's build one last DataFrame with the number of unique characters found in each book and save it as a CSV file!

In [36]:
# Display titles, one per book, in the same order as the counts printed above
book = [
    "A Christmas Carol",
    "A Tale of Two Cities",
    "Barnaby Rudge",
    "Bleak House",
    "David Copperfield",
    "Dombey and Son",
    "Great Expectations",
    "Hard Times",
    "Little Dorrit",
    "Martin Chuzzlewit",
    "Nicholas Nickleby",
    "Old Curiosity",
    "Oliver Twist",
    "Our Mutual Friend",
    "The Battle of Life",
    "The Chimes",
    "The Cricket",
    "The Haunted Man",
    "The Pickwick Papers",
]

In [37]:
# Counts copied from the per-book output printed by the processing loop,
# listed in the same order as the input files.
number_of_characters = [
    68,   # a_christmas_carol
    160,  # a_tale_of_two_cities
    246,  # barnaby_rudge
    444,  # bleak_house
    442,  # david_copperfield
    481,  # Dombey_and_son
    278,  # great_expectations
    162,  # hard_times
    467,  # little_dorrit
    476,  # martin_chuzzlewit
    422,  # nicholas_nickleby
    237,  # old_curiosity
    172,  # oliver_twist
    574,  # our_mutual_friend
    69,   # the_battle_of_life
    63,   # the_chimes
    64,   # the_cricket_on_the_heart
    63,   # the_haunted_man
    525,  # the_pickwick_papers
]

In [38]:
# Sanity check: number of titles (must equal the number of counts below)
len(book)

19

In [39]:
# Sanity check: number of counts (must equal the number of titles above)
len(number_of_characters)

19

In [40]:
# Two-column summary table: one row per book, pairing each title with its count
unique_number_of_characters = pd.DataFrame(
    data={
        "Titles": book,
        "Number of Characters": number_of_characters,
    }
)

In [41]:
# Display the summary table as the cell output
unique_number_of_characters

Unnamed: 0,Titles,Number of Characters
0,A Christmas Carol,68
1,A Tale of Two Cities,160
2,Barnaby Rudge,246
3,Bleak House,444
4,David Copperfield,442
5,Dombey and Son,481
6,Great Expectations,278
7,Hard Times,162
8,Little Dorrit,467
9,Martin Chuzzlewit,476


In [42]:
# Save the summary table to the working directory.
# index=False keeps the default 0..n-1 row index out of the file: it carries no
# information, would come back as an "Unnamed: 0" column when the CSV is read
# again, and the per-book files are likewise written without an index.
unique_number_of_characters.to_csv("dickens_characters.csv", index=False)