Now that we have seen how to extract person entities from a single book, let's do the same for all the books!

In [26]:
import pandas as pd 
import os

In [27]:
# Entity tables to process: one "<book>.entities.csv" file per novel, all stored
# in the same input folder.
input_dir = "Files"

book_slugs = [
    "a_son_at_the_front",
    "summer",
    "the_age_of_innocence",
    "the_children",
    "the_custom_of_the_country",
    "the_fruit_of_the_tree",
    "the_glimpses_of_the_moon",
    "the_house_of_mirth",
    "the_mother_recompense",
    "the_reef",
    "the_valley_of_decision",
    "twilight_sleep",
]

# Build the paths with os.path.join instead of hard-coding a backslash: the
# result is unchanged on Windows, and on Linux/macOS the files can now be found
# and os.path.basename() can split off the file name correctly.
input_files = [os.path.join(input_dir, f"{slug}.entities.csv") for slug in book_slugs]

In [28]:
# Folder that receives the per-book result files; it is created on first use
# and left untouched when it is already there.
output_dir = "output_files"
os.makedirs(output_dir, exist_ok=True)

In [29]:
# For every book: load its entity table, reduce it to the rows we count as
# distinct characters, write those rows to a CSV and report how many there are.
for file_path in input_files:
    # Load the semicolon-separated entity table of the current book
    data = pd.read_csv(file_path, sep=';')

    # Keep only the rows whose 'cat' value is 'PER'
    person = data.loc[data["cat"] == "PER"]

    # Keep the first row for each distinct 'text' value
    df = person.drop_duplicates(subset="text")

    # Of the remaining rows, keep those whose 'prop' value is 'PROP'
    personal_name = df.loc[df["prop"] == "PROP"]

    # Keep the first row for each distinct 'COREF' value
    unique_personal_name = personal_name.drop_duplicates(subset="COREF")

    # Derive the result name from the input: "<book>.entities.csv" -> "<book>_person.csv"
    base_name = os.path.basename(file_path)
    output_name = base_name.replace(".entities.csv", "_person.csv")

    # Write the rows (without the DataFrame index) into the output folder
    output_path = os.path.join(output_dir, output_name)
    unique_personal_name.to_csv(output_path, index=False)

    # Report the number of rows kept for this book
    print(f"{output_name}: {unique_personal_name.shape[0]} unique characters")

a_son_at_the_front_person.csv: 187 unique characters
summer_person.csv: 79 unique characters
the_age_of_innocence_person.csv: 298 unique characters
the_children_person.csv: 156 unique characters
the_custom_of_the_country_person.csv: 229 unique characters
the_fruit_of_the_tree_person.csv: 157 unique characters
the_glimpses_of_the_moon_person.csv: 164 unique characters
the_house_of_mirth_person.csv: 198 unique characters
the_mother_recompense_person.csv: 152 unique characters
the_reef_person.csv: 97 unique characters
the_valley_of_decision_person.csv: 325 unique characters
twilight_sleep_person.csv: 164 unique characters


Finally, let's build one last DataFrame with the number of unique characters found in each book, and save it as a CSV file!

In [30]:
# Display titles of the novels, listed in the same order as `input_files` so
# that each title lines up with the count obtained for that book.
book = [
    "A Son at the Front",
    "Summer",
    "The Age of Innocence",
    "The Children",
    "The Custom of the Country",
    "The Fruit of the Tree",
    "The Glimpses of the Moon",
    "The House of Mirth",
    "The Mother's Recompense",  # title corrected: it was missing the possessive "'s"
    "The Reef",
    "The Valley of Decision",
    "Twilight Sleep",
]

In [31]:
# Counts printed by the processing loop above, copied by hand in the same order
# as the titles in `book`. Update them if the loop is re-run on different data.
number_of_characters = [
    187,  # A Son at the Front
    79,   # Summer
    298,  # The Age of Innocence
    156,  # The Children
    229,  # The Custom of the Country
    157,  # The Fruit of the Tree
    164,  # The Glimpses of the Moon
    198,  # The House of Mirth
    152,  # The Mother Recompense
    97,   # The Reef
    325,  # The Valley of Decision
    164,  # Twilight Sleep
]

In [32]:
# Sanity check: number of titles. It must equal the number of counts, because
# both lists become columns of the same DataFrame.
len(book)

12

In [33]:
# Sanity check: number of counts. It must equal the number of titles, because
# both lists become columns of the same DataFrame.
len(number_of_characters)

12

In [34]:
# Summary table: one row per book, pairing each title with its character count
unique_number_of_characters = pd.DataFrame(
    data={
        "Titles": book,
        "Number of Characters": number_of_characters,
    }
)

In [35]:
# Display the summary table to check it before it is saved
unique_number_of_characters

Unnamed: 0,Titles,Number of Characters
0,A Son at the Front,187
1,Summer,79
2,The Age of Innocence,298
3,The Children,156
4,The Custom of the Country,229
5,The Fruit of the Tree,157
6,The Glimpses of the Moon,164
7,The House of Mirth,198
8,The Mother Recompense,152
9,The Reef,97


In [36]:
# Save the summary table. index=False keeps the automatically generated row
# numbers out of the file (otherwise they are written as a nameless first column
# that shows up as "Unnamed: 0" when the CSV is read back), which matches how
# the per-book files are written.
unique_number_of_characters.to_csv("wharton_characters.csv", index=False)