Now that we have seen how to extract person entities from a single book, let's do the same for all the books!

In [25]:
import pandas as pd 
import os

In [26]:
# Entity CSV files to process, one per book.
# The paths are built with os.path.join instead of hard-coded backslashes so
# that the notebook runs on Windows, macOS and Linux alike: on POSIX systems
# "\" is not a path separator, so r"Files\book.csv" would be treated as a
# single file name and os.path.basename would return the whole string.
input_dir = "Files"

input_files = [
    os.path.join(input_dir, file_name)
    for file_name in (
        "a_connecticut_yankee.entities.csv",
        "a_horse_tale.entities.csv",
        "Personal_Recollections_of_Joan_of_Arc.entities.csv",
        "The_Adventures_of_Tom_Sawyer.entities.csv",
        "The_American_Claimant_NER.entities.csv",
        "The_Gilded_Age.entities.csv",
        "The_Mysterious_Stranger.entities.csv",
        "The_Prince_and_the_Pauper.entities.csv",
        "The_Tragedy_of_Pudd’nhead_Wilson.entities.csv",
        "Tom_Sawyer_Abroad.entities.csv",
        "Tom_Sawyer_Detective.entities.csv",
    )
]

In [27]:
# Folder that receives the per-book result files.
# exist_ok=True makes re-running this cell harmless when the folder is already there.
output_dir = "output_files"
os.makedirs(name=output_dir, exist_ok=True)

In [28]:
# Extract the unique named persons from every book and save one CSV per book.

# Columns the filters below rely on; checked up front so that a wrong delimiter
# or an unexpected file layout fails with a clear message.
required_columns = ["cat", "text", "prop", "COREF"]

for file_path in input_files:
    # Paths may be written with Windows backslashes. Forward slashes are accepted
    # on both Windows and POSIX, so normalising here keeps the notebook portable
    # (on POSIX a backslash is part of the file name, which would break both the
    # read below and os.path.basename).
    portable_path = os.fspath(file_path).replace("\\", "/")

    # Read the semicolon-delimited entity file
    data = pd.read_csv(portable_path, delimiter=';')

    missing_columns = [column for column in required_columns if column not in data.columns]
    if missing_columns:
        raise KeyError(f"{portable_path} is missing expected column(s): {missing_columns}")

    # Keep only the person entities (rows where 'cat' is 'PER')
    person = data[data["cat"] == "PER"]

    # Keep the first occurrence of each distinct 'text' value
    df = person.drop_duplicates(subset=['text'])

    # Keep rows where 'prop' is 'PROP' (presumably proper names - TODO confirm)
    personal_name = df[df["prop"] == "PROP"]

    # Keep one row per 'COREF' value (presumably one per coreference chain,
    # i.e. per character - TODO confirm)
    unique_personal_name = personal_name.drop_duplicates(subset=['COREF'])

    # Derive the output file name from the input file name
    base_name = os.path.basename(portable_path)
    output_name = base_name.replace(".entities.csv", "_person.csv")

    # Save the unique personal names to CSV in the output folder
    unique_personal_name.to_csv(os.path.join(output_dir, output_name), index=False)

    # Report how many unique characters were found for this book
    print(f"{output_name}: {len(unique_personal_name)} unique characters")

a_connecticut_yankee_person.csv: 227 unique characters
a_horse_tale_person.csv: 66 unique characters
Personal_Recollections_of_Joan_of_Arc_person.csv: 155 unique characters
The_Adventures_of_Tom_Sawyer_person.csv: 132 unique characters
The_American_Claimant_NER_person.csv: 169 unique characters
The_Gilded_Age_person.csv: 422 unique characters
The_Mysterious_Stranger_person.csv: 56 unique characters
The_Prince_and_the_Pauper_person.csv: 149 unique characters
The_Tragedy_of_Pudd’nhead_Wilson_person.csv: 154 unique characters
Tom_Sawyer_Abroad_person.csv: 70 unique characters
Tom_Sawyer_Detective_person.csv: 60 unique characters


Finally, let's build a summary DataFrame with the number of unique characters in each book and save it as a CSV file!

In [29]:
# Display titles of the books, in the same order as the files in `input_files`
book = [
    "A Connecticut Yankee",
    "A Horse Tale",
    "Personal Recollections of Joan of Arc",
    "The Adventures of Tom Sawyer",
    "The American Claimant",
    "The Gilded Age",
    "The Mysterious Stranger",
    "The Prince and the Pauper",
    "The Tragedy of Pudd’nhead Wilson",
    "Tom Sawyer Abroad",
    "Tom Sawyer Detective",
]

In [30]:
# Unique-character counts copied from the output printed by the processing loop,
# in the same order as the titles in `book`
number_of_characters = [
    227,  # A Connecticut Yankee
    66,   # A Horse Tale
    155,  # Personal Recollections of Joan of Arc
    132,  # The Adventures of Tom Sawyer
    169,  # The American Claimant
    422,  # The Gilded Age
    56,   # The Mysterious Stranger
    149,  # The Prince and the Pauper
    154,  # The Tragedy of Pudd’nhead Wilson
    70,   # Tom Sawyer Abroad
    60,   # Tom Sawyer Detective
]

In [31]:
# Sanity check: number of titles (must equal the number of counts to build the table)
len(book)

11

In [32]:
# Sanity check: number of counts (must equal the number of titles to build the table)
len(number_of_characters)

11

In [33]:
# Combine the titles and their counts into a two-column summary table
summary_columns = {
    "Titles": book,
    "Number of Characters": number_of_characters,
}
unique_number_of_characters = pd.DataFrame(summary_columns)

In [34]:
# Display the summary table
unique_number_of_characters

Unnamed: 0,Titles,Number of Characters
0,A Connecticut Yankee,227
1,A Horse Tale,66
2,Personal Recollections of Joan of Arc,155
3,The Adventures of Tom Sawyer,132
4,The American Claimant,169
5,The Gilded Age,422
6,The Mysterious Stranger,56
7,The Prince and the Pauper,149
8,The Tragedy of Pudd’nhead Wilson,154
9,Tom Sawyer Abroad,70


In [35]:
# Save the summary table to CSV.
# index=False leaves out the automatic 0..n-1 row index, which would otherwise be
# written as an extra unnamed first column (it shows up as "Unnamed: 0" when the
# file is read back); this also matches how the per-book files are saved.
unique_number_of_characters.to_csv("twain_characters.csv", index=False)