Now that we have seen how to extract person entities from a single book, let's do the same for all the books!

In [25]:
import pandas as pd 
import os

In [26]:
# Entity CSV files to process, one per book.
# Paths are assembled with os.path.join so the separator matches the host OS:
# a hard-coded backslash only works on Windows, and on other systems
# os.path.basename would not split the folder from the file name.
input_dir = "Files"

input_files = [
    os.path.join(input_dir, file_name)
    for file_name in (
        "a_dark_nights_work.entities.csv",
        "cranford.entities.csv",
        "mary_barton.entities.csv",
        "my_lady_ludlow.entities.csv",
        "north_and_south.entities.csv",
        "ruth.entities.csv",
        "sylvia_lovers.entities.csv",
        "wives_and_daughters.entities.csv",
    )
]

In [27]:
# Folder that receives the per-book "_person.csv" files.
# exist_ok=True makes this cell safe to re-run: an already existing folder is reused.
output_dir = "output_files"
os.makedirs(output_dir, exist_ok=True)

In [28]:
# For every entity file, write a "<book>_person.csv" with the de-duplicated person rows
for file_path in input_files:
    # The entity files use ';' as the field separator
    data = pd.read_csv(file_path, delimiter=';')

    # Keep rows whose 'cat' is 'PER', then one row per distinct 'text' value
    is_person = data["cat"] == "PER"
    person_mentions = data.loc[is_person].drop_duplicates(subset=['text'])

    # Of those, keep rows whose 'prop' is 'PROP', then one row per distinct 'COREF' value
    is_proper = person_mentions["prop"] == "PROP"
    unique_personal_name = person_mentions.loc[is_proper].drop_duplicates(subset=['COREF'])

    # "<book>.entities.csv" -> "<book>_person.csv", written into the output folder
    output_name = os.path.basename(file_path).replace(".entities.csv", "_person.csv")
    output_path = os.path.join(output_dir, output_name)
    unique_personal_name.to_csv(output_path, index=False)

    # Report how many rows were kept for this book
    print(f"{output_name}: {len(unique_personal_name)} unique characters")

a_dark_nights_work_person.csv: 141 unique characters
cranford_person.csv: 205 unique characters
mary_barton_person.csv: 290 unique characters
my_lady_ludlow_person.csv: 202 unique characters
north_and_south_person.csv: 311 unique characters
ruth_person.csv: 214 unique characters
sylvia_lovers_person.csv: 286 unique characters
wives_and_daughters_person.csv: 414 unique characters


Finally, let's build one last DataFrame with the number of unique characters found in each book and save it as a CSV file!

In [29]:
# Display titles, listed in the same order as the files in input_files
book = [
    "A Dark Nights Work",
    "Cranford",
    "Mary Barton",
    "My Lady Ludlow",
    "North and South",
    "Ruth",
    "Sylvia Lovers",
    "Wives and Daughters",
]

In [30]:
# Counts transcribed from the output printed by the processing loop, in the same order as `book`
number_of_characters = [
    141,  # A Dark Nights Work
    205,  # Cranford
    290,  # Mary Barton
    202,  # My Lady Ludlow
    311,  # North and South
    214,  # Ruth
    286,  # Sylvia Lovers
    414,  # Wives and Daughters
]

In [31]:
# Sanity check: number of titles (must equal len(number_of_characters) to build the DataFrame)
len(book)

8

In [32]:
# Sanity check: number of counts (must equal len(book) to build the DataFrame)
len(number_of_characters)

8

In [33]:
# Summary table: one row per book, pairing each title with its count
unique_number_of_characters = pd.DataFrame(
    {
        "Titles": book,
        "Number of Characters": number_of_characters,
    }
)

In [34]:
# Display the summary table
unique_number_of_characters

Unnamed: 0,Titles,Number of Characters
0,A Dark Nights Work,141
1,Cranford,205
2,Mary Barton,290
3,My Lady Ludlow,202
4,North and South,311
5,Ruth,214
6,Sylvia Lovers,286
7,Wives and Daughters,414


In [35]:
# Save the summary table. index=False leaves out the default 0..7 row index, which would
# otherwise be written as an extra unnamed first column; the per-book files are saved the same way.
unique_number_of_characters.to_csv("gaskell_characters.csv", index=False)