Now that we understand how to extract entities from a single book, let's do the same thing for the location (GPE) entities in all the books!

In [1]:
import pandas as pd 
import os

In [7]:
# List of input CSV file paths.
# The paths are assembled with os.path.join so the separator matches the host
# OS: a hard-coded backslash is only a path separator on Windows, and on other
# systems os.path.basename would not strip the "Files\" prefix further down.
input_files = [
    os.path.join("Files", file_name)
    for file_name in (
        "a_dark_nights_work.entities.csv",
        "cranford.entities.csv",
        "mary_barton.entities.csv",
        "my_lady_ludlow.entities.csv",
        "north_and_south.entities.csv",
        "ruth.entities.csv",
        "sylvia_lovers.entities.csv",
        "wives_and_daughters.entities.csv",
    )
]

In [8]:
# Folder that receives the result files; it is created when missing and
# no error is raised when it is already there
output_dir = "output_files"
os.makedirs(name=output_dir, exist_ok=True)

In [9]:
# Reduce every book's entity table to its distinct proper-name GPE rows,
# write the result to the output folder and report how many rows remain
for file_path in input_files:
    # The entity files are semicolon-separated. The steps below run in this
    # exact order:
    #   1. keep rows whose 'cat' is 'GPE'
    #   2. keep the first row for each distinct 'text'
    #   3. keep rows whose 'prop' is 'PROP'
    #   4. keep the first row for each distinct 'COREF'
    # NOTE(review): step 2 runs before step 3, so a text whose first GPE row
    # is not 'PROP' is dropped entirely - confirm this order is intended.
    unique_gpe_name = (
        pd.read_csv(file_path, sep=";")
        .loc[lambda frame: frame["cat"] == "GPE"]
        .drop_duplicates(subset="text")
        .loc[lambda frame: frame["prop"] == "PROP"]
        .drop_duplicates(subset="COREF")
    )

    # Name the output after the input: "<book>.entities.csv" -> "<book>_gpe.csv"
    output_name = os.path.basename(file_path).replace(".entities.csv", "_gpe.csv")

    # Store the remaining rows in the output folder, without the row index
    unique_gpe_name.to_csv(os.path.join(output_dir, output_name), index=False)

    # Report the number of rows kept for this book
    print(f"{output_name}: {len(unique_gpe_name)} unique locations")

a_dark_nights_work_gpe.csv: 38 unique locations
cranford_gpe.csv: 51 unique locations
mary_barton_gpe.csv: 78 unique locations
my_lady_ludlow_gpe.csv: 57 unique locations
north_and_south_gpe.csv: 65 unique locations
ruth_gpe.csv: 47 unique locations
sylvia_lovers_gpe.csv: 91 unique locations
wives_and_daughters_gpe.csv: 88 unique locations


Finally, let's build one last DataFrame (saved as a CSV file) with the number of unique locations in each book!

In [10]:
# Book titles, listed in the same order as the entries of input_files
book = [
    "A Dark Nights Work",
    "Cranford",
    "Mary Barton",
    "My Lady Ludlow",
    "North and South",
    "Ruth",
    "Sylvia Lovers",
    "Wives and Daughters",
]

In [11]:
# Counts copied by hand from the output printed by the per-book loop, in the
# same order as the titles; update them whenever that loop is re-run
number_of_locations = [
    38,  # a_dark_nights_work
    51,  # cranford
    78,  # mary_barton
    57,  # my_lady_ludlow
    65,  # north_and_south
    47,  # ruth
    91,  # sylvia_lovers
    88,  # wives_and_daughters
]

In [12]:
# Sanity check: number of titles (should equal the number of counts)
len(book)

8

In [13]:
# Sanity check: number of counts (should equal the number of titles)
len(number_of_locations)

8

In [14]:
# Summary table: one row per book, pairing each title with its location count
unique_number_of_locations = pd.DataFrame(
    {
        "Titles": book,
        "Number of Locations": number_of_locations,
    }
)

In [15]:
# Display the summary table to check it before it is saved
unique_number_of_locations

Unnamed: 0,Titles,Number of Locations
0,A Dark Nights Work,38
1,Cranford,51
2,Mary Barton,78
3,My Lady Ludlow,57
4,North and South,65
5,Ruth,47
6,Sylvia Lovers,91
7,Wives and Daughters,88


In [16]:
# Save the summary table. index=False leaves out the automatic 0..n-1 row
# index, so the file has no unnamed first column and follows the same
# convention as the per-book CSV files written earlier in this notebook.
unique_number_of_locations.to_csv("gaskell_locations.csv", index=False)