Now that we have understood how to extract location (GPE) entities from one book, let's do the same thing for all the books!

In [8]:
import pandas as pd 
import os

In [9]:
# Show the current working directory: the relative "Files" input paths and the
# "output_files" folder used in the following cells are resolved against it.
# os.getcwd() is plain Python, so it does not depend on IPython's automagic
# (which only resolves a bare `pwd` in a single-line cell and is shadowed by
# any variable that happens to be named `pwd`).
os.getcwd()

'C:\\Users\\usuario\\ELENA\\it-training uzh\\it-training uzh\\Python Data Analytics Essentials\\Edith Warton\\entities\\GPE'

In [10]:
# Entity tables to process, one CSV per novel, all stored in the "Files"
# folder (relative to the current working directory).
# The paths are assembled with os.path.join so the separator always matches
# the operating system: hard-coded "\" separators only resolve on Windows and
# also prevent os.path.basename from isolating the file name elsewhere.
input_dir = "Files"

input_files = [
    os.path.join(input_dir, file_name)
    for file_name in (
        "a_son_at_the_front.entities.csv",
        "summer.entities.csv",
        "the_age_of_innocence.entities.csv",
        "the_children.entities.csv",
        "the_custom_of_the_country.entities.csv",
        "the_fruit_of_the_tree.entities.csv",
        "the_glimpses_of_the_moon.entities.csv",
        "the_house_of_mirth.entities.csv",
        "the_mother_recompense.entities.csv",
        "the_reef.entities.csv",
        "the_valley_of_decision.entities.csv",
        "twilight_sleep.entities.csv",
    )
]

In [11]:
# Folder that receives the per-book result files. It is created on the first
# run; exist_ok=True makes re-running this cell a no-op when it already exists.
output_dir = "output_files"
os.makedirs(name=output_dir, exist_ok=True)

In [12]:
# For every book: load its entity table, reduce it to the distinct GPE rows,
# write them to "<book>_gpe.csv" in the output folder and report the count.
for file_path in input_files:
    # The entity tables are semicolon-separated
    data = pd.read_csv(file_path, delimiter=';')

    # One chain, applied in this exact order:
    #   1. keep rows whose 'cat' is 'GPE'
    #      (presumably geo-political entities, i.e. places - confirm against the tagger docs)
    #   2. keep the first row for each distinct 'text'
    #   3. keep rows whose 'prop' is 'PROP' (presumably proper-name mentions - confirm)
    #   4. keep the first row for each distinct 'COREF' value
    unique_gpe_name = (
        data.loc[data["cat"] == "GPE"]
        .drop_duplicates(subset=['text'])
        .loc[lambda rows: rows["prop"] == "PROP"]
        .drop_duplicates(subset=['COREF'])
    )

    # "<book>.entities.csv" -> "<book>_gpe.csv"
    output_name = os.path.basename(file_path).replace(".entities.csv", "_gpe.csv")

    # Save without the row index so the file holds only the entity columns
    unique_gpe_name.to_csv(os.path.join(output_dir, output_name), index=False)

    # Report how many rows were kept for this book
    print(f"{output_name}: {len(unique_gpe_name)} unique locations")

a_son_at_the_front_gpe.csv: 113 unique locations
summer_gpe.csv: 26 unique locations
the_age_of_innocence_gpe.csv: 118 unique locations
the_children_gpe.csv: 77 unique locations
the_custom_of_the_country_gpe.csv: 106 unique locations
the_fruit_of_the_tree_gpe.csv: 56 unique locations
the_glimpses_of_the_moon_gpe.csv: 97 unique locations
the_house_of_mirth_gpe.csv: 53 unique locations
the_mother_recompense_gpe.csv: 54 unique locations
the_reef_gpe.csv: 45 unique locations
the_valley_of_decision_gpe.csv: 145 unique locations
twilight_sleep_gpe.csv: 68 unique locations


And now let's build a final DataFrame with the number of unique locations per book and save it as a CSV!

In [13]:
# Display titles for the summary table, listed in the same order in which the
# input files were processed.
# NOTE(review): Wharton's novel is titled "The Mother's Recompense"; the label
# below follows the input file name - confirm which spelling is wanted.
book = [
    "A Son at the Front",
    "Summer",
    "The Age of Innocence",
    "The Children",
    "The Custom of the Country",
    "The Fruit of the Tree",
    "The Glimpses of the Moon",
    "The House of Mirth",
    "The Mother Recompense",
    "The Reef",
    "The Valley of Decision",
    "Twilight Sleep",
]

In [15]:
# Unique-location counts per book, copied by hand from the output printed by
# the processing loop and kept in the same order as `book`. They have to be
# updated manually if the extraction is re-run on different data.
number_of_locations = [
    113,  # A Son at the Front
    26,   # Summer
    118,  # The Age of Innocence
    77,   # The Children
    106,  # The Custom of the Country
    56,   # The Fruit of the Tree
    97,   # The Glimpses of the Moon
    53,   # The House of Mirth
    54,   # The Mother Recompense
    45,   # The Reef
    145,  # The Valley of Decision
    68,   # Twilight Sleep
]

In [16]:
# Sanity check: `book` and `number_of_locations` become the two columns of one
# DataFrame further down, so both lists must have the same length.
len(book)

12

In [17]:
# Must equal len(book): each count is paired with the title at the same position.
len(number_of_locations)

12

In [18]:
# Pair every title with its location count; the dict keys become the column
# headers of the summary table.
summary_columns = {
    "Titles": book,
    "Number of Locations": number_of_locations,
}
unique_number_of_locations = pd.DataFrame(summary_columns)

In [19]:
# Display the summary table (one row per book) to inspect it before exporting.
unique_number_of_locations

Unnamed: 0,Titles,Number of Locations
0,A Son at the Front,113
1,Summer,26
2,The Age of Innocence,118
3,The Children,77
4,The Custom of the Country,106
5,The Fruit of the Tree,56
6,The Glimpses of the Moon,97
7,The House of Mirth,53
8,The Mother Recompense,54
9,The Reef,45


In [20]:
# Export the summary table to the working directory. index=False keeps pandas'
# automatic 0..n-1 row index out of the file; otherwise it is written as an
# extra, header-less first column (read back as "Unnamed: 0"). This matches the
# per-book CSVs, which are also saved without an index.
unique_number_of_locations.to_csv("wharton_locations.csv", index=False)