In [1]:
import pandas as pd
import json
import glob
import os

In [2]:
# Folder containing the JSON files (saved with a ".book" extension)
json_folder = "Files/"
# glob returns paths in arbitrary, OS-dependent order; sort them so the row
# order of the resulting table is reproducible from run to run.
json_files = sorted(glob.glob(os.path.join(json_folder, "*.book")))

In [3]:
# Accumulate one record (a plain dict) per book file
all_books_data = []

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as handle:
        book_data = json.load(handle)

    # Tally characters by the "argmax" label found under their "g" entry
    gender_counts = {}
    for character in book_data.get("characters", []):
        label = (character.get("g") or {}).get("argmax")
        if not label:
            continue  # no "g" entry, or an empty/missing argmax: not counted
        gender_counts[label] = gender_counts.get(label, 0) + 1

    # Use the "title" field when present, otherwise fall back to the file name
    book_title = book_data.get("title", os.path.basename(file_path))

    # One row per book: the title first, then one key per label that was seen
    all_books_data.append({"Titles": book_title, **gender_counts})

In [4]:
# Convert the list of per-book dictionaries to a DataFrame and fill the gaps
# with 0 (a book with no character in some category has no key for it).
df = pd.DataFrame(all_books_data).fillna(0)

# A column that contained a gap is stored as float (181.0); cast every count
# column back to int so the counts stay whole numbers. "Titles" is left as is.
df = df.astype({col: int for col in df.columns if col != "Titles"})

In [5]:
# Show the per-book counts table (one row per file, one column per label)
df

Unnamed: 0,Titles,he/him/his,she/her,they/them/their
0,a_son_at_the_front.book,181,152,273
1,summer.book,85,83,119
2,the_age_of_innocence.book,194,193,278
3,the_children.book,141,168,218
4,the_custom_of_the_country.book,258,188,318
5,the_fruit_of_the_tree.book,190,146,252
6,the_glimpses_of_the_moon.book,87,104,216
7,the_house_of_mirth.book,156,162,237
8,the_mother_recompense.book,113,110,129
9,the_reef.book,106,122,165


Let's clean up the titles so that we can combine this dataframe with others later on if we want to.

In [6]:
# Display titles, listed in the same alphabetical order as the file names.
# NOTE(review): the published title is "The Mother's Recompense"; the spelling
# below mirrors the file name -- confirm which form other dataframes use.
book = [
    "A Son at the Front",
    "Summer",
    "The Age of Innocence",
    "The Children",
    "The Custom of the Country",
    "The Fruit of the Tree",
    "The Glimpses of the Moon",
    "The House of Mirth",
    "The Mother Recompense",
    "The Reef",
    "The Valley of Decision",
    "Twilight Sleep",
]

In [7]:
# Match each title to its row through the file-name slug
# ("The Reef" -> "the_reef.book") rather than by position, so the result does
# not depend on the order or number of files found. Rows without a match keep
# their current value, which also makes re-running this cell harmless.
title_by_file = {title.lower().replace(" ", "_") + ".book": title for title in book}
df["Titles"] = df["Titles"].map(title_by_file).fillna(df["Titles"])

And let's add one extra column with the name of the author

In [8]:
# Number of rows in the table, i.e. how many book files were loaded
len(df)

12

In [9]:
# Assign a scalar: pandas broadcasts it to every row, so this keeps working
# whatever the number of books (no hard-coded row count to keep in sync).
df["Author"] = "Edith Wharton"

In [10]:
# Rename the pronoun-based count columns to short labels. Reassigning (instead
# of inplace=True) keeps the cell a plain "new frame from old frame" step.
df = df.rename(
    columns={
        "he/him/his": "male",
        "she/her": "female",
        "they/them/their": "plural",
    }
)

In [11]:
# Show the table after the title clean-up, author column and column renames
df

Unnamed: 0,Titles,male,female,plural,Author
0,A Son at the Front,181,152,273,Edith Wharton
1,Summer,85,83,119,Edith Wharton
2,The Age of Innocence,194,193,278,Edith Wharton
3,The Children,141,168,218,Edith Wharton
4,The Custom of the Country,258,188,318,Edith Wharton
5,The Fruit of the Tree,190,146,252,Edith Wharton
6,The Glimpses of the Moon,87,104,216,Edith Wharton
7,The House of Mirth,156,162,237,Edith Wharton
8,The Mother Recompense,113,110,129,Edith Wharton
9,The Reef,106,122,165,Edith Wharton


And now let's save the dataframe to a CSV file!

In [12]:
# Write the table to a CSV file in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if readers of this file do not expect
# it -- confirm against whatever loads the CSV later.
df.to_csv("wharton_gender_counts.csv")