In [1]:
import pandas as pd
import json
import glob
import os

In [2]:
# Folder containing the input files. They hold JSON but carry a ".book"
# extension, hence the glob pattern below.
json_folder = "Files/"

# glob.glob() returns matches in arbitrary, OS-dependent order. A later cell
# assigns the cleaned-up titles to rows by position, so the file order has to
# be reproducible: sort case-insensitively (lower-case and capitalised file
# names are interleaved alphabetically).
json_files = sorted(
    glob.glob(os.path.join(json_folder, "*.book")),
    key=str.lower,
)

In [3]:
# One record (dict) per book: its title plus a count for every gender label seen
all_books_data = []

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        book_data = json.load(f)

    # Each character entry may carry a "g" mapping whose "argmax" value is the
    # gender label; a missing or empty "g" yields None here.
    labels = [
        (ch.get("g") or {}).get("argmax")
        for ch in book_data.get("characters", [])
    ]

    # Tally the labels, ignoring characters without one
    gender_counts = {}
    for label in labels:
        if label:
            gender_counts[label] = gender_counts.get(label, 0) + 1

    # Use the "title" field when present, otherwise fall back to the file name
    book_title = book_data.get("title", os.path.basename(file_path))

    # Title first, then one key per gender label that occurred in this book
    book_record = {"Titles": book_title, **gender_counts}
    all_books_data.append(book_record)

In [4]:
# Convert the list of per-book dictionaries to a DataFrame (one row per book).
# A label that never occurred in a given book is missing from that book's
# record and shows up as NaN, so replace those with 0.
df = pd.DataFrame(all_books_data).fillna(0)

# NaN forces a column to float, so after filling, such columns hold 0.0 / 1.0
# instead of whole numbers. Cast every count column (everything but "Titles")
# back to int so the counts display and export as integers.
count_columns = [col for col in df.columns if col != "Titles"]
df = df.astype({col: int for col in count_columns})

In [5]:
# Display the per-book counts: one row per input file, one column per label.
df

Unnamed: 0,Titles,he/him/his,she/her,they/them/their,ze/zem/zir/hir
0,a_connecticut_yankee.book,379,86,359,0.0
1,a_horse_tale.book,54,39,52,0.0
2,Personal_Recollections_of_Joan_of_Arc.book,208,90,245,0.0
3,The_Adventures_of_Tom_Sawyer.book,148,57,165,0.0
4,The_American_Claimant_NER.book,215,59,151,0.0
5,The_Gilded_Age.book,391,174,366,0.0
6,The_Mysterious_Stranger.book,112,40,118,0.0
7,The_Prince_and_the_Pauper.book,259,48,181,1.0
8,The_Tragedy_of_Pudd’nhead_Wilson.book,167,67,99,0.0
9,Tom_Sawyer_Abroad.book,77,23,74,0.0


Let's just fix the titles so that we can later on combine dataframes if we want to do that.

In [6]:
# Human-readable titles, listed in the same order as the rows of `df`
# (they are assigned to the rows by position).
book = [
    "A Connecticut Yankee",
    "A Horse Tale",
    "Personal Recollections of Joan of Arc",
    "The Adventures of Tom Sawyer",
    "The American Claimant",
    "The Gilded Age",
    "The Mysterious Stranger",
    "The Prince and the Pauper",
    "The Tragedy of Pudd’nhead Wilson",
    "Tom Sawyer Abroad",
    "Tom Sawyer Detective",
]

In [7]:
# Replace the file-name titles with the readable ones, matched by row position
# (raises if `book` and `df` differ in length).
df = df.assign(Titles=book)

And let's add one extra column with the name of the author

In [8]:
# Number of rows, i.e. how many books were loaded.
len(df)

11

In [9]:
# Every book in this frame is by the same author. Assigning a scalar lets
# pandas broadcast it to all rows, so this no longer depends on a hard-coded
# row count that breaks when the number of input files changes.
df = df.assign(Author="Mark Twain")

In [10]:
# Swap the pronoun-style column labels for short category names.
# Labels not listed here (e.g. 'ze/zem/zir/hir') keep their original name.
df = df.rename(
    columns={
        'he/him/his': 'male',
        "they/them/their": "plural",
        "she/her": "female",
    }
)

In [11]:
# Display the frame again to check the new titles, author column and column names.
df

Unnamed: 0,Titles,male,female,plural,ze/zem/zir/hir,Author
0,A Connecticut Yankee,379,86,359,0.0,Mark Twain
1,A Horse Tale,54,39,52,0.0,Mark Twain
2,Personal Recollections of Joan of Arc,208,90,245,0.0,Mark Twain
3,The Adventures of Tom Sawyer,148,57,165,0.0,Mark Twain
4,The American Claimant,215,59,151,0.0,Mark Twain
5,The Gilded Age,391,174,366,0.0,Mark Twain
6,The Mysterious Stranger,112,40,118,0.0,Mark Twain
7,The Prince and the Pauper,259,48,181,1.0,Mark Twain
8,The Tragedy of Pudd’nhead Wilson,167,67,99,0.0,Mark Twain
9,Tom Sawyer Abroad,77,23,74,0.0,Mark Twain


And now let's save that dataframe to a CSV file!

In [12]:
# Write the table to disk in the current working directory. With the default
# settings the row index is written too, as a first column with no header.
df.to_csv(path_or_buf="twain_gender_counts.csv")