In [1]:
import pandas as pd
import json
import glob
import os

In [2]:
# Folder containing the JSON files (they are stored with a ".book" extension)
json_folder = "Files/"

# glob.glob returns paths in arbitrary, filesystem-dependent order. A later cell
# assigns a hand-written, alphabetically ordered list of titles to the rows by
# position, so the files must be read in a stable alphabetical order.
# The sort is case-insensitive because file names mix upper and lower case
# (e.g. "Dombey_and_son.book"), which a plain sort would place first.
json_files = sorted(glob.glob(os.path.join(json_folder, "*.book")), key=str.lower)

In [3]:
# Accumulate one record (a plain dict) per book; converted to a DataFrame later
all_books_data = []

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as handle:
        book_data = json.load(handle)

    # Tally how often each "argmax" gender label occurs among the characters.
    # Characters with a missing/empty "g" entry or without an "argmax" value
    # are skipped.
    gender_counts = {}
    for character in book_data.get("characters", []):
        label = (character.get("g") or {}).get("argmax")
        if not label:
            continue
        gender_counts[label] = gender_counts.get(label, 0) + 1

    # Use the "title" field when present, otherwise fall back to the file name
    book_title = book_data.get("title", os.path.basename(file_path))

    # One flat record: the title plus one key per gender label seen in this book
    all_books_data.append({"Titles": book_title, **gender_counts})

In [4]:
# Build the table in one step: one row per book record, one column per key.
# A book that lacks a given gender label gets NaN in that column, which is
# replaced by 0 here.
df = pd.DataFrame(all_books_data).fillna(0)

In [5]:
# Display the per-book gender counts as the cell output
df

Unnamed: 0,Titles,he/him/his,they/them/their,she/her
0,a_christmas_carol.book,92,82,33
1,a_tale_of_two_cities.book,360,321,147
2,barnaby_rudge.book,548,675,200
3,bleak_house.book,671,514,383
4,david_copperfield.book,578,407,431
5,Dombey_and_son.book,608,505,419
6,great_expectations.book,414,212,140
7,hard_times.book,202,197,190
8,little_dorrit.book,666,628,479
9,martin_chuzzlewit.book,700,707,344


Let's just fix the titles so that we can later combine dataframes if we want to do that.

In [6]:
# Human-readable titles, listed alphabetically; used to replace the values in
# the "Titles" column, so the order here must match the row order of df.
book = [
    "A Christmas Carol",
    "A Tale of Two Cities",
    "Barnaby Rudge",
    "Bleak House",
    "David Copperfield",
    "Dombey and Son",
    "Great Expectations",
    "Hard Times",
    "Little Dorrit",
    "Martin Chuzzlewit",
    "Nicholas Nickleby",
    "Old Curiosity",
    "Oliver Twist",
    "Our Mutual Friend",
    "The Battle of Life",
    "The Chimes",
    "The Cricket",
    "The Haunted Man",
    "The Pickwick Papers",
]

In [7]:
# Replace the existing "Titles" values with the readable ones. The assignment
# is positional, so `book` must list the titles in the same order as the rows
# of df (and have exactly as many entries).
df = df.assign(Titles=book)

And let's add one extra column with the name of the author

In [8]:
# Number of rows (one per book) in the table
df.shape[0]

19

In [9]:
# Every row gets the same author. Assigning a scalar lets pandas broadcast the
# value to all rows, so this does not depend on a hard-coded row count and
# keeps working when the number of books changes.
df["Author"] = "Charles Dickens"

And let's rename "he/him/his" to male, "they/them/their" to plural, and "she/her" to female

In [10]:
# Map the pronoun-style column labels onto shorter names
pronoun_to_label = {
    "he/him/his": "male",
    "they/them/their": "plural",
    "she/her": "female",
}
df = df.rename(columns=pronoun_to_label)

In [11]:
# Display the table again to check the new titles, column names and Author column
df

Unnamed: 0,Titles,male,plural,female,Author
0,A Christmas Carol,92,82,33,Charles Dickens
1,A Tale of Two Cities,360,321,147,Charles Dickens
2,Barnaby Rudge,548,675,200,Charles Dickens
3,Bleak House,671,514,383,Charles Dickens
4,David Copperfield,578,407,431,Charles Dickens
5,Dombey and Son,608,505,419,Charles Dickens
6,Great Expectations,414,212,140,Charles Dickens
7,Hard Times,202,197,190,Charles Dickens
8,Little Dorrit,666,628,479,Charles Dickens
9,Martin Chuzzlewit,700,707,344,Charles Dickens


And now let's save that DataFrame to a CSV file!

In [12]:
# Write the table to a CSV file in the current working directory.
# to_csv is called with its default settings, so the row index is written too.
output_csv = "dickens_gender_counts.csv"
df.to_csv(output_csv)