In [1]:
import pandas as pd
import json
import glob
import os

In [2]:
# Folder containing the ".book" files (JSON documents, one per book)
json_folder = "Files/"
# glob.glob returns matches in arbitrary, OS-dependent order; sort them so the
# rows come out in the same (alphabetical) order on every run and machine.
# Later cells assign titles by position, so a stable order matters.
json_files = sorted(glob.glob(os.path.join(json_folder, "*.book")))

In [3]:
# One record (dict) per book: the title plus a count for each gender label
all_books_data = []

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        book_data = json.load(f)

    # "characters" may be absent or explicitly null; treat both as "no characters"
    characters = book_data.get("characters") or []
    gender_counts = {}

    # Tally characters by the label stored under "g" -> "argmax"
    for ch in characters:
        g = ch.get("g") or {}  # "g" may be missing or null for a character
        gender = g.get("argmax")
        if gender:  # skip characters that carry no label
            gender_counts[gender] = gender_counts.get(gender, 0) + 1

    # Use the title stored in the file, falling back to the file name
    book_title = book_data.get("title", os.path.basename(file_path))

    # One row per book; the count columns are whatever labels occurred in it
    # (e.g. "she/her", "he/him/his", "they/them/their")
    book_record = {"Titles": book_title}
    book_record.update(gender_counts)
    all_books_data.append(book_record)

In [4]:
# Convert the list of per-book records to a DataFrame.
# A label that never occurs in a given book is missing from its record and
# shows up as NaN, so fill those gaps with 0.
df = pd.DataFrame(all_books_data).fillna(0)

# NaN forces a column to float, so counts would read 84.0 instead of 84 after
# the fill. Cast every count column (everything except "Titles") back to int.
df = df.astype({column: "int64" for column in df.columns if column != "Titles"})

In [5]:
# Display the per-book counts table
df

Unnamed: 0,Titles,she/her,he/him/his,they/them/their
0,a_dark_nights_work.book,84,149,151
1,cranford.book,113,133,143
2,mary_barton.book,224,347,354
3,my_lady_ludlow.book,127,207,157
4,north_and_south.book,206,306,423
5,ruth.book,196,267,312
6,sylvia_lovers.book,270,349,474
7,wives_and_daughters.book,374,350,510


Let's fix the titles so that we can combine dataframes later on if we want to.

In [6]:
# Human-readable titles, listed in the same order as the rows of df
book = [
    "A Dark Nights Work",
    "Cranford",
    "Mary Barton",
    "My Lady Ludlow",
    "North and South",
    "Ruth",
    "Sylvia Lovers",
    "Wives and Daughters",
]

In [7]:
# Overwrite the file-name titles with the readable ones. The assignment is
# positional: the i-th entry of `book` goes to the i-th row, so the rows of df
# must be in the same order as the list.
df["Titles"] = book

And let's add one extra column with the name of the author

In [8]:
# Number of rows (books) in the frame
len(df)

8

In [9]:
# Assigning a scalar broadcasts it to every row, so this keeps working when the
# number of books changes (no hard-coded row count).
df["Author"] = "Elizabeth Gaskell"

In [10]:
# Replace the pronoun-style labels with short column names.
# Reassign the result instead of using inplace=True: it has no performance
# benefit and prevents method chaining.
df = df.rename(
    columns={
        "he/him/his": "male",
        "they/them/their": "plural",
        "she/her": "female",
    }
)

In [11]:
# Display the frame again to check the new titles, column names and Author column
df

Unnamed: 0,Titles,female,male,plural,Author
0,A Dark Nights Work,84,149,151,Elizabeth Gaskell
1,Cranford,113,133,143,Elizabeth Gaskell
2,Mary Barton,224,347,354,Elizabeth Gaskell
3,My Lady Ludlow,127,207,157,Elizabeth Gaskell
4,North and South,206,306,423,Elizabeth Gaskell
5,Ruth,196,267,312,Elizabeth Gaskell
6,Sylvia Lovers,270,349,474,Elizabeth Gaskell
7,Wives and Daughters,374,350,510,Elizabeth Gaskell


And now let's save the dataframe to a CSV file!

In [12]:
# The index is just the default 0..n-1 row number; leave it out so the file does
# not gain an unnamed first column that comes back as "Unnamed: 0" on read_csv.
df.to_csv("gaskell_gender_counts.csv", index=False)