In [2]:
from src.util_functions.retrieve_data_from_csv import read_data_from_csv
import pandas as pd

# Read the characters CSV; the helper returns the rows as a sequence whose
# first entry holds the column names.
characters_filepath = "data/fifth_clean_up_data/stage_5_characters.csv"
characters_csv_data = read_data_from_csv(characters_filepath)

# Split the header row from the records and build the DataFrame.
header_row = characters_csv_data[0]
record_rows = characters_csv_data[1:]
characters_df = pd.DataFrame(data=record_rows, columns=header_row)

# Display the available columns as the cell output.
characters_df.columns

Index(['given_name', 'middle_name', 'maiden_name', 'surname', 'alias',
       'nickname', 'title_prefix', 'title_suffix', 'name_order', 'full_name',
       'fandom', 'gender', 'race', 'rpf_or_fic'],
      dtype='object')

In [3]:
# plotly imports

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
# Gender breakdown across every character in the set (all years combined).

# Total number of characters, counted as non-null full names.
total_chars = (
    characters_df.get(["full_name"])
    .count()
    .rename(index={"full_name": "total_num_of_characters"})
)
print(total_chars)

# Custom row order for the gender table (instead of alphabetical order).
gender_display_order = [
    "M | Other",
    "F | Other",
    "F",
    "Other",
    "M | F | Other",
    "Ambig",
    "M",
]

# Number of characters per gender label.
total_gender_percentages = (
    characters_df.get(["full_name", "gender"])
    .groupby("gender")
    .count()
    .rename(columns={"full_name": "count"})
)

# Swapping the index for a Categorical with explicit categories makes
# sort_index() follow the category order given above.
total_gender_percentages.index = pd.Categorical(
    total_gender_percentages.index,
    categories=gender_display_order,
)
total_gender_percentages = total_gender_percentages.sort_index()

print(total_gender_percentages) # TO VISUALISE



total_num_of_characters    922
dtype: int64
               count
M | Other          5
F | Other         10
F                368
Other              2
M | F | Other      4
Ambig             12
M                521


In [29]:
# Pie chart of the gender totals.

gender_labels = total_gender_percentages.index
gender_values = total_gender_percentages["count"]

# One colour per slice, listed in the same order as the rows of
# total_gender_percentages.
gender_slice_colours = [
    "darkturquoise",
    "red",
    "hotpink",
    "yellow",
    "gold",
    "green",
    "cornflowerblue",
]

gender_trace = go.Pie(
    labels=gender_labels,
    values=gender_values,
    textinfo="label+value",  # text shown for each slice
    textposition="outside",
    sort=False,  # keep the table's row order instead of sorting by size
    marker=dict(colors=gender_slice_colours),
)

gender_distr_pie = go.Figure(data=[gender_trace])
gender_distr_pie.update_layout(
    title="All characters' genders - ao3 data (incl. femslash rankings) from 2013-2023",
)
gender_distr_pie.show()


In [6]:
# Average number of men, women and other/ambiguous-gender characters per fandom.

# Boolean masks over the gender column; "X | Other" labels are counted with X.
gender_column = characters_df.gender
is_woman = gender_column.isin(["F", "F | Other"])
is_man = gender_column.isin(["M", "M | Other"])
is_other_or_ambiguous = ~(is_woman | is_man)

# where() keeps the gender value on rows matching the mask and puts NaN
# everywhere else, so the per-fandom count() below only counts matching rows.
average_gender_per_fandom = (
    characters_df.get(["fandom"])
    .assign(
        **{
            "men": gender_column.where(is_man),
            "women": gender_column.where(is_woman),
            "characters of other or ambiguous gender": gender_column.where(
                is_other_or_ambiguous
            ),
        }
    )
    .groupby(by="fandom", dropna=False)
    .count()
    .get(["men", "women", "characters of other or ambiguous gender"])
    .mean(0)  # mean of the per-fandom counts, column by column
    .round(2)
)

print(average_gender_per_fandom) # TO VISUALISE


men                                        2.50
women                                      1.80
characters of other or ambiguous gender    0.09
dtype: float64


In [7]:
# Race breakdown across every character in the set (all years combined).

# Number of characters per race label.
characters_per_race = characters_df.get(["full_name", "race"]).groupby("race").count()

# Rename the counted column and order the table from largest to smallest group.
total_race_percentages = characters_per_race.rename(
    columns={"full_name": "count"}
).sort_values(by="count", ascending=False)

# Preview: first ten labels in alphabetical order.
print(total_race_percentages.sort_index().head(10)) # TO VISUALISE


                          count
race                           
Af Lat                        4
Am Ind                        3
Am Ind / E Asian (Multi)      1
Ambig                        57
As Ind                        3
As Ind / S Asian (Multi)      1
Asian (Multi)                 3
Black                        13
Black (Multi)                 5
Central As                    1


In [30]:
# Pie chart of the race totals.

all_race_percent_labels = total_race_percentages.index
all_race_percent_values = total_race_percentages["count"]

race_trace = go.Pie(
    labels=all_race_percent_labels,
    values=all_race_percent_values,
    textinfo="label",  # text shown for each slice
    textposition="inside",
    insidetextorientation="horizontal",  # orientation of the slice text
    automargin=False,
)

all_race_percent_pie = go.Figure(data=[race_trace])
all_race_percent_pie.update_layout(
    title="All characters' racial groups - ao3 2013-2023",
    # Labels that cannot be drawn at the minimum size are hidden.
    uniformtext_minsize=10,
    uniformtext_mode="hide",
)
all_race_percent_pie.show()

In [9]:
# Groupings of the race labels into broader categories.
#
# Every entry maps a group name to the list of race labels it contains,
# except "other", which maps each of its labels to a readable description.
# Insertion order is kept as-is because code iterating this dict sees the
# groups in this order.
all_groupings = {
    "north, west, middle and eastern europe": [
        "White",
        "White (Multi)",
        "Romani",
        "Eu Ind (Multi)",
    ],
    "black (incl afro-latin)": ["Black", "Black (Multi)", "Af Lat"],
    "south europe and (rest of) latin": [
        "Latin",
        "Latin (Multi)",
        "SE Eu",
        "SE Eu (Multi)",
    ],
    "middle-east and north-africa": ["MENA", "MENA (Multi)"],
    "east asia": ["E Asian", "E Asian (Multi)"],
    "(rest of) asia": [
        "S Asian",
        "S Asian (Multi)",
        "SE Asian",
        "SE Asian (Multi)",
        "As Ind",
        "As Ind / S Asian (Multi)",
        "Asian (Multi)",
        "Central As",
    ],
    "american & polynesian indigenous": [
        "Am Ind",
        "Am Ind / E Asian (Multi)",
        "Māori Ind",
        "Māori Ind (Multi)",
    ],
    # label -> description
    "other": {
        "Ambig": "ambiguous or differing casting",
        "N.H.": "non-human",
        "Unknown": "unknown",
    },
}

# template_multi_plot_donuts = make_subplots(rows=2, cols=4, specs=[[
#     {'type':'domain'}, {'type':'domain'}, {'type':'domain'}, {'type':'domain'}, 
# ],[
#     {'type':'domain'}, {'type':'domain'}, {'type':'domain'}, {'type':'domain'}, 
# ]], )
#     # make one col & one specs dict per subplot you need
#     # I would assume we can also add more rows?? to be tested

# row_counter = 1
# col_counter = 1
# for group in all_groupings:
#     if group != "other":
#         template_multi_plot_donuts.add_trace(go.Pie(
#             labels=all_groupings[group], 
#             values=[total_race_percentages.loc[index]["count"] for index in all_groupings[group]], #the values are the issue ok
#             title=group, 
#             scalegroup='one',
#             #sort=False, # if you want to keep it in its original order rather than sorting by size
#         ), row_counter, col_counter)
#         if col_counter == 4:
#             row_counter += 1
#             col_counter = 1
#         else: col_counter += 1

# template_multi_plot_donuts.add_trace(go.Pie(
#     labels=[all_groupings["other"][key] for key in all_groupings["other"]], 
#     values=[total_race_percentages.loc[index]["count"] for index in all_groupings["other"]],
#     title="other",
#     scalegroup='one',
#     #sort=False, # if you want to keep it in its original order rather than sorting by size
# ), 2, 4) 

# #colours = [] # colours in same order as labels once again

# template_multi_plot_donuts.update_traces(
#     textinfo='label+percent',
#     textposition='inside',
#     # marker=dict(
#     #     colors=colours, # to use colours
#     #     line=dict(color='#000000', width=2) # to add outline
#     # )
# )
# template_multi_plot_donuts.update_layout(
#     title="Diagram title goes here", 
#     showlegend=False,
#     uniformtext_minsize=10, 
#     uniformtext_mode='hide'
# )

# template_multi_plot_donuts.show()

In [27]:
# Stacked bar chart of the smaller racial groups.
#
# One bar per grouping from all_groupings, one stacked segment per race label.
# White and east asian characters and the "other" labels (ambiguous, non-human,
# unknown) are left out, as stated in the chart title.

template_stack = go.Figure()

# Reverse lookup: race label -> name of the grouping that contains it.
# The "other" entry is a dict; iterating a dict yields its keys, so the same
# comprehension handles both the lists and that dict.
race_to_group = {
    race_label: group_name
    for group_name, race_labels in all_groupings.items()
    for race_label in race_labels
}

# Groupings that are excluded from this chart entirely.
excluded_groups = {"east asia", "other"}

# Segments are added in the row order of total_race_percentages.
for index in total_race_percentages.index:
    group = race_to_group.get(index)

    # East asian labels are excluded by exact group membership. A substring
    # test such as `"E Asian" in index` also matches "SE Asian" and would
    # wrongly drop south-east asian characters from the chart.
    if "White" in index or group in excluded_groups:
        continue

    # The bar (x category) is chosen fresh for every label, so a value from
    # a previous iteration can never be reused by accident.
    if group is None:
        # Label missing from all_groupings: show it as its own bar so the
        # gap is visible rather than stacking it under an unrelated group.
        stack_label = index
    elif group == "north, west, middle and eastern europe":
        # White labels were skipped above, so only these remain in the group.
        stack_label = "romani & european indigenous"
    else:
        stack_label = group

    # add trace to figure
    template_stack.add_trace(
        go.Bar(
            # x must be array-like; a single value per trace is what groups
            # the segments into one stack.
            x=[stack_label],
            # value portrayed by this segment of the stack
            y=total_race_percentages.loc[index],
            # label drawn inside the segment
            text=index,
            textposition="inside",
        )
    )

template_stack.update_layout(
    barmode='stack',
    showlegend=False,
    title="Racial groups excluding east asians, white people, and ambiguous, unknown and non-human characters",
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    xaxis_tickangle=10,
)

template_stack.show()


In [11]:
# Find fandoms with the lowest (i.e. no) racial diversity: how many fandoms
# contain a single racial group versus several.

# Number of characters for every (fandom, race) pair.
racial_div_by_fandom = (
    characters_df.get(["full_name", "fandom", "race"])
    .groupby(["fandom", "race"])
    .count()
    .rename(columns={"full_name": "count"})
)

# Total = number of unique fandom names.
total_fandom_count = characters_df.get(["fandom"]).nunique()["fandom"]

# Number of distinct racial groups recorded for each fandom.
racial_groups_per_fandom = racial_div_by_fandom.groupby(level="fandom").count()["count"]

# Fandoms in which exactly one racial group appears.
monoracial_fandom_count = (racial_groups_per_fandom == 1).sum()

plural_vs_monoracial_fandoms = pd.DataFrame(
    {
        "count": [
            total_fandom_count,
            monoracial_fandom_count,
            total_fandom_count - monoracial_fandom_count,  # total minus monoracial
        ]
    },
    index=[
        "total_fandoms",
        "fandoms_with_only_one_racial_group",
        "fandoms_with_multiple_racial_groups",
    ],
)
# On the current data: 210 fandoms in total, 123 of which contain only one
# racial group.

print(plural_vs_monoracial_fandoms) # TO VISUALISE!


                                     count
total_fandoms                          210
fandoms_with_only_one_racial_group     123
fandoms_with_multiple_racial_groups     87


In [12]:
# Find the fandoms with the highest racial diversity
# (Genshin would likely be in here).

# Number of distinct racial groups recorded for each fandom.
racial_group_counts = racial_div_by_fandom.groupby(level="fandom").count()

# Keep only fandoms with more than one racial group, largest first.
multiracial_fandoms = racial_group_counts[racial_group_counts["count"] > 1]
highest_racial_div = multiracial_fandoms.sort_values(
    by="count", ascending=False
).head(6)  # top six only; everything else was under 5

print(highest_racial_div) # TO VISUALISE


                     count
fandom                    
Marvel                   8
The 100                  7
DC                       7
Star Wars                6
Teen Wolf                5
Genshin Impact | 原神      5


In [13]:
# Average number of racial groups per fandom.

# Number of distinct racial groups recorded for each fandom.
groups_in_each_fandom = racial_div_by_fandom.groupby(level="fandom").count()

# Mean over all fandoms, rounded to two decimals, with a descriptive label.
average_racial_div = (
    groups_in_each_fandom.mean(axis=0)
    .round(2)
    .rename(index={"count": "average no of racial groups per fandom overall"})
)

print(average_racial_div) # TO VISUALISE

average no of racial groups per fandom overall    1.7
dtype: float64
