In [112]:
from src.util_functions.retrieve_data_from_csv import read_data_from_csv
import pandas as pd

# Load the stage-5 character data through the project's CSV helper.
characters_filepath = "data/fifth_clean_up_data/stage_5_characters.csv"
characters_csv_data = read_data_from_csv(characters_filepath)

# The first row supplies the column labels; every remaining row is one record.
header_row = characters_csv_data[0]
record_rows = characters_csv_data[1:]
characters_df = pd.DataFrame(data=record_rows, columns=header_row)

# Bare last expression so the notebook displays the available columns.
characters_df.columns

Index(['given_name', 'middle_name', 'maiden_name', 'surname', 'alias',
       'nickname', 'title_prefix', 'title_suffix', 'name_order', 'full_name',
       'fandom', 'gender', 'race', 'rpf_or_fic'],
      dtype='object')

In [120]:
# Gender breakdown over the entire character set (all time).
# Note: the figures produced here are raw counts per gender label.

# Number of characters that have a full_name, relabelled for readable output.
total_chars = (
    characters_df.get(["full_name"])
    .count()
    .rename(index={"full_name": "total_num_of_characters"})
)
print(total_chars)

# Characters per gender label; the counted full_name column is renamed "count".
total_gender_percentages = (
    characters_df.get(["full_name", "gender"])
    .groupby("gender")
    .count()
    .rename(columns={"full_name": "count"})
)
print(total_gender_percentages.sort_values(by="count", ascending=False)) # TO VISUALISE



total_num_of_characters    922
dtype: int64
               count
gender              
M                520
F                368
Ambig             12
F | Other         10
M | Other          5
M | F | Other      4
Other              2


In [126]:
# Average number of men / women / other-or-ambiguous characters per fandom.

gender_labels = characters_df.gender

# Boolean masks for the three buckets. "F | Other" is counted with the women and
# "M | Other" with the men; every remaining label lands in the third bucket.
is_woman = gender_labels.eq("F") | gender_labels.eq("F | Other")
is_man = gender_labels.eq("M") | gender_labels.eq("M | Other")
is_other_or_ambiguous = ~(is_woman | is_man)

# Keep the gender label only where the mask holds (NaN elsewhere), so that a
# per-fandom count() tallies just the characters belonging to each bucket.
gender_buckets_by_character = characters_df.get(["fandom"]).assign(**{
    "men": gender_labels.where(is_man),
    "women": gender_labels.where(is_woman),
    "characters of other or ambiguous gender": gender_labels.where(is_other_or_ambiguous),
})

# Count each bucket per fandom (rows without a fandom form their own group),
# then average those counts across fandoms.
average_gender_per_fandom = (
    gender_buckets_by_character
    .groupby(by="fandom", dropna=False)
    .count()
    .mean(0)
    .round(2)
)

print(average_gender_per_fandom) # TO VISUALISE


men                                        2.50
women                                      1.80
characters of other or ambiguous gender    0.09
dtype: float64


In [115]:
# Race breakdown over the entire character set (all time).
# Note: the figures produced here are raw counts per race label.

total_race_percentages = (
    characters_df.get(["full_name", "race"])
    .groupby("race")
    .count()
    .rename(columns={"full_name": "count"})
)

# Show only the ten largest race groups.
print(total_race_percentages.sort_values(by="count", ascending=False).head(10)) # TO VISUALISE




                 count
race                  
White              537
E Asian            163
Ambig               57
N.H.                35
Latin               27
Black               13
E Asian (Multi)     13
MENA                11
SE Asian             9
Latin (Multi)        7


In [130]:
# Find fandoms with the lowest (i.e. no) racial diversity.

# Number of characters for every (fandom, race) pair that occurs in the data.
racial_div_by_fandom = (
    characters_df.get(["full_name", "fandom", "race"])
    .groupby(["fandom", "race"])
    .count()
    .rename(columns={"full_name": "count"})
)

# Number of distinct fandom names in the whole character table.
total_fandoms = characters_df.get(["fandom"]).nunique()["fandom"]

# Number of race groups recorded for each fandom.
races_per_fandom = racial_div_by_fandom.groupby(level="fandom").count()

# Blank out every (fandom, race) row whose fandom has more than one race group;
# each surviving row then stands for exactly one single-race fandom.
monoracial_fandoms = racial_div_by_fandom.where(races_per_fandom == 1).count()["count"]

# One-column summary: total, single-race, and (total minus single-race) fandoms.
plural_vs_monoracial_fandoms = pd.Series({
    "total_fandoms": total_fandoms,
    "fandoms_with_only_one_racial_group": monoracial_fandoms,
    "fandoms_with_multiple_racial_groups": total_fandoms - monoracial_fandoms,
}).to_frame(name="count")
# Recorded run: 210 fandoms in total, 123 of them with only one race group.

print(plural_vs_monoracial_fandoms) # TO VISUALISE!


                                     count
total_fandoms                          210
fandoms_with_only_one_racial_group     123
fandoms_with_multiple_racial_groups     87


In [117]:
# Find the fandoms with the highest racial diversity (Genshin would likely be in here).

# Number of race groups recorded for each fandom.
race_groups_per_fandom = racial_div_by_fandom.groupby(level="fandom").count()

# Keep only the (fandom, race) rows of fandoms with more than one race group,
# then index the surviving rows by fandom alone.
multiracial_rows = (
    racial_div_by_fandom
    .where(race_groups_per_fandom > 1)
    .droplevel(1)
    .dropna()
)

# Rows per fandom == number of race groups in that fandom; keep the top six.
highest_racial_div = (
    multiracial_rows.groupby(multiracial_rows.index)
    .count()
    .sort_values(by="count", ascending=False)
    .head(6) # everything else was under 5
)

print(highest_racial_div) # TO VISUALISE


                     count
fandom                    
Marvel                   8
The 100                  7
DC                       7
Star Wars                6
Teen Wolf                5
Genshin Impact | 原神      5


In [118]:
# Average number of race groups per fandom.

# Count the race groups of every fandom, average those counts, and give the
# single resulting entry a descriptive label.
average_racial_div = (
    racial_div_by_fandom.groupby(level="fandom")
    .count()
    .mean(0)
    .round(2)
    .rename(index={"count": "average no of racial groups per fandom overall"})
)

print(average_racial_div) # TO VISUALISE

average no of racial groups per fandom overall    1.7
dtype: float64
