In [1]:
from src.util_functions.retrieve_data_from_csv import read_data_from_csv
import pandas as pd

# read from ships file
ships_filepath = "data/fifth_clean_up_data/stage_5_ships.csv"
ships_csv_data = read_data_from_csv(ships_filepath)

# make a df
ships_df = pd.DataFrame(ships_csv_data[1:], columns=ships_csv_data[0])
ships_df.columns

Index(['slash_ship', 'gen_ship', 'members_no', 'fandom', 'rpf_or_fic',
       'gender_combo', 'race_combo', 'member_1', 'member_2', 'member_3',
       'member_4'],
      dtype='object')

In [32]:
#all ships gender combo percentages
    #of which how many mlm vs het (incl x other) vs wlw

total_ships = ships_df.copy().get(["slash_ship"]).count()
total_ships = total_ships.rename(index={"slash_ship":"total_num_of_ships"})
print(total_ships)

total_gender_percentages = ships_df.copy().get(["slash_ship","gender_combo"]).groupby("gender_combo").count().rename(index={
    "F / M": "M / F",
    "Ambig / M": "M / Ambig",
    "Ambig / F": "F / Ambig",
    "M | Other / M": "M / M | Other"
})
total_gender_percentages = total_gender_percentages.groupby(total_gender_percentages.index).aggregate("sum")
total_gender_percentages = total_gender_percentages.rename(columns={"slash_ship": "count"}).sort_values(by="count")

print(total_gender_percentages) # TO VISUALISE

gender_combos_we_recognise = { # for annotation reference!
    "F / Other": "owl house witches",
    "F / F | Other": "sailor scouts",
    "M / Other": "venom",
    "M | Other / Ambig" : "loki x reader",
}

total_num_of_ships    600
dtype: int64
                               count
gender_combo                        
F / Ambig                          1
F / F | Other                      1
F / M / M                          1
F / Other                          1
M | Other / Ambig                  1
M | Other / M / M                  1
M / Other                          1
M | F | Other / M | F | Other      2
F | Other / F | Other              7
M / M | Other                      8
M / Ambig                         17
M / F                             85
F / F                            192
M / M                            282


In [42]:
# visualising gender stacks

import plotly.graph_objects as go
from plotly.subplots import make_subplots

gender_combo_fig=go.Figure()

fem_stack = ["F / F","F | Other / F | Other", "F / F | Other",]
masc_stack = ["M / M", "M / M | Other","M | Other / M / M",]
het_stack = ["M / F","F / Other","M / Other","F / M / M"]
ambig_gender_stack = ["M / Ambig","M | Other / Ambig", "F / Ambig","M | F | Other / M | F | Other"]

wlw_count = 0
mlm_count = 0
het_count = 0
ambig_count = 0

for combo in list(total_gender_percentages.index):
    if combo in fem_stack:
        label = "wlw"
        colours = ["red", "orange", "tomato"]
        colour = colours[wlw_count]
        wlw_count += 1
    elif combo in masc_stack:
        label = "mlm"
        colours = ["turquoise", "azure", "steelblue"]
        colour = colours[mlm_count]
        mlm_count += 1
    elif combo in het_stack:
        label = "non-same-sex"
        colours = ["silver", "grey", "gainsboro", "black"]
        colour = colours[het_count]
        het_count += 1
    elif combo in ambig_gender_stack:
        label = "ambiguous"
        colours = ["darkolivegreen", "limegreen", "mediumseagreen", "olive"]
        colour = colours[ambig_count]
        ambig_count += 1
    else: 
        print(combo)
    gender_combo_fig.add_trace(
        go.Bar(
            x=[label],
            y=total_gender_percentages.loc[combo],
            text=combo,
            marker_color=colour
        )
    )

gender_combo_fig.update_layout(
    barmode='stack', 
    showlegend=False, 
    title="Gender combinations of all ranked ships (incl. femslash ranking) - ao3 2013-2023"
)

gender_combo_fig.show()

In [59]:
#marketshare of fandoms (how many ships?)

ships_per_fandom = ships_df.copy().get(["fandom", "slash_ship", "gender_combo", "race_combo"])

fandom_market_share = ships_per_fandom.groupby("fandom").count()
fandom_market_share = fandom_market_share.where(
    (fandom_market_share["slash_ship"] / 600) >= 0.01
)["slash_ship"].sort_values(ascending=False)
# TO VISUALISE!

ships_per_fandom = ships_per_fandom.join(
    other=fandom_market_share, 
    on=ships_per_fandom.fandom, 
    how="inner", 
    rsuffix="_count"
).rename(
    columns={"slash_ship_count": "total_ships"}
)
ships_per_fandom.pop("key_0")


print(fandom_market_share.head(15))

fandom
Marvel                                               33.0
Youtube                                              24.0
Harry Potter Universe                                21.0
DC                                                   20.0
Homestuck                                            15.0
Genshin Impact | 原神                                  14.0
Supernatural                                         14.0
My Hero Academia | 僕のヒーローアカデミア                       14.0
Teen Wolf                                            13.0
Bangtan Boys / BTS                                   11.0
Stranger Things                                      11.0
Star Wars                                            10.0
Dragon Age                                           10.0
A Song of Ice and Fire / Game of Thrones Universe     9.0
Glee                                                  9.0
Name: slash_ship, dtype: float64


In [103]:

import plotly.express as px

labels = fandom_market_share.index
values = fandom_market_share.values

market_share_fig = go.Figure(
    data=[
        go.Pie(
            labels=labels,
            values=values,
            textinfo="label",
            insidetextorientation="horizontal",
            automargin=False,
            marker=dict(
                colors=[
                    "crimson", "red", "green", "dodgerblue", "orange", "gold"
                ] + px.colors.qualitative.Bold + px.colors.qualitative.Bold
            )
        )
    ]
)

market_share_fig.update_layout(
    title="Fandoms accounting for more than 1% of total ships during 2013-2023",
    showlegend=False,
    
)

market_share_fig.show()


In [4]:
#find fandoms with highest nums of wlw ships
#find fandoms with lowest nums of wlw ships

wlw_by_fandom = ships_per_fandom.copy().get(
    ["fandom", "total_ships", "gender_combo"]
)
wlw_by_fandom.gender_combo = wlw_by_fandom.gender_combo.where(
    (wlw_by_fandom.gender_combo == "F / F") | (wlw_by_fandom.gender_combo == "F | Other / F | Other") | (wlw_by_fandom.gender_combo == "F / F | Other")
)
wlw_by_fandom = wlw_by_fandom.groupby(["fandom"]).count().rename(
    columns={"gender_combo": "wlw_ships"}
).sort_values(by="wlw_ships", ascending=False)
wlw_by_fandom.insert(
    loc=2, 
    column="%_of_wlw_ships", 
    value=(wlw_by_fandom["wlw_ships"] / wlw_by_fandom["total_ships"] * 100).round(2)
)

no_wlw_fandoms = wlw_by_fandom["total_ships"].where(wlw_by_fandom["%_of_wlw_ships"] == 0).count()
only_wlw_fandoms = wlw_by_fandom["total_ships"].where(wlw_by_fandom["%_of_wlw_ships"] == 100).count()
most_wlw_ships = wlw_by_fandom["wlw_ships"].where(wlw_by_fandom["wlw_ships"] > 1).sort_values(ascending=False).dropna()
highest_percent_of_wlw = wlw_by_fandom["%_of_wlw_ships"].where(
    (wlw_by_fandom["wlw_ships"] > 1) & (wlw_by_fandom["%_of_wlw_ships"] < 100) & (wlw_by_fandom["%_of_wlw_ships"] >= 50)
).dropna().sort_values(ascending=False)

print(most_wlw_ships.head())
print(highest_percent_of_wlw) # (in mixed gender fandoms!)

fandom
DC                       11.0
Marvel                    9.0
Homestuck                 7.0
Steven Universe           7.0
Harry Potter Universe     6.0
Name: wlw_ships, dtype: float64
fandom
The 100                                75.00
Overwatch                              66.67
Riverdale                              66.67
Once Upon a Time                       66.67
Avatar: The last Airbender Universe    60.00
DC                                     55.00
Name: %_of_wlw_ships, dtype: float64


In [5]:
#”” for mlm ships

mlm_by_fandom = ships_per_fandom.copy().get(
    ["fandom", "total_ships", "gender_combo"]
)
mlm_by_fandom.gender_combo = mlm_by_fandom.gender_combo.where(
    (mlm_by_fandom.gender_combo == "M / M"
    ) | (mlm_by_fandom.gender_combo == "M | Other / M / M"
    ) | (mlm_by_fandom.gender_combo == "M / M | Other"
    ) | (mlm_by_fandom.gender_combo == "M | Other / M")
)
mlm_by_fandom = mlm_by_fandom.groupby(["fandom"]).count().rename(
    columns={"gender_combo": "mlm_ships"}
).sort_values(by="mlm_ships", ascending=False)
mlm_by_fandom.insert(
    loc=2, 
    column="%_of_mlm_ships", 
    value=(mlm_by_fandom["mlm_ships"] / mlm_by_fandom["total_ships"] * 100).round(2)
)

no_mlm_fandoms = mlm_by_fandom["total_ships"].where(mlm_by_fandom["%_of_mlm_ships"] == 0).count()
only_mlm_fandoms = mlm_by_fandom["total_ships"].where(mlm_by_fandom["%_of_mlm_ships"] == 100).count()
most_mlm_ships = mlm_by_fandom["mlm_ships"].where(mlm_by_fandom["mlm_ships"] > 1).sort_values(ascending=False).dropna()
highest_percent_of_mlm = mlm_by_fandom["%_of_mlm_ships"].where(
    (mlm_by_fandom["mlm_ships"] > 1) & (mlm_by_fandom["%_of_mlm_ships"] < 100) & (mlm_by_fandom["%_of_mlm_ships"] >= 50)
).dropna().sort_values(ascending=False)

print(most_mlm_ships.head(10))
print(highest_percent_of_mlm) # (in mixed gender fandoms!)

fandom
Youtube                           23.0
Marvel                            14.0
Bangtan Boys / BTS                11.0
Harry Potter Universe             10.0
My Hero Academia | 僕のヒーローアカデミア     9.0
One Direction                      8.0
Genshin Impact | 原神                8.0
DC                                 7.0
Haikyuu!! | ハイキュー!!                7.0
Homestuck                          6.0
Name: mlm_ships, dtype: float64
fandom
Youtube                           95.83
Haikyuu!! | ハイキュー!!               87.50
Voltron                           80.00
Naruto                            66.67
Criminal Minds                    66.67
Star Trek                         66.67
Yuri!!! on ICE | ユーリ!!! on ICE    66.67
My Hero Academia | 僕のヒーローアカデミア    64.29
Les Misérables                    60.00
Attack on Titan | 進撃の巨人           60.00
Genshin Impact | 原神               57.14
Star Wars                         50.00
Sherlock                          50.00
Hamilton                          50.00
Name

In [6]:
#”” for het ships

het_by_fandom = ships_per_fandom.copy().get(
    ["fandom", "total_ships", "gender_combo"]
)
het_by_fandom.gender_combo = het_by_fandom.gender_combo.where(
    (het_by_fandom.gender_combo == "F / M") | (het_by_fandom.gender_combo == "M / F")
)
het_by_fandom = het_by_fandom.groupby(["fandom"]).count().rename(
    columns={"gender_combo": "het_ships"}
).sort_values(by="het_ships", ascending=False)
het_by_fandom.insert(
    loc=2, 
    column="%_of_het_ships", 
    value=(het_by_fandom["het_ships"] / het_by_fandom["total_ships"] * 100).round(2)
)

no_het_fandoms = het_by_fandom["total_ships"].where(het_by_fandom["%_of_het_ships"] == 0).count()
only_het_fandoms = het_by_fandom["total_ships"].where(het_by_fandom["%_of_het_ships"] == 100).count()
most_het_ships = het_by_fandom["het_ships"].where(het_by_fandom["het_ships"] > 1).sort_values(ascending=False).dropna()
highest_percent_of_het = het_by_fandom["%_of_het_ships"].where(
    (het_by_fandom["het_ships"] > 1) & (het_by_fandom["%_of_het_ships"] < 100) & (het_by_fandom["%_of_het_ships"] >= 50)
).dropna().sort_values(ascending=False)

print(most_het_ships.head(3))
print(highest_percent_of_het) # (in mixed gender fandoms!)

# apparently GoT & Doctor Who have the best het pairings lmao they lead both rankings

fandom
Doctor Who                                           7.0
A Song of Ice and Fire / Game of Thrones Universe    6.0
Marvel                                               6.0
Name: het_ships, dtype: float64
fandom
Doctor Who                                           77.78
A Song of Ice and Fire / Game of Thrones Universe    66.67
Name: %_of_het_ships, dtype: float64


In [7]:
# combining these into one df now:

distr_of_gender_combos_dict = {
    "no_mlm_ship_fandoms": no_mlm_fandoms,
    "only_mlm_ship_fandoms": only_mlm_fandoms,
    "no_wlw_ship_fandoms": no_wlw_fandoms,
    "only_wlw_ship_fandoms": only_wlw_fandoms,
    "no_het_ship_fandoms": no_het_fandoms,
    "only_het_ship_fandoms": only_het_fandoms,
}
distr_of_gender_combos_series = pd.Series(distr_of_gender_combos_dict)
print(distr_of_gender_combos_series) # TO VISUALISE

no_mlm_ship_fandoms       77
only_mlm_ship_fandoms     85
no_wlw_ship_fandoms      105
only_wlw_ship_fandoms     55
no_het_ship_fandoms      163
only_het_ship_fandoms     11
dtype: int64


In [8]:
#how many mlm vs wlw vs het ships a fandom has on average

average_gender_combo_dict = {
    "ships": wlw_by_fandom["total_ships"].mean().round(2),
    "mlm": mlm_by_fandom["mlm_ships"].mean().round(2),
    "wlw": wlw_by_fandom["wlw_ships"].mean().round(2),
    "hets": het_by_fandom["het_ships"].mean().round(2)
}
average_gender_combo_per_fandom_series = pd.Series(average_gender_combo_dict)
print(average_gender_combo_per_fandom_series) # TO VISUALISE

ships    2.87
mlm      1.39
wlw      0.96
hets     0.41
dtype: float64


In [9]:
#all ships race combo percentages

from re import split

total_race_combo_counts = ships_df.get(["slash_ship","race_combo"])

unique_combos = sorted(list(set(total_race_combo_counts.race_combo)))
rename_dict = {}
for combo in unique_combos:
    split_version = split(r"\s\/\s", combo)
    sorted_version = sorted(split_version)
    reconcat_version = sorted_version[0]
    for item in sorted_version[1:]:
        reconcat_version += " / " + item
    if reconcat_version != combo:
        rename_dict[combo] = reconcat_version

total_race_combo_counts = total_race_combo_counts.groupby("race_combo").count().rename(
    index=rename_dict,
    columns={"slash_ship": "count"}
)
total_race_combo_counts = total_race_combo_counts.groupby(total_race_combo_counts.index).aggregate("sum").sort_values(
    by="count", ascending=False
)

print(total_race_combo_counts.head(20)) # TO VISUALISE
# top 20 (= any with 4+ ships) almost all involve white ppl, if not white ppl east asians instead, or nh/ambig, 
# no other poc/poc ships ranked that high!


                           count
race_combo                      
White                        286
E Asian                       83
Ambig / White                 36
Latin / White                 16
N.H.                          15
E Asian / White               15
Ambig                         14
Black / White                 11
Latin (Multi) / White          9
E Asian (Multi) / White        9
MENA / White                   8
Ambig / N.H.                   8
S Asian / White                6
Af Lat / White                 6
Ambig / E Asian                5
Black (Multi) / White          4
White / White (Multi)          4
E Asian / E Asian (Multi)      4
N.H. / White                   4
SE Asian (Multi) / White       4


In [10]:
#of which how many interracial vs same

interracial_ships = total_race_combo_counts.copy()
interracial_ships.insert(
    loc=1, 
    column="is_interracial_pairing",
    value=interracial_ships.index.str.contains("/")
)
interracial_ships.insert(
    loc=2, 
    column="is_ambig",
    value=interracial_ships.index.str.contains("Ambig")
)
# print(interracial_ships)

interracial_ships_counts = pd.Series(
    {
        "same_race_pairings": interracial_ships["count"].where(
            (interracial_ships.is_ambig == False) & (interracial_ships.is_interracial_pairing == False)
        ).aggregate("sum"),
        "interracial_pairings": interracial_ships["count"].where(
            (interracial_ships.is_ambig == False) & (interracial_ships.is_interracial_pairing == True)
        ).aggregate("sum"), 
        "ambiguous_pairings": interracial_ships["count"].where(
            interracial_ships.is_ambig == True
        ).aggregate("sum"), 
    }
)
interracial_ships_counts # TO VISUALISE

same_race_pairings      395.0
interracial_pairings    138.0
ambiguous_pairings       67.0
dtype: float64

In [20]:
# and how many do not involve white chars

non_white_ships = total_race_combo_counts.copy()
non_white_ships.insert(
    loc=1, 
    column="contains_white_person",
    value=non_white_ships.index.str.contains("White")
)
non_white_ships.insert(
    loc=2, 
    column="contains_e_asian_person",
    value=non_white_ships.index.str.contains("E Asian")
)
non_white_ships.insert(
    loc=3, 
    column="contains_ambig_person",
    value=non_white_ships.index.str.contains("Ambig")
)
non_white_ships.insert(
    loc=3, 
    column="contains_non_human",
    value=non_white_ships.index.str.contains("N.H.")
)

non_white_ships_counts = pd.Series(
    {
        "pairings_with_white_people": non_white_ships["count"].where(
            non_white_ships.contains_white_person == True
        ).aggregate("sum"), 
        "pairings_with_east_asian_people": non_white_ships["count"].where(
            non_white_ships.contains_e_asian_person == True
        ).aggregate("sum"), 
        "non_white_pairings": non_white_ships["count"].where(
            (non_white_ships.contains_ambig_person == False) & (
            non_white_ships.contains_non_human == False) & (
            non_white_ships.contains_white_person == False)
        ).aggregate("sum"),
        "non_white_or_east_asian_pairings": non_white_ships["count"].where(
            (non_white_ships.contains_ambig_person == False) & (
            non_white_ships.contains_non_human == False) & (
            non_white_ships.contains_white_person == False) & (
            non_white_ships.contains_e_asian_person == False)
        ).aggregate("sum"),
    }
)

non_white_ships_counts # this one's a big oof  # TO VISUALISE

pairings_with_white_people          440.0
pairings_with_east_asian_people     137.0
non_white_pairings                  113.0
non_white_or_east_asian_pairings     14.0
dtype: float64

In [18]:
#how much rpf vs fictional

rpf_vs_fic_df = ships_df.get(["slash_ship", "rpf_or_fic"])

rpf_vs_fic_df = rpf_vs_fic_df.groupby("rpf_or_fic").count().rename(columns={"slash_ship": "count"})

rpf_vs_fic_df["count"] # TO VISUALISE

rpf_or_fic
RPF           78
fictional    522
Name: count, dtype: int64