In [20]:
from visualisation.vis_utils.read_csv_to_df import df_from_csv
import pandas as pd

# read from ships file make a df
ships_df = df_from_csv("data/fifth_clean_up_data/stage_5_ships.csv")
ships_df.columns

Index(['slash_ship', 'gen_ship', 'members_no', 'fandom', 'rpf_or_fic',
       'gender_combo', 'race_combo', 'member_1', 'member_2', 'member_3',
       'member_4'],
      dtype='object')

In [21]:
#all ships gender combo percentages
    #of which how many mlm vs het (incl x other) vs wlw

total_ships = ships_df.copy().get(["slash_ship"]).count().rename(
    index={"slash_ship":"total_num_of_ships"}
)
print(total_ships)

total_gender_percentages = ships_df.copy().get(
    ["slash_ship","gender_combo"]
).groupby("gender_combo").count().rename(index={
    "F / M": "M / F",
    "Ambig / M": "M / Ambig",
    "Ambig / F": "F / Ambig",
    "M | Other / M": "M / M | Other"
})
total_gender_percentages = total_gender_percentages.groupby(
    total_gender_percentages.index
).aggregate("sum").rename(columns={"slash_ship": "count"}).sort_values(by="count")
print(total_gender_percentages) # TO VISUALISE

# gender_combos_we_recognise = { # for annotation reference!
#     "F / Other": "owl house witches",
#     "F / F | Other": "sailor scouts",
#     "M / Other": "venom",
#     "M | Other / Ambig" : "loki x reader",
# }

total_num_of_ships    600
dtype: int64
                               count
gender_combo                        
F / Ambig                          1
F / F | Other                      1
F / M / M                          1
F / Other                          1
M | Other / Ambig                  1
M | Other / M / M                  1
M / Other                          1
M | F | Other / M | F | Other      2
F | Other / F | Other              7
M / M | Other                      8
M / Ambig                         17
M / F                             85
F / F                            192
M / M                            282


In [22]:
# visualising gender stacks

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

gender_combo_fig=go.Figure()

wlw_count = 0
mlm_count = 0
het_count = 0
ambig_count = 0

for combo in list(total_gender_percentages.index):
    if combo in ["F / F","F | Other / F | Other", "F / F | Other",]:
        label = "wlw"
        colours = ["red", "orange", "tomato"]
        colour = colours[wlw_count]
        wlw_count += 1
    elif combo in ["M / M", "M / M | Other","M | Other / M / M",]:
        label = "mlm"
        colours = ["azure", "turquoise", "steelblue"]
        colour = colours[mlm_count]
        mlm_count += 1
    elif combo in ["M / F","F / Other","M / Other","F / M / M"]:
        label = "non-same-sex"
        colours = ["silver", "grey", "gainsboro", "black"]
        colour = colours[het_count]
        het_count += 1
    elif combo in ["M / Ambig","M | Other / Ambig", "F / Ambig","M | F | Other / M | F | Other"]:
        label = "ambiguous"
        colours = ["darkolivegreen", "limegreen", "mediumseagreen", "olive"]
        colour = colours[ambig_count]
        ambig_count += 1
    else: 
        print(combo)
    gender_combo_fig.add_trace(
        go.Bar(
            x=[label],
            y=total_gender_percentages.loc[combo],
            text=combo,
            marker_color=colour
        )
    )

gender_combo_fig.update_layout(
    barmode='stack', 
    showlegend=False, 
    title="Ship gender combinations (AO3 2013-2023)"
)
gender_combo_fig.show()

In [23]:
#marketshare of fandoms (how many ships?)

ships_per_fandom = ships_df.copy().get(["fandom", "slash_ship", "gender_combo", "race_combo"])

fandom_market_share = ships_per_fandom.copy().groupby("fandom").count()
fandom_market_share = fandom_market_share.where(
    (fandom_market_share["slash_ship"] / 600) >= 0.01
)["slash_ship"].sort_values(ascending=False)
# TO VISUALISE!

ships_per_fandom = ships_per_fandom.join(
    other=ships_per_fandom.copy().groupby("fandom").count()["slash_ship"], 
    on=ships_per_fandom.fandom, 
    how="inner", 
    rsuffix="_count"
).rename(
    columns={"slash_ship_count": "total_ships"}
)
ships_per_fandom.pop("key_0")

print(fandom_market_share.head(15))

fandom
Marvel                                               33.0
Youtube                                              24.0
Harry Potter Universe                                21.0
DC                                                   20.0
Homestuck                                            15.0
Genshin Impact | 原神                                  14.0
Supernatural                                         14.0
My Hero Academia | 僕のヒーローアカデミア                       14.0
Teen Wolf                                            13.0
Bangtan Boys / BTS                                   11.0
Stranger Things                                      11.0
Star Wars                                            10.0
Dragon Age                                           10.0
A Song of Ice and Fire / Game of Thrones Universe     9.0
Glee                                                  9.0
Name: slash_ship, dtype: float64


In [24]:
#visualising market share of fandoms

market_share_fig = go.Figure(
    data=[
        go.Pie(
            labels=fandom_market_share.index,
            values=fandom_market_share.values,
            textinfo="label",
            insidetextorientation="horizontal",
            automargin=False,
            marker=dict(
                colors=[
                    "crimson", "red", "green", "dodgerblue", "orange", "gold"
                ] + px.colors.qualitative.Bold + px.colors.qualitative.Bold
            )
        )
    ]
)
market_share_fig.update_layout(
    title="Fandoms accounting for more than 1% of ships (AO3 2013-2023)",
    showlegend=False,
)
market_share_fig.show()

In [25]:
#refactoring percentages for ship gender combos

ships_per_fandom_by_type = ships_per_fandom.copy().get(
    ["fandom", "total_ships", "gender_combo"]
)

# how many ships of type by fandom
ships_per_fandom_by_type["wlw_ships"] = ships_per_fandom_by_type.gender_combo.where(
    (ships_per_fandom_by_type.gender_combo == "F / F"
    ) | (ships_per_fandom_by_type.gender_combo == "F | Other / F | Other"
    ) | (ships_per_fandom_by_type.gender_combo == "F / F | Other")
)
ships_per_fandom_by_type["mlm_ships"] = ships_per_fandom_by_type.gender_combo.where(
    (ships_per_fandom_by_type.gender_combo == "M / M"
    ) | (ships_per_fandom_by_type.gender_combo == "M | Other / M / M"
    ) | (ships_per_fandom_by_type.gender_combo == "M / M | Other"
    ) | (ships_per_fandom_by_type.gender_combo == "M | Other / M")
)
ships_per_fandom_by_type["het_ships"] = ships_per_fandom_by_type.gender_combo.where(
    (ships_per_fandom_by_type.gender_combo == "F / M"
    ) | (ships_per_fandom_by_type.gender_combo == "M / F")
)
ships_per_fandom_by_type.pop("gender_combo")
ships_per_fandom_by_type = ships_per_fandom_by_type.groupby(["fandom"]).count() 
    # this makes fandom the index/columns -> no longer counted for length

for ship_type in ["mlm", "wlw", "het"]:
    # percent of total that is
    ships_per_fandom_by_type[f"%_of_{ship_type}_ships"] = (
        ships_per_fandom_by_type[f"{ship_type}_ships"] / ships_per_fandom_by_type["total_ships"] * 100
    ).round(2)

    # diff conditions it fulfills
    ships_per_fandom_by_type[f"no_{ship_type}"] = ships_per_fandom_by_type["total_ships"].where(
        ships_per_fandom_by_type[f"%_of_{ship_type}_ships"] == 0
    )
    ships_per_fandom_by_type[f"all_{ship_type}"] = ships_per_fandom_by_type["total_ships"].where(
        ships_per_fandom_by_type[f"%_of_{ship_type}_ships"] == 100
    )
    ships_per_fandom_by_type[f"more_than_50%_{ship_type}"] = ships_per_fandom_by_type[f"%_of_{ship_type}_ships"].where(
        (ships_per_fandom_by_type[f"{ship_type}_ships"] > 1
        ) & (ships_per_fandom_by_type[f"%_of_{ship_type}_ships"] < 100
        ) & (ships_per_fandom_by_type[f"%_of_{ship_type}_ships"] >= 50)
    )

print(ships_per_fandom_by_type.transpose())

fandom             5 Seconds of Summer  9-1-1  \
total_ships                        1.0    2.0   
wlw_ships                          0.0    0.0   
mlm_ships                          1.0    2.0   
het_ships                          0.0    0.0   
%_of_mlm_ships                   100.0  100.0   
no_mlm                             NaN    NaN   
all_mlm                            1.0    2.0   
more_than_50%_mlm                  NaN    NaN   
%_of_wlw_ships                     0.0    0.0   
no_wlw                             1.0    2.0   
all_wlw                            NaN    NaN   
more_than_50%_wlw                  NaN    NaN   
%_of_het_ships                     0.0    0.0   
no_het                             1.0    2.0   
all_het                            NaN    NaN   
more_than_50%_het                  NaN    NaN   

fandom             A Song of Ice and Fire / Game of Thrones Universe  \
total_ships                                                     9.00   
wlw_ships             

In [26]:
# making refactored diff amount df

total_gender_combos_dict = {
    "no_mlm_ship_fandoms": ships_per_fandom_by_type["no_mlm"].count(),
    "more_than_50%_mlm": ships_per_fandom_by_type["more_than_50%_mlm"].count(),
    "only_mlm_ship_fandoms": ships_per_fandom_by_type["all_mlm"].count(),
    "no_wlw_ship_fandoms": ships_per_fandom_by_type["no_wlw"].count(),
    "more_than_50%_wlw": ships_per_fandom_by_type["more_than_50%_wlw"].count(),
    "only_wlw_ship_fandoms": ships_per_fandom_by_type["all_wlw"].count(),
    "no_het_ship_fandoms": ships_per_fandom_by_type["no_het"].count(),
    "more_than_50%_het": ships_per_fandom_by_type["more_than_50%_het"].count(),
    "only_het_ship_fandoms": ships_per_fandom_by_type["all_het"].count(),
}
total_gender_combos_series = pd.Series(total_gender_combos_dict)
print(total_gender_combos_series) # TO VISUALISE


no_mlm_ship_fandoms       77
more_than_50%_mlm         14
only_mlm_ship_fandoms     85
no_wlw_ship_fandoms      105
more_than_50%_wlw          6
only_wlw_ship_fandoms     55
no_het_ship_fandoms      163
more_than_50%_het          2
only_het_ship_fandoms     11
dtype: int64


In [39]:
# no, over half and only this type (refactored)

no_half_only_labels = ["mlm ships", "wlw ships", "het ships"]
no_ships_values = total_gender_combos_series.get([
    "no_mlm_ship_fandoms", 
    "no_wlw_ship_fandoms", 
    "no_het_ship_fandoms"
])
over_half_values = total_gender_combos_series.get([
    "more_than_50%_mlm", 
    "more_than_50%_wlw", 
    "more_than_50%_het"
])
only_ships_values = total_gender_combos_series.get([
    "only_mlm_ship_fandoms", 
    "only_wlw_ship_fandoms", 
    "only_het_ship_fandoms"
])

no_half_only_fig = go.Figure(
    data=[
        go.Bar( #no
            x=no_half_only_labels,
            y=no_ships_values,
            text="no",
            marker_color='darkmagenta',
            yaxis='y', 
            offsetgroup=1,
        ),
        go.Bar( #over half
            x=no_half_only_labels,
            y=over_half_values,
            text="over 50%",
            marker_color='indigo',
            yaxis='y2', 
            offsetgroup=2,
        ),
        go.Bar( #only
            x=no_half_only_labels,
            y=only_ships_values,
            text="only",
            marker_color='darkorchid',
            yaxis='y', 
            offsetgroup=3,
        )
    ],
    layout={
        'yaxis': {'title': 'no/only'},
        'yaxis2': {'title': 'over 50%', 'overlaying': 'y', 'side': 'right'}
    }
)

no_half_only_fig.update_layout(
    barmode='group', 
    showlegend=False, 
    title="Fandoms with no, over half, or only ships of this type (AO3 2013-2023)")
no_half_only_fig.show()

In [28]:
#making refactored highest num df

most_wlw = ships_per_fandom_by_type["wlw_ships"].where(ships_per_fandom_by_type["wlw_ships"] > 1).sort_values(ascending=False).dropna()
most_mlm = ships_per_fandom_by_type["mlm_ships"].where(ships_per_fandom_by_type["mlm_ships"] > 1).sort_values(ascending=False).dropna()
most_het = ships_per_fandom_by_type["het_ships"].where(ships_per_fandom_by_type["het_ships"] > 1).sort_values(ascending=False).dropna()

highest_of_type = {
    "highest num of mlm ships": [most_mlm.head(3).values[num] for num in [0,1,2]],
    "highest num of wlw ships": [most_wlw.head(3).values[num] for num in [0,1,2]],
    "highest num of het ships": [most_het.head(3).values[num] for num in [0,1,2]],
    "highest mlm fandom": [list(most_mlm.head(3).index)[num] for num in [0,1,2]],
    "highest wlw fandom": [list(most_wlw.head(3).index)[num] for num in [0,1,2]],
    "highest het fandom": [list(most_het.head(3).index)[num] for num in [0,1,2]],
}
highest_index = ["1st", "2nd", "3rd"]

highest_of_type_df = pd.DataFrame(
    highest_of_type, 
    index=highest_index
)
print(highest_of_type_df)

     highest num of mlm ships  highest num of wlw ships  \
1st                      23.0                      11.0   
2nd                      14.0                       9.0   
3rd                      11.0                       7.0   

     highest num of het ships  highest mlm fandom highest wlw fandom  \
1st                       7.0             Youtube                 DC   
2nd                       6.0              Marvel             Marvel   
3rd                       6.0  Bangtan Boys / BTS    Steven Universe   

                                    highest het fandom  
1st                                         Doctor Who  
2nd  A Song of Ice and Fire / Game of Thrones Universe  
3rd                                             Marvel  


In [40]:
# top 3 per fandom (refactored)

type_labels = ["mlm", "wlw", "het"] 
top_3_values_df = highest_of_type_df.copy().get([
    "highest num of mlm ships", 
    "highest num of wlw ships", 
    "highest num of het ships"
])
top_3_fandoms_df = highest_of_type_df.copy().get([
    "highest mlm fandom", 
    "highest wlw fandom", 
    "highest het fandom"
])

top_3_fandoms_for_ships_by_type_fig = go.Figure(
    data=[
        go.Bar(
            x=type_labels,
            y=top_3_values_df.loc["1st"],
            text=top_3_fandoms_df.loc["1st"], # text that goes on each bar
            marker_color='gold',
        ),
        go.Bar(
            x=type_labels,
            y=top_3_values_df.loc["2nd"],
            text=top_3_fandoms_df.loc["2nd"].mask(
                cond=top_3_fandoms_df.loc["2nd"] == "A Song of Ice and Fire / Game of Thrones Universe", 
                other="GoT"
            ),
            marker_color='slategrey',
        ),
        go.Bar(
            x=type_labels,
            y=top_3_values_df.loc["3rd"],
            text=top_3_fandoms_df.loc["3rd"].mask(
                cond=top_3_fandoms_df.loc["3rd"] == "Bangtan Boys / BTS", 
                other="BTS"
            ).mask(
                cond=(top_3_fandoms_df.loc["3rd"] == "Homestuck") | (top_3_fandoms_df.loc["3rd"] == "Steven Universe"), 
                other="Homestuck & SU"
            ),
            marker_color='chocolate',
        )
    ]
)

top_3_fandoms_for_ships_by_type_fig.update_layout(
    barmode='group', 
    showlegend=False, 
    title="Top 3 fandoms with most ships of this type (AO3 2013-2023)", 
)
top_3_fandoms_for_ships_by_type_fig.show()

In [30]:
#how many mlm vs wlw vs het ships a fandom has on average

average_gender_combo_dict = {
    "ships": ships_per_fandom_by_type["total_ships"].mean().round(2),
    "mlm": ships_per_fandom_by_type["mlm_ships"].mean().round(2),
    "wlw": ships_per_fandom_by_type["wlw_ships"].mean().round(2),
    "hets": ships_per_fandom_by_type["het_ships"].mean().round(2)
}
average_gender_combo_per_fandom_series = pd.Series(average_gender_combo_dict)
print(average_gender_combo_per_fandom_series) # TO VISUALISE

ships    2.87
mlm      1.39
wlw      0.96
hets     0.41
dtype: float64


In [31]:
# visualising average number of ships per fandom

average_ships_per_fandom_fig = px.bar(
    data_frame=average_gender_combo_per_fandom_series.get(["mlm", "wlw", "hets"]),
    title="Average ships of this type per fandom (AO3 2013-2023)",
    text=["mlm", "wlw", "het"],
    labels={
        "index": "",
        "value": "average ships per fandom",
    },
)
average_ships_per_fandom_fig.update_xaxes(
    visible=False
).update_traces(
    marker_color='indianred'
).update_layout(
    showlegend=False,
)
average_ships_per_fandom_fig.show()

In [32]:
#all ships race combo percentages

from re import split

total_race_combo_counts = ships_df.get(["slash_ship","race_combo"])

unique_combos = sorted(list(set(total_race_combo_counts.race_combo)))
rename_dict = {}
for combo in unique_combos:
    sorted_split_version = sorted(split(r"\s\/\s", combo))
    reconcat_version = sorted_split_version[0]
    for item in sorted_split_version[1:]:
        reconcat_version += " / " + item
    if reconcat_version != combo:
        rename_dict[combo] = reconcat_version

total_race_combo_counts = total_race_combo_counts.groupby("race_combo").count().rename(
    index=rename_dict,
    columns={"slash_ship": "count"}
)
total_race_combo_counts = total_race_combo_counts.groupby(
    total_race_combo_counts.index
).aggregate("sum").sort_values(
    by="count", ascending=False
)

print(total_race_combo_counts.head(20)) # TO VISUALISE


                           count
race_combo                      
White                        286
E Asian                       83
Ambig / White                 36
Latin / White                 16
N.H.                          15
E Asian / White               15
Ambig                         14
Black / White                 11
Latin (Multi) / White          9
E Asian (Multi) / White        9
MENA / White                   8
Ambig / N.H.                   8
S Asian / White                6
Af Lat / White                 6
Ambig / E Asian                5
Black (Multi) / White          4
White / White (Multi)          4
E Asian / E Asian (Multi)      4
N.H. / White                   4
SE Asian (Multi) / White       4


In [33]:
#of which how many interracial vs same

interracial_ships = total_race_combo_counts.copy()
interracial_ships["is_interracial_pairing"] = interracial_ships.index.str.contains("/")
interracial_ships["is_ambig"] = interracial_ships.index.str.contains("Ambig")
interracial_ships_counts = pd.Series({
    "same_race_pairings": interracial_ships["count"].where(
        (interracial_ships.is_ambig == False) & (interracial_ships.is_interracial_pairing == False)
    ).aggregate("sum"),
    "interracial_pairings": interracial_ships["count"].where(
        (interracial_ships.is_ambig == False) & (interracial_ships.is_interracial_pairing == True)
    ).aggregate("sum"), 
    "ambiguous_pairings": interracial_ships["count"].where(
        interracial_ships.is_ambig == True
    ).aggregate("sum"), 
})
interracial_ships_counts # TO VISUALISE

same_race_pairings      395.0
interracial_pairings    138.0
ambiguous_pairings       67.0
dtype: float64

In [34]:
#visualising interracial pairings

interracial_labels = ["non-interracial", "interracial", "ambiguous"]
interracial_values = interracial_ships_counts.values

interracial_pie = go.Figure(
    data=[
        go.Pie(
            labels=interracial_labels,
            values=interracial_values,
            textinfo="label+percent",
            textposition="outside",
            marker=dict(
                colors=px.colors.qualitative.Prism
            )
        )
    ]
)
interracial_pie.update_layout(
    title="Interracial vs other ships (AO3 2013-2023)",
    showlegend=False, # if you want it to not show the legend
)
interracial_pie.show()

In [35]:
# and how many do not involve white chars

non_white_ships = total_race_combo_counts.copy()
non_white_ships["contains_white_person"] = non_white_ships.index.str.contains("White")
non_white_ships["contains_e_asian_person"] = non_white_ships.index.str.contains("E Asian")
non_white_ships["contains_ambig_person"] = non_white_ships.index.str.contains("Ambig")
non_white_ships["contains_non_human"] = non_white_ships.index.str.contains("N.H.")

non_white_ships_counts = pd.Series(
    {
        "pairings_with_white_people": non_white_ships["count"].where(
            non_white_ships.contains_white_person == True
        ).aggregate("sum"), 
        "pairings_with_east_asian_people": non_white_ships["count"].where(
            non_white_ships.contains_e_asian_person == True
        ).aggregate("sum"), 
        "non_white_pairings": non_white_ships["count"].where(
            (non_white_ships.contains_ambig_person == False) & (
            non_white_ships.contains_non_human == False) & (
            non_white_ships.contains_white_person == False)
        ).aggregate("sum"),
        "non_white_or_east_asian_pairings": non_white_ships["count"].where(
            (non_white_ships.contains_ambig_person == False) & (
            non_white_ships.contains_non_human == False) & (
            non_white_ships.contains_white_person == False) & (
            non_white_ships.contains_e_asian_person == False)
        ).aggregate("sum"),
    }
)

non_white_ships_counts # this one's a big oof  # TO VISUALISE

pairings_with_white_people          440.0
pairings_with_east_asian_people     137.0
non_white_pairings                  113.0
non_white_or_east_asian_pairings     14.0
dtype: float64

In [36]:
# visualising white east asian and other-poc pairings

non_white_ships_fig = px.bar(
    data_frame=non_white_ships_counts,
    title="Pairings with and without white and east asian characters (AO3 2013-2023)",
    text=["involve white ppl", "involve east asians", "non-white ships", "non-white & non-EA"],
    labels={
        "index": "characters involved",
        "value": "no of ships",
    }
)
non_white_ships_fig.update_traces(
    marker_color='green' # update colour here -> for all bars tho
).update_layout(
    showlegend=False,
).update_xaxes(
    visible=False, # to hide bottom axis annotations
)
non_white_ships_fig.show()

In [37]:
#how much rpf vs fictional

rpf_vs_fic_df = ships_df.get(
    ["slash_ship", "rpf_or_fic"]
).groupby("rpf_or_fic").count().rename(columns={"slash_ship": "count"})
rpf_vs_fic_df["count"] # TO VISUALISE

rpf_or_fic
RPF           78
fictional    522
Name: count, dtype: int64

In [38]:
#visualising rpf vs fic

rpf_pie = go.Figure(
    data=[
        go.Pie(
            labels=rpf_vs_fic_df["count"].index,
            values=rpf_vs_fic_df["count"].values,
            textinfo="label+percent",
            textposition="inside",
            marker=dict(
                colors=["deeppink", "purple", ]
            )
        )
    ]
)
rpf_pie.update_layout(
    title="Real Person Fic vs Fictional Ships (AO3 2013-2023)",
    showlegend=False, # if you want it to not show the legend
)
rpf_pie.show()