In [None]:
import os
import plotly.io as pio
import pandas as pd
import plotly.express as px

from collections import Counter

In [7]:
DATA_DIR = "data/"

## Datasets

In [8]:
def load_dataset(dataset: str) -> pd.DataFrame:

    # Load original dataframe
    data_df = pd.read_csv(
        os.path.join(
            DATA_DIR,
            f"{dataset}/{dataset}_train.csv"
        )
    )

    # Add dataset columns
    data_df["Dataset"] = dataset

    return data_df


def load_train_test_dataset(dataset: str) -> pd.DataFrame:

    # Load original dataframe
    train_df = pd.read_csv(
        os.path.join(
            DATA_DIR,
            f"{dataset}/{dataset}_train.csv"
        )
    )

    test_df = pd.read_csv(
        os.path.join(
            DATA_DIR,
            f"{dataset}/{dataset}_test.csv"
        )
    )

    # Add dataset columns
    train_df["Split"] = "train"
    test_df["Split"] = "test"

    train_df["Dataset"] = dataset
    test_df["Dataset"] = dataset

    # Combine train and test splits
    data_df = pd.concat(
        [train_df, test_df],
        axis=0,
        ignore_index=True
    )

    return data_df

In [9]:
data_df = pd.DataFrame()

dataset_list = (
    "basil",
    "buzzfeed",
    "clef22",
    "clickbait",
    "fingerprints",
    "pheme",
    "politifact",
    "propaganda",
    "shadesoftruth",
    "twittercovid",
    "webis"
)

for dataset in dataset_list:

    # Load datasets that contain train and test splits
    if dataset in ("clef22", "fingerprints", "shadesoftruth"):
        data_df = pd.concat(
            [data_df, load_train_test_dataset(dataset)],
            axis=0,
            ignore_index=True
        )

    # Load datasets without splits
    else:
        data_df = pd.concat(
            [data_df, load_dataset(dataset)],
            axis=0,
            ignore_index=True
        )

data_df["text"] = data_df["text"].astype(str)

data_df

Unnamed: 0,text,labels,Dataset,Split,id
0,Rep. Michael Grimm (R-N.Y.) is expected to be ...,no-bias,basil,,
1,Politico was first to report the news of the e...,no-bias,basil,,
2,"Grimm's lawyer, William McGinley, said in a st...",no-bias,basil,,
3,"""We are disappointed by the government’s decis...",contains-bias,basil,,
4,"""From the beginning, the government has pursue...",contains-bias,basil,,
...,...,...,...,...,...
57974,The shooter responsible for injuring nine peop...,true,webis,,
57975,"On the topic of climate change, Hillary Clinto...",true,webis,,
57976,A poll of voters nationwide and focus groups i...,true,webis,,
57977,Three of the Baltimore police officers previou...,true,webis,,


### Statistics

In [10]:
stats_df = data_df.copy()

stats_df['title_length'] = stats_df['text'].apply(lambda x: len(x))
stats_df['word_count'] = stats_df["text"].apply(lambda x: len(str(x).split(" ")))
stats_df['char_count'] = stats_df["text"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))
stats_df['sentence_count'] = stats_df["text"].apply(lambda x: len(str(x).split(".")))
stats_df['avg_word_length'] = stats_df['char_count'] / stats_df['word_count']
stats_df['avg_sentence_lenght'] = stats_df['word_count'] / stats_df['sentence_count']

if "id" in stats_df.columns:
    stats_df = stats_df\
        .drop("id", axis=1)\
        .groupby(["Dataset"], as_index=False)\
        .median()

stats_df

  .median()


Unnamed: 0,Dataset,title_length,word_count,char_count,sentence_count,avg_word_length,avg_sentence_lenght
0,basil,137.0,23.0,116.0,2.0,5.086957,9.5
1,buzzfeed,64.0,11.0,55.0,1.0,5.190909,10.0
2,clef22,3035.0,504.0,2530.0,25.0,5.019697,19.333333
3,clickbait,72.0,12.0,61.0,1.0,5.166667,10.5
4,fingerprints,2664.0,433.0,2233.0,23.0,5.204743,18.611111
5,pheme,125.0,16.0,109.0,3.0,6.5,6.0
6,politifact,2088.0,350.0,1746.0,18.0,5.116129,18.84
7,propaganda,140.0,23.0,118.0,1.0,5.058824,23.0
8,shadesoftruth,1637.0,276.0,1357.0,16.0,5.047114,18.914216
9,twittercovid,257.5,40.0,214.0,4.0,5.287302,9.633333


In [11]:
stats_df_long = stats_df.melt(
    id_vars="Dataset",
    var_name="Stat",
    value_name="Value",
    value_vars=stats_df.columns[1:]
)

stats_df_long

Unnamed: 0,Dataset,Stat,Value
0,basil,title_length,137.000000
1,buzzfeed,title_length,64.000000
2,clef22,title_length,3035.000000
3,clickbait,title_length,72.000000
4,fingerprints,title_length,2664.000000
...,...,...,...
61,politifact,avg_sentence_lenght,18.840000
62,propaganda,avg_sentence_lenght,23.000000
63,shadesoftruth,avg_sentence_lenght,18.914216
64,twittercovid,avg_sentence_lenght,9.633333


In [None]:
# Format for plotting
stats_df_long["Dataset"] = stats_df_long["Dataset"].str.capitalize()
stats_df_long["Stat"] = stats_df_long["Stat"].str.replace("_", " ").str.capitalize()

fig = px.bar(
    data_frame=stats_df_long,
    x="Stat",
    y="Value",
    color="Dataset",
    barmode="group",
    text="Value",
    log_y=True
)

fig.update_layout(xaxis_title=None,xaxis={'categoryorder':'total descending'},showlegend=True, legend=dict(orientation="h",   # show entries horizontally
                     xanchor = "center",  # use center of legend as anchor
                     x = .5))

fig.show()

pio.write_image(
     fig=fig,
    file="stats_data.pdf",
    width=1.5*600,
    height=0.75*400
 )
 

## Bias dimensions (identity markers)

In [13]:
BIAS_DATA_DIR = "../../unqover/word_lists/nouns/subjects/"

### Get subjects

In [14]:
def get_and_clean_content(
    content: list,
    pattern: str
) -> list:
    return [
        line.replace(pattern, "").strip("\n").strip(" ")
        for line in content
        if line.startswith(pattern)
    ]


def get_subjects(subjects_list: list) -> pd.DataFrame:
    all_subjects_df = pd.DataFrame()

    for subject in subjects_list:
        with open(os.path.join(BIAS_DATA_DIR, subject), mode="r") as file:
            content = file.readlines()
            content_df = pd.DataFrame({
                "Subject": get_and_clean_content(content, pattern="[subj]")
            })

        content_df["Type"] = subject.capitalize()

        # Add nationality while reading country subjects
        if subject == "country":
            nationality_df = pd.DataFrame({
                "Subject": get_and_clean_content(content, pattern="[subj:dem]")
            })
            nationality_df["Type"] = "Nationality"

            content_df = pd.concat(
                [content_df, nationality_df],
                axis=0,
                ignore_index=True
            )

        all_subjects_df = pd.concat(
            [
                all_subjects_df,
                pd.DataFrame(content_df)
            ],
            axis=0,
            ignore_index=True
        )

    return all_subjects_df

In [15]:
subjects_list = [
    "country",
    "ethnicity",
    "religion"
]

all_subjects_df = get_subjects(subjects_list)
all_subjects_df

Unnamed: 0,Subject,Type
0,Afghanistan,Country
1,America,Country
2,Australia,Country
3,Bangladesh,Country
4,Belgium,Country
...,...,...
158,Mormon,Religion
159,Protestant,Religion
160,Orthodox,Religion
161,Catholic,Religion


### Count subjects

In [16]:
subject_counts_df = pd.DataFrame()

for dataset in data_df["Dataset"].unique():

    dataset_df = data_df[data_df["Dataset"] == dataset].copy()

    # Get counter after splitting by words and exploding
    all_counts = Counter(
        dataset_df["text"]\
            .str.lower()\
            .str.split(" ")\
            .explode()
    )

    # Get counts per subject and 0.0 if not present
    for subject in all_subjects_df["Subject"]:
        subject_counts_df = pd.concat(
            [
                subject_counts_df,
                pd.DataFrame({
                    "Subject": [subject],
                    "Count": all_counts.get(subject.lower(), 0.0),
                    "Dataset": [dataset]
                })
            ],
            axis=0,
            ignore_index=True
        )

# Merge subject type
subject_counts_df = pd.merge(
    left=subject_counts_df,
    right=all_subjects_df,
    on="Subject",
    how="left"
)

subject_counts_df = subject_counts_df.sort_values(["Dataset", "Count"], ascending=False)
subject_counts_df

Unnamed: 0,Subject,Count,Dataset,Type
1792,Black,940.0,webis,Ethnicity
1794,White,771.0,webis,Ethnicity
1721,American,707.0,webis,Nationality
1651,America,372.0,webis,Country
1782,Syrian,210.0,webis,Nationality
...,...,...,...,...
157,Buddhist,0.0,basil,Religion
159,Sikh,0.0,basil,Religion
161,Protestant,0.0,basil,Religion
162,Orthodox,0.0,basil,Religion


In [None]:
NER_THR = 60

all_ner_df = pd.DataFrame()

dataset_list = (
    "basil",
    "buzzfeed",
    "clef22",
    "clickbait",
    "fingerprints",
    "pheme",
    "politifact",
    "propaganda",
    "shadesoftruth",
    "twittercovid",
    "webis"
)

for dataset in dataset_list:

    # Load NER file
    ner_df = pd.read_csv(
        os.path.join(
            DATA_DIR,
            f"{dataset}/{dataset}_ner.csv"
        )
    )

    # Format for merging
    ner_df = ner_df\
        .rename(columns={
            "NERS": "Subject",
            "NERS label": "Type",
            "counts":"Count"
        })

    ner_df["Type"] = ner_df["Type"].str.capitalize()
    ner_df["Dataset"] = dataset

    ner_df = ner_df[["Subject", "Count", "Dataset", "Type"]]

    all_ner_df = pd.concat(
        [all_ner_df, ner_df],
        axis=0,
        ignore_index=True
    )

all_ner_df

Unnamed: 0,Subject,Count,Dataset,Type
0,Donald Trump,503,basil,Person
1,Barack Obama,323,basil,Person
2,Hillary Clinton,323,basil,Person
3,Joe Biden,72,basil,Person
4,Nancy Pelosi,85,basil,Person
...,...,...,...,...
138,Bernie Sanders,92,webis,Person
139,Mike Pence,55,webis,Person
140,Rudy Giuliani,96,webis,Person
141,Elizabeth Warren,98,webis,Person


In [18]:
# Merge subjects with NERs
ner_subject_df = pd.concat(
    [all_ner_df, subject_counts_df],
    axis=0,
    ignore_index=True
)

ner_subject_df

Unnamed: 0,Subject,Count,Dataset,Type
0,Donald Trump,503.0,basil,Person
1,Barack Obama,323.0,basil,Person
2,Hillary Clinton,323.0,basil,Person
3,Joe Biden,72.0,basil,Person
4,Nancy Pelosi,85.0,basil,Person
...,...,...,...,...
1953,Buddhist,0.0,basil,Religion
1954,Sikh,0.0,basil,Religion
1955,Protestant,0.0,basil,Religion
1956,Orthodox,0.0,basil,Religion


In [19]:
ner_subject_df["Proportion"] = \
    ner_subject_df["Count"] / ner_subject_df.groupby(["Dataset", "Type"])["Count"].transform('sum')

ner_subject_df

Unnamed: 0,Subject,Count,Dataset,Type,Proportion
0,Donald Trump,503.0,basil,Person,0.297457
1,Barack Obama,323.0,basil,Person,0.191011
2,Hillary Clinton,323.0,basil,Person,0.191011
3,Joe Biden,72.0,basil,Person,0.042578
4,Nancy Pelosi,85.0,basil,Person,0.050266
...,...,...,...,...,...
1953,Buddhist,0.0,basil,Religion,0.000000
1954,Sikh,0.0,basil,Religion,0.000000
1955,Protestant,0.0,basil,Religion,0.000000
1956,Orthodox,0.0,basil,Religion,0.000000


In [20]:
# Apply threshold to proportion
ner_subject_df = ner_subject_df[ner_subject_df["Proportion"] > 0.02]

In [38]:
for num_var in ("Count", "Proportion"):

    for data_type in ner_subject_df["Type"].unique():
        type_df = ner_subject_df[ner_subject_df["Type"] == data_type].copy()

        # Sort by count
        type_df = type_df.sort_values(num_var)

        # Get category orders as sum across datasets
        category_orders = type_df\
            .groupby(["Subject", "Type"], as_index=False)\
            .sum(num_var)\
            .sort_values(num_var, ascending=False)["Subject"].values.tolist()

        fig = px.bar(
            data_frame=type_df,
            x="Subject",
            y=num_var,
            color="Dataset",
            barmode="stack",
            color_discrete_sequence=px.colors.qualitative.Pastel,
            category_orders={
                "Subject": category_orders,
                "Dataset": dataset_list
            },
            width=900,
            height=500
        )
        fig.update_layout(xaxis={"title": data_type})
        fig.show()

        pio.write_image(
            fig=fig,
            file=f"plots/{data_type}_barplot_{num_var}.pdf",
            width=1.5*600,
            height=0.75*1000
        )

        # ---------------------------------------------------------------------- #

        # Calculate mean and standard distribution
        mean_df = type_df\
            .groupby("Subject")[num_var].mean()\
            .reset_index(drop=False)\
            .rename(columns={num_var: "Mean"})
        std_df = type_df\
            .groupby("Subject")[num_var].std()\
            .reset_index(drop=False)\
            .rename(columns={num_var: "Std"})

        # Fill NaN with zeroes
        mean_df["Mean"] = mean_df["Mean"].fillna(0)
        std_df["Std"] = std_df["Std"].fillna(0)

        distribution_df = pd.merge(
            left=mean_df,
            right=std_df,
            on="Subject",
            how="outer"
        )

        # Calculate the number of datasets containing the subject
        num_data_df = type_df\
            .groupby("Subject")["Dataset"].count()\
            .reset_index(drop=False)\
            .rename(columns={"Dataset": "Nº datasets"})

        distribution_df = pd.merge(
            left=distribution_df,
            right=num_data_df,
            on="Subject",
            how="outer"
        )

        fig = px.bar(
            data_frame=distribution_df,
            x="Subject",
            y="Mean",
            color="Nº datasets",
            barmode="stack",
            error_y="Std",
            category_orders={
                "Subject": category_orders
            },
            width=900,
            height=500
        )
        fig.update_layout(xaxis={"title": data_type})
        fig.show()

        pio.write_image(
            fig=fig,
            file=f"plots/{data_type}_barplot_error_{num_var}.pdf",
            width=1.5*600,
            height=0.75*1000
        )


In [None]:
for dataset in dataset_list:

    dataset_ner_subject_df = ner_subject_df[
        ner_subject_df["Dataset"] == dataset
    ].copy()

    fig = px.scatter(
        dataset_ner_subject_df,
        y="Count",
        size="Count",
        color="Subject",
        facet_col="Type",
        log_y=True,
        log_x=True,
        size_max=60,
        text=[
            '{} {:.2}'.format(subject, proportion)
            for subject, proportion in zip(
                dataset_ner_subject_df["Subject"],
                dataset_ner_subject_df["Proportion"]
            )
        ]
    )

    fig.update_layout(
        showlegend=False,
        legend=dict(
            xanchor="left", # show entries horizontally
            x=1             # use center of legend as anchor
        ),
        width=1200,
        height=1200
    )

    fig.show()

    pio.write_image(
        fig=fig,
        file=f"plots/{dataset}_numbers.pdf",
        width=1.5*600,
        height=0.75*1000
    )
