In [1]:
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
from operator import itemgetter

import pandas as pd
import numpy as np
import copy
import json
import os

In [2]:
# Input locations, relative to the notebook's working directory.
DATA_PATH = "../data/annotated"
TAXONOMIES_PATH = "../data/taxonomies"

# Bare file names found in each directory (order is whatever os.listdir returns).
datasets = os.listdir(DATA_PATH)
taxonomies = os.listdir(TAXONOMIES_PATH)

# A taxonomy's name is its file name up to (not including) the first ".".
taxonomies_names = [tax_file.partition(".")[0] for tax_file in taxonomies]

In [3]:
# taxonomy name -> {class label: 0.0}, with one class label per line of the
# taxonomy file, kept in file order.
taxonomy_schemas = {}
for entry in os.scandir(TAXONOMIES_PATH):
    with open(entry, "r") as schema_file:
        class_labels = (raw_line.strip("\n") for raw_line in schema_file.readlines())
        taxonomy_schema = dict.fromkeys(class_labels, float(0))
    # Key by the file name up to the first ".".
    taxonomy_schemas[entry.name.split(".")[0]] = taxonomy_schema

In [4]:
# Label values that denote the positive class across the annotated datasets.
# The last entry is the boolean True (not the string "true").
pos_labels = [
    "contains-bias",
    "clickbait",
    "false",
    "fake",
    "has_propaganda",
    "yes",
    "contains_false",
    True,
]

In [15]:
# For every annotated dataset, print its name, its (rows, columns) shape and
# the number of rows per value of the "labels" column.
for dataset in datasets:
    dataset_name = dataset.partition(".")[0]
    df = pd.read_csv(f"{DATA_PATH}{os.sep}{dataset}")
    label_counts = df["labels"].value_counts().to_dict()
    print(dataset_name, df.shape, label_counts)

buzzfeed (170, 8) {'real': 90, 'fake': 80}
twittercovidq2 (260, 8) {'no_false': 223, 'contains_false': 37}
twittercovidq1 (504, 8) {'yes': 305, 'no': 199}
clickbait (19038, 8) {'no-clickbait': 14720, 'clickbait': 4318}
webis (1604, 8) {'true': 1249, 'fake': 355}
politifact (202, 8) {'real': 111, 'fake': 91}
propaganda (1594, 8) {'has_propaganda': 816, 'no_propaganda': 778}
pheme (1685, 8) {True: 1058, False: 627}
basil (7959, 8) {'no-bias': 6335, 'contains-bias': 1624}


In [5]:
# Mean score of every taxonomy class, per dataset.
# Result: dataset2tax[dataset name][taxonomy name] -> OrderedDict(class -> mean),
# ordered from the highest mean to the lowest.
dataset2tax = {}
for dataset in datasets:
    dataset_name = dataset.split(".")[0]
    # Means are taken over all rows: no scaling and no label-based filtering
    # is applied before averaging.
    df = pd.read_csv(f"{DATA_PATH}{os.sep}{dataset}")[taxonomies_names]
    schemas_deepcopy = copy.deepcopy(taxonomy_schemas)
    for taxonomy_name in taxonomies_names:
        # Each cell is parsed with json.loads (presumably a JSON list of
        # numbers - TODO confirm) and the rows are stacked into one array.
        col_array = np.vstack(df[taxonomy_name].apply(json.loads).values)
        col_mean = col_array.mean(axis=0)

        # Means are paired with class names by position; zip stops at the
        # shorter of the two, so unmatched classes keep their initial 0.0.
        class_scores = schemas_deepcopy[taxonomy_name]
        class_scores.update(zip(list(class_scores), col_mean))

        ranked = sorted(class_scores.items(), key=lambda pair: pair[1], reverse=True)
        schemas_deepcopy[taxonomy_name] = OrderedDict(ranked)
        dataset2tax[dataset_name] = schemas_deepcopy

In [6]:
# For every dataset/taxonomy pair, print the three classes with the highest
# mean score as a LaTeX fragment: "name (share\%) \\ name (share\%) \\ ...".
# A class whose share is zero is printed as "-".
#
# The share is the class mean divided by the sum of all class means of the
# taxonomy. NOTE(review): for taxonomies whose means can be negative (the
# sentiment output contains a negative share) this is not a true percentage.

# Cell separator as emitted at runtime: space, two backslashes, space.
# Written with explicit escapes; the former " \\\ " relied on an invalid
# escape sequence, which newer Python versions warn about.
print_slash = " \\\\ "

for dataset_name, dataset_taxs in dataset2tax.items():
    for taxonomy_name, taxonomy_class in dataset_taxs.items():
        print(f"dataset_name: {dataset_name}, taxonomy_name: {taxonomy_name}")
        class_values_sum = sum(taxonomy_class.values())
        top3 = list(taxonomy_class.items())[:3]
        top_list = []
        for class_name, class_value in top3:
            # An all-zero taxonomy has no defined shares; emit the placeholder
            # instead of dividing by zero (which would print nan/inf).
            if class_values_sum == 0:
                top_list.append("-")
                continue
            class_value_freq = class_value * 100 / class_values_sum
            if class_value_freq != 0:
                top_list.append(f"{class_name} ({round(class_value_freq, 2)}\\%)")
            else:
                top_list.append("-")

        print(f"{print_slash.join(top_list)} ")

        print("\n")

dataset_name: buzzfeed, taxonomy_name: radnarr_prj
Legitimacy of ideology (100.0\%) \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: hate_speech
Personal Insult (50.0\%) \\ Racism (24.1\%) \\ Threat and Violence (24.1\%) 


dataset_name: buzzfeed, taxonomy_name: sentiment
positivity (-5.89\%) \\ overall (50.0\%) \\ negativity (55.89\%) 


dataset_name: buzzfeed, taxonomy_name: behavioral_traits_en
Extremism (20.5\%) \\ Unlawfulness (10.93\%) \\ Discrimination (9.68\%) 


dataset_name: buzzfeed, taxonomy_name: emotional_traits_en
Hatred (20.21\%) \\ Anger (10.45\%) \\ Disgust (10.45\%) 


dataset_name: buzzfeed, taxonomy_name: writeprint
Gulpease (32.55\%) \\ charactersPerSentence (27.48\%) \\ atomsPerSentence (5.41\%) 


dataset_name: twittercovidq2, taxonomy_name: radnarr_prj
Legitimacy of ideology (70.59\%) \\ Homophily (29.41\%) \\ - 


dataset_name: twittercovidq2, taxonomy_name: hate_speech
Threat and Violence (29.59\%) \\ Personal Insult (26.53\%) \\ Body Shaming (18.37\%) 




In [7]:
# Print mean scores rounded to one decimal for two taxonomies only:
#   - "sentiment": every class
#   - "writeprint": only the three readability indices listed below
# All other taxonomies are skipped silently.
for dataset_name, dataset_taxs in dataset2tax.items():
    for taxonomy_name, taxonomy_class in dataset_taxs.items():
        top_list = []
        if taxonomy_name not in ("sentiment", "writeprint"):
            continue

        print(f"dataset_name: {dataset_name}, taxonomy_name: {taxonomy_name}")
        for class_name, class_mean in taxonomy_class.items():
            keep = taxonomy_name == "sentiment" or class_name in [
                "Coleman-Liau",
                "Gulpease",
                "Automated Readability",
            ]
            if keep:
                top_list.append((class_name, round(class_mean, 1)))
        print(top_list)
        print("\n")

dataset_name: buzzfeed, taxonomy_name: sentiment
[('positivity', 1.6), ('overall', -13.9), ('negativity', -15.5)]


dataset_name: buzzfeed, taxonomy_name: writeprint
[('Gulpease', 72.4), ('Coleman-Liau', 9.5), ('Automated Readability', 6.7)]


dataset_name: twittercovidq2, taxonomy_name: sentiment
[('positivity', 2.3), ('overall', -6.4), ('negativity', -8.7)]


dataset_name: twittercovidq2, taxonomy_name: writeprint
[('Gulpease', 61.3), ('Coleman-Liau', 11.5), ('Automated Readability', 10.9)]


dataset_name: twittercovidq1, taxonomy_name: sentiment
[('positivity', 2.4), ('overall', -6.4), ('negativity', -8.8)]


dataset_name: twittercovidq1, taxonomy_name: writeprint
[('Gulpease', 63.7), ('Coleman-Liau', 11.5), ('Automated Readability', 10.7)]


dataset_name: clickbait, taxonomy_name: sentiment
[('positivity', 4.1), ('overall', -3.8), ('negativity', -7.9)]


dataset_name: clickbait, taxonomy_name: writeprint
[('Gulpease', 69.5), ('Coleman-Liau', 9.9), ('Automated Readability', 7.4)]




In [125]:
# Per dataset and per taxonomy column, run a univariate F-test
# (sklearn's f_regression) of every taxonomy class against the binary label
# and keep the classes with p < 0.05.
# Result: dataset2pval[dataset name][taxonomy name] -> OrderedDict(
#     class -> p-value rounded to 2 decimals), most significant first.

# String entries of pos_labels are matched case-insensitively. Boolean entries
# are matched only against boolean labels, so a dataset labelled True/False
# is handled without turning the string label "true" into a positive.
pos_label_strings = {p.lower() for p in pos_labels if isinstance(p, str)}
pos_label_bools = {p for p in pos_labels if isinstance(p, bool)}


def _is_positive_label(label):
    """Return True when `label` is one of the positive labels in `pos_labels`."""
    if isinstance(label, (bool, np.bool_)):
        return bool(label) in pos_label_bools
    return str(label).lower() in pos_label_strings


dataset2pval = {}
for dataset in datasets:
    dataset_name = dataset.split(".")[0]
    df = pd.read_csv(f"{DATA_PATH}{os.sep}{dataset}")

    # Label <-> id lookups (1 = positive, 0 = negative), built with the same
    # predicate that produces the regression target below.
    labels = set(df["labels"].to_list())
    id2label = {int(_is_positive_label(label)): label for label in labels}
    label2id = {label: label_id for label_id, label in id2label.items()}

    # Binary target taken from the "labels" column by name, so it cannot
    # drift from the column used to collect the label set above.
    target = df["labels"].map(_is_positive_label).astype(int).values

    selected_features = {}
    # Columns from index 2 onward are treated as taxonomy columns holding
    # JSON-encoded score vectors - NOTE(review): confirm the column layout.
    for taxonomy_name in df.columns[2:]:
        col_array = np.vstack(df[taxonomy_name].apply(json.loads).values)
        _, pvals = f_regression(col_array, target)

        # Class names are paired with p-values by position in the schema.
        significant = [
            (class_name, pval)
            for class_name, pval in zip(taxonomy_schemas[taxonomy_name], pvals)
            if pval < 0.05  # NaN p-values compare False and are dropped
        ]
        # Rank on the exact p-values and round only for storage; ranking on
        # rounded values would leave ties (e.g. several 0.0) in schema order.
        significant.sort(key=itemgetter(1))
        selected_features[taxonomy_name] = OrderedDict(
            (class_name, round(pval, 2)) for class_name, pval in significant
        )

    dataset2pval[dataset_name] = selected_features

In [128]:
# For every dataset/taxonomy pair, print up to three significant classes with
# their stored p-values as a LaTeX fragment: "name (p) \\ name (p) \\ ...".
# A taxonomy with no significant class prints three "-" placeholders.

# Runtime text is unchanged: the separator is space, two backslashes, space.
# Both literals use explicit escapes; the former " \\\ " spelling relied on
# an invalid escape sequence, which newer Python versions warn about.
print_slash = " \\\\ "
empty_row = " - \\\\ - \\\\ - "

for dataset_name, dataset_taxs in dataset2pval.items():
    for taxonomy_name, taxonomy_class in dataset_taxs.items():
        print(f"dataset_name: {dataset_name}, taxonomy_name: {taxonomy_name}")
        top3 = list(taxonomy_class.items())[:3]
        top_list = [f"{class_name} ({class_value})" for class_name, class_value in top3]
        if not top_list:
            print(empty_row)
        else:
            print(f"{print_slash.join(top_list)} ")

        print("\n")

dataset_name: buzzfeed, taxonomy_name: radnarr_prj
 - \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: hate_speech
 - \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: sentiment
 - \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: behavioral_traits_en
 - \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: emotional_traits_en
 - \\ - \\ - 


dataset_name: buzzfeed, taxonomy_name: writeprint
colonsPerSentence (0.0) \\ exclamationMarksPerSentence (0.0) \\ sentences (0.0) 


dataset_name: twittercovidq2, taxonomy_name: radnarr_prj
 - \\ - \\ - 


dataset_name: twittercovidq2, taxonomy_name: hate_speech
 - \\ - \\ - 


dataset_name: twittercovidq2, taxonomy_name: sentiment
 - \\ - \\ - 


dataset_name: twittercovidq2, taxonomy_name: behavioral_traits_en
Sexuality (0.01) \\ Unawareness (0.02) \\ Discrimination (0.04) 


dataset_name: twittercovidq2, taxonomy_name: emotional_traits_en
Anger (0.01) \\ Anxiety (0.01) \\ Worry (0.01) 


dataset_name: twittercovidq2, taxonomy_name: writepr