# Annotate datasets

In [1]:
# Imports

from emoji import demojize
from tqdm import tqdm

import pandas as pd
import getpass
import requests
import json
import copy
import os


# Constants

# Endpoint that platform_call() POSTs each text to.
PLATFORM_PATH = "https://playground.api.expertcustomers.ai/api/v1/runtime/workflow/413984e9-5955-4133-920a-4e8aa90c406f/action/analyze"
# API key, asked for interactively (input is not echoed) and sent as the
# `x-api-key` request header.
X_API_KEY = getpass.getpass("Insert expert.ai x_api_key: ")
# Directory scanned for taxonomy files (one taxonomy schema per file).
TAXONOMIES_PATH = "../data/raw/taxonomies/"
# Directory the preprocessed dataset CSVs are read from.
PREPROCESSED_DATA_PATH = "../data/processed/"
# Directory the annotated dataset CSVs are written to.
PROCESSED_ANNOTATED_DATA_PATH = "../data/processed/annotated/"

In [2]:
def platform_call(text, timeout=60):
    """Send ``text`` to the analysis endpoint and return the parsed JSON reply.

    Emojis in ``text`` are converted with ``demojize`` before the request is
    sent.

    Args:
        text: The raw text to analyze.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        'Content-Type': 'application/json; charset=utf-8',
        'x-api-key': X_API_KEY
    }
    payload = json.dumps({"text": demojize(text)})
    response = requests.post(PLATFORM_PATH, headers=headers, data=payload, timeout=timeout)
    # Fail explicitly on HTTP errors instead of handing an error payload to
    # the downstream parsers.
    response.raise_for_status()
    return response.json()

In [3]:
# Build one zero-initialised score template per taxonomy file: the file name
# (up to its first dot) names the taxonomy and every line becomes a label
# mapped to 0.0.
taxonomy_schemas = {}
# Use scandir as a context manager so the directory handle is always released.
with os.scandir(TAXONOMIES_PATH) as entries:
    for filename in entries:
        if not filename.is_file():
            # Sub-directories cannot be opened as taxonomy files.
            continue
        with open(filename, "r") as fi:
            taxonomy_schema = {line.strip("\n"): float(0) for line in fi}
        taxonomy_schemas[filename.name.split(".")[0]] = taxonomy_schema

In [5]:
def process_stylo_out(output):
    """Flatten the readability and structure indexes of a document into one dict.

    Reads the first node of ``output["extraData"]["JSON-LD"]["@graph"]``.
    Readability indexes are keyed by their name with all whitespace removed;
    structure indexes are keyed as-is and converted to ``float``, taking the
    ``mean`` entry when present and the ``total`` entry otherwise.
    """
    first_node = output.get("extraData").get('JSON-LD').get('@graph')[0]

    features = {}
    for index in first_node.get('readabilityIndexes'):
        compact_name = ''.join(index['name'].split())
        features[compact_name] = index['value']

    structure_features = {}
    for key, stats in first_node.get('structureIndexes').items():
        structure_features[key] = float(stats.get('mean', stats.get('total')))

    # Structure indexes win on a key clash, as they are merged last.
    features.update(structure_features)
    return features

In [6]:
def process_senti_out(output):
    """Return the sentiment fields of ``output`` without the ``"items"`` entry.

    The ``"items"`` entry is always dropped, even when it is an empty list,
    so the returned dict holds the same keys for every document. The input is
    left untouched: a new dict is returned instead of popping from the
    caller's response.

    Args:
        output: A document dict that holds a ``"sentiment"`` mapping.

    Returns:
        A new dict with every sentiment entry except ``"items"``.
    """
    sentiment = output.get("sentiment")
    return {key: value for key, value in sentiment.items() if key != "items"}

In [7]:
def process_output(output, taxonomy_schemas):
    """Fill a fresh copy of the taxonomy templates with the scores in ``output``.

    For every annotation in ``output`` each document category overwrites the
    score stored under its namespace and label. When the document carries
    ``extraData`` or ``sentiment``, their processed form is stored under the
    ``"writeprint"`` and ``"sentiment"`` keys. ``taxonomy_schemas`` itself is
    never modified.
    """
    filled = copy.deepcopy(taxonomy_schemas)
    for annotation in output.values():
        document = annotation["document"]
        for category in document["categories"]:
            filled[category["namespace"]][category["label"]] = float(category["score"])
        if document.get("extraData"):
            filled["writeprint"] = process_stylo_out(document)
        if document.get("sentiment"):
            filled["sentiment"] = process_senti_out(document)
    return filled

In [8]:
def get_annotations(text, taxonomy_schemas):
    """Annotate ``text`` remotely and return the filled taxonomy templates."""
    return process_output(platform_call(text), taxonomy_schemas)

In [9]:
def get_annotation_vectors(text, taxonomy_schemas):
    """Return the annotations of ``text`` as one list of values per group.

    Each key of the annotation result is kept; its mapping is reduced to the
    list of its values, in the mapping's own order.
    """
    vectors = {}
    for name, scores in get_annotations(text, taxonomy_schemas).items():
        vectors[name] = list(scores.values())
    return vectors

## Webis

In [None]:
# Load the preprocessed Webis dataset as a list of row dicts, so every row
# can be extended in place with its annotation vectors.
webis_df = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, "webis.csv"))
df2list = webis_df.to_dict(orient="records")

In [None]:
# Annotate every row in place. A row whose annotation fails keeps only its
# original fields and is dropped by the field-count filter afterwards.
for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Catch Exception rather than using a bare `except:` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        continue

In [None]:
# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [row for row in df2list if len(row) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

In [None]:
# Turn the annotated row dicts back into a dataframe (one column per key).
webis_annotated_df = pd.DataFrame(filtered_df2list)

In [None]:
# Write the annotated Webis rows to the annotated-data directory.
webis_annotated_df.to_csv(
    os.path.join(PROCESSED_ANNOTATED_DATA_PATH, "webis.csv"),
    index=False,
    header=True,
    encoding="utf-8",
)

## Basil

In [None]:
# Load the preprocessed Basil dataset as a list of row dicts, so every row
# can be extended in place with its annotation vectors.
basil_df = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, "basil.csv"))
df2list = basil_df.to_dict(orient="records")

In [None]:
# Annotate every row in place. A row whose annotation fails keeps only its
# original fields and is dropped by the field-count filter afterwards.
for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Catch Exception rather than using a bare `except:` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        continue

In [None]:
# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [row for row in df2list if len(row) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

In [None]:
# Turn the annotated row dicts back into a dataframe (one column per key).
basil_annotated_df = pd.DataFrame(filtered_df2list)

In [None]:
# Write the annotated Basil rows to the annotated-data directory.
basil_annotated_df.to_csv(
    os.path.join(PROCESSED_ANNOTATED_DATA_PATH, "basil.csv"),
    index=False,
    header=True,
    encoding="utf-8",
)

## Clickbait

In [None]:
# Load the preprocessed Clickbait dataset as a list of row dicts, so every
# row can be extended in place with its annotation vectors.
clickbait_df = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, "clickbait.csv"))
df2list = clickbait_df.to_dict(orient="records")

In [None]:
# Annotate every row in place. A row whose annotation fails keeps only its
# original fields and is dropped by the field-count filter afterwards.
for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Catch Exception rather than using a bare `except:` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        continue

In [None]:
# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [row for row in df2list if len(row) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

In [None]:
# Turn the annotated row dicts back into a dataframe (one column per key).
clickbait_annotated_df = pd.DataFrame(filtered_df2list)

In [None]:
# Write the annotated Clickbait rows to the annotated-data directory.
clickbait_annotated_df.to_csv(
    os.path.join(PROCESSED_ANNOTATED_DATA_PATH, "clickbait.csv"),
    index=False,
    header=True,
    encoding="utf-8",
)

## Pheme

In [None]:
# Load the preprocessed Pheme dataset as a list of row dicts, so every row
# can be extended in place with its annotation vectors.
pheme_df = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, "pheme.csv"))
df2list = pheme_df.to_dict(orient="records")

In [None]:
# Annotate every row in place. A row whose annotation fails keeps only its
# original fields and is dropped by the field-count filter afterwards.
for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Catch Exception rather than using a bare `except:` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        continue

In [None]:
# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [row for row in df2list if len(row) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

In [None]:
# Turn the annotated row dicts back into a dataframe (one column per key).
pheme_annotated_df = pd.DataFrame(filtered_df2list)

In [None]:
# Write the annotated Pheme rows to the annotated-data directory.
pheme_annotated_df.to_csv(
    os.path.join(PROCESSED_ANNOTATED_DATA_PATH, "pheme.csv"),
    index=False,
    header=True,
    encoding="utf-8",
)

## Politifact

In [None]:
# Annotate the Politifact dataset: load it, annotate each row, keep the rows
# that were annotated and write them to the annotated-data directory.
politifact_df = pd.read_csv(f"{PREPROCESSED_DATA_PATH}politifact.csv")
df2list = politifact_df.to_dict("records")

for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Best effort: a failing row keeps only its original fields and is
        # filtered out below. Exception (not a bare `except:`) is caught so
        # Ctrl-C and SystemExit can still stop a long run.
        continue

# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [item for item in df2list if len(item) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

politifact_annotated_df = pd.DataFrame.from_dict(filtered_df2list)
politifact_annotated_df.to_csv(f"{PROCESSED_ANNOTATED_DATA_PATH}politifact.csv", index=False, header=True, encoding="utf-8")

## Buzzfeed

In [10]:
# Annotate the Buzzfeed dataset: load it, annotate each row, keep the rows
# that were annotated and write them to the annotated-data directory.
buzzfeed_df = pd.read_csv(f"{PREPROCESSED_DATA_PATH}buzzfeed.csv")

df2list = buzzfeed_df.to_dict("records")

for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Best effort: a failing row keeps only its original fields and is
        # filtered out below. Exception (not a bare `except:`) is caught so
        # Ctrl-C and SystemExit can still stop a long run.
        continue

# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [item for item in df2list if len(item) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

buzzfeed_annotated_df = pd.DataFrame.from_dict(filtered_df2list)
buzzfeed_annotated_df.to_csv(f"{PROCESSED_ANNOTATED_DATA_PATH}buzzfeed.csv", index=False, header=True, encoding="utf-8")

100%|██████████| 170/170 [00:48<00:00,  3.54it/s]

Annotation process failed for 0 items





## Propaganda

In [None]:
# Annotate the Propaganda dataset: load it, annotate each row, keep the rows
# that were annotated and write them to the annotated-data directory.
propaganda_df = pd.read_csv(f"{PREPROCESSED_DATA_PATH}propaganda.csv")

df2list = propaganda_df.to_dict("records")

for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Best effort: a failing row keeps only its original fields and is
        # filtered out below. Exception (not a bare `except:`) is caught so
        # Ctrl-C and SystemExit can still stop a long run.
        continue

# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [item for item in df2list if len(item) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

propaganda_annotated_df = pd.DataFrame.from_dict(filtered_df2list)
propaganda_annotated_df.to_csv(f"{PROCESSED_ANNOTATED_DATA_PATH}propaganda.csv", index=False, header=True, encoding="utf-8")

## TwitterCovidQ1

In [None]:
# Annotate the TwitterCovidQ1 dataset: load it, annotate each row, keep the
# rows that were annotated and write them to the annotated-data directory.
twittercovidq1_df = pd.read_csv(f"{PREPROCESSED_DATA_PATH}twittercovidq1.csv")

df2list = twittercovidq1_df.to_dict("records")

for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Best effort: a failing row keeps only its original fields and is
        # filtered out below. Exception (not a bare `except:`) is caught so
        # Ctrl-C and SystemExit can still stop a long run.
        continue

# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [item for item in df2list if len(item) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

# Use a dataset-specific name: the result used to be bound to
# `propaganda_annotated_df`, which overwrote the Propaganda dataframe.
twittercovidq1_annotated_df = pd.DataFrame.from_dict(filtered_df2list)
twittercovidq1_annotated_df.to_csv(f"{PROCESSED_ANNOTATED_DATA_PATH}twittercovidq1.csv", index=False, header=True, encoding="utf-8")

## TwitterCovidQ2

In [None]:
# Annotate the TwitterCovidQ2 dataset: load it, annotate each row, keep the
# rows that were annotated and write them to the annotated-data directory.
twittercovidq2_df = pd.read_csv(f"{PREPROCESSED_DATA_PATH}twittercovidq2.csv")

df2list = twittercovidq2_df.to_dict("records")

for row in tqdm(df2list):
    try:
        row.update(get_annotation_vectors(row["text"], taxonomy_schemas))
    except Exception:
        # Best effort: a failing row keeps only its original fields and is
        # filtered out below. Exception (not a bare `except:`) is caught so
        # Ctrl-C and SystemExit can still stop a long run.
        continue

# Keep only the rows holding exactly 8 fields -- presumably the original
# columns plus one vector per annotation group; TODO confirm the count.
filtered_df2list = [item for item in df2list if len(item) == 8]
# Every row that was not kept counts as a failed annotation.
print(f"Annotation process failed for {len(df2list) - len(filtered_df2list)} items")

# Use a dataset-specific name: the result used to be bound to
# `propaganda_annotated_df`, which overwrote the Propaganda dataframe.
twittercovidq2_annotated_df = pd.DataFrame.from_dict(filtered_df2list)
twittercovidq2_annotated_df.to_csv(f"{PROCESSED_ANNOTATED_DATA_PATH}twittercovidq2.csv", index=False, header=True, encoding="utf-8")