## Table of Contents

* [Chapter 1](#chapter1): Data Cleaning
* [Chapter 2](#chapter2): Data Mapping

## Chapter 1: <a class="anchor" id="chapter1"></a> Data Cleaning

In [1]:
#Performing required installations and downloads

In [2]:
#Importing libraries
#Data processing
import pandas as pd
import numpy as np
import re

#Date handling
from datetime import datetime

#Other
import os
from tqdm import tqdm
import warnings
tqdm.pandas()

In [3]:
#Suppressing warnings
warnings.simplefilter(action = "ignore")

In [4]:
#Reading in CSVs
#Paths are resolved relative to the notebook folder instead of changing the working directory,
#so a failed read can no longer leave the kernel two levels up (which made a re-run climb even further).
#Assumes the kernel's working directory is <project root>/Notebooks/Articles, as the previous
#chdir round-trip did.
project_root = os.path.join("..", "..")

#Article-level frame; the first CSV column is used as the index and the three pub* columns are parsed as dates
df = pd.read_csv(os.path.join(project_root, "Outputs/Articles/Snorkel/snorkel.csv"), index_col = 0, parse_dates = ["pubtime", "pubday", "pubmonth"])

#Reference tables
df_entities = pd.read_csv(os.path.join(project_root, "Inputs/Articles/entities.csv"))
df_media = pd.read_csv(os.path.join(project_root, "Inputs/Articles/media.csv"), index_col = 0)
df_key_media = pd.read_csv(os.path.join(project_root, "Inputs/Articles/key_media.csv"), index_col = 0)
df_key_entities = pd.read_csv(os.path.join(project_root, "Inputs/Articles/key_entities.csv"), index_col = 0)

In [5]:
#Defining function to clean text
def clean_text(col_list, frame = None):
    """Normalise text columns of a DataFrame in place.

    In every string value, underscores are replaced by spaces and a word that is
    immediately repeated ("very very good") is collapsed to a single occurrence.
    Nulls and other non-string values are left untouched.

    Parameters
    ----------
    col_list : iterable of str
        Names of the columns to clean. Names that are not columns of the frame are skipped.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    #Looking up the global only when no frame is passed keeps the original call style working
    target = df if frame is None else frame

    #Matches a word followed by one or more space-separated repetitions of itself
    repeated_word = re.compile(r"\b(\w+)( \1\b)+")

    def _clean_value(value):
        #Non-strings (NaN, numbers, ...) have nothing to normalise
        if not isinstance(value, str):
            return value
        return repeated_word.sub(r"\1", value.replace("_", " "))

    for col in col_list:
        #Explicit skip replaces the former bare "except", which also hid unrelated errors
        #(including KeyboardInterrupt)
        if col not in target.columns:
            continue
        target[col] = target[col].apply(_clean_value)

In [6]:
#Cleaning the ABSA text columns: underscores become spaces and immediately repeated words are collapsed
clean_text([
    "sentence_ABSA",
    "passage_ABSA",
    "sentence_ABSA_rel_keywords",
    "sentence_ABSA_subclause",
    "clause_ABSA",
])

In [7]:
#Replacing sentiment value 1 with 0 and keeping every other value unchanged (Snorkel df variant)
#NOTE(review): presumably 1 encodes positive sentiment, which is thereby folded into 0 - confirm
df["sentiment"] = df["sentiment"].apply(lambda label: label if label != 1 else 0)
#Variant for the ML / DL dfs, which maps 1 to -1 instead:
#df["sentiment"] = df["sentiment"].apply(lambda x: -1 if x == 1 else x)

In [8]:
#Mapping media names
#Underscores become spaces and the ASCII digraphs ae / oe / ue become umlauts;
#the last step turns "aü" back into "aue", undoing the "ue" substitution inside "aue"
df["medium_name_mapped"] = df["medium_name"].apply(
    lambda name: name.replace("_", " ")
                     .replace("ae", "ä")
                     .replace("oe", "ö")
                     .replace("ue", "ü")
                     .replace("aü", "aue")
)

In [9]:
#Mapping entity names
#Underscores become spaces and the ASCII digraphs ae / oe / ue become umlauts.
#The final "aü" -> "aue" step undoes the "ue" substitution inside "aue" (without it a name such as
#"Bauer" would become "Baür"); this keeps the rule identical to the medium name mapping.
df["entity_name_mapped"] = df["entity_name"].apply(
    lambda name: name.replace("_", " ")
                     .replace("ae", "ä")
                     .replace("oe", "ö")
                     .replace("ue", "ü")
                     .replace("aü", "aue")
)

## Chapter 2: <a class="anchor" id="chapter2"></a> Data Mapping

In [10]:
#Setting maps
#Each lookup pairs a key column with a value column row by row; if a key occurs more than once, the last row wins
publisher_map = dict(zip(df_media["medium_name"], df_media["publisher"]))
entity_group_map = dict(zip(df_entities["designed_entity"], df_entities["associated_group"]))
entity_party_map = dict(zip(df_entities["designed_entity"], df_entities["affiliated_party"]))
entity_type_map = dict(zip(df_entities["designed_entity"], df_entities["entity_type"]))

In [11]:
#Applying maps
#Strict lookups: a medium or entity name that is missing from its map raises a KeyError
df["publisher"] = df["medium_name"].apply(publisher_map.__getitem__)
df["entity_group"] = df["entity_name"].apply(entity_group_map.__getitem__)
df["entity_party"] = df["entity_name"].apply(entity_party_map.__getitem__)
df["entity_type"] = df["entity_name"].apply(entity_type_map.__getitem__)

In [12]:
#Dropping columns
#Modifies df in place; a KeyError is raised if any listed column is absent (e.g. when this cell is run twice)
df.drop(
    columns = ["index", "stratification", "sentence_ABSA_subclause", "sentence_ABSA_rel_keywords", "entity_keyword"],
    inplace = True,
)

In [14]:
#Saving to CSV
#The target is addressed relative to the notebook folder instead of changing the working directory,
#so a failed write can no longer leave the kernel two levels up.
#Assumes the kernel's working directory is <project root>/Notebooks/Articles, as the previous
#chdir round-trip did.
output_path = os.path.join("..", "..", "Outputs/Articles/df_final.csv")
df.to_csv(output_path)