## Table of Contents

* [Chapter 1](#chapter1): Data Cleaning
    * [Section 1.1](#section_1_1): Reading in Data
    * [Section 1.2](#section_1_2): Handling Datatypes
    * [Section 1.3](#section_1_3): Handling Missing Values
    * [Section 1.4](#section_1_4): Handling Impossible Data
    * [Section 1.5](#section_1_5): Feature Engineering
* [Chapter 2](#chapter2): Transforming Data Structure for Sentiment Analysis
    * [Section 2.1](#section_2_1): Entity Recognition 
    * [Section 2.2](#section_2_2): Extracting Passages
    * [Section 2.3](#section_2_3): Preparing Passages for Aspect-Based Sentiment Analysis

## Chapter 1: <a class="anchor" id="chapter1"></a> Data Cleaning

In [1]:
#Performing required installations and downloads
#pip install spacy
#pip install pyvis
#python -m spacy download de_core_news_sm

In [2]:
#Importing libraries
#TSV file processing
import lzma

#Data processing
import pandas as pd
import numpy as np
import re
import itertools as it
import typing as tp
import string as st

#NLP
from __future__ import print_function, unicode_literals
import spacy
import networkx as nx
from spacy.language import Language

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
import pyvis
from pyvis.network import Network

#Stats
from scipy.stats import norm

#Other
import os
from tqdm import tqdm
import warnings
tqdm.pandas()

In [3]:
#Suppressing warnings
warnings.simplefilter(action = "ignore")

In [4]:
#Reading in CSVs
#All paths are given relative to this notebook's folder (<project root>/Notebooks/Articles)
#instead of changing the working directory, so that a failed read cannot leave the kernel
#in a different directory and break every later relative path
df_uncleaned = pd.read_csv(os.path.join("..", "..", "Data", "Articles", "2022-04-11_swissdox_goebel.csv"), index_col = 0)

df_entities = pd.read_csv(os.path.join("..", "..", "Inputs", "Articles", "entities.csv"))
df_key_media = pd.read_csv(os.path.join("..", "..", "Inputs", "Articles", "key_media.csv"), index_col = 0)
df_media_map = pd.read_csv(os.path.join("..", "..", "Inputs", "Articles", "media_map.csv"), index_col = 0)
df_doctype_map = pd.read_csv(os.path.join("..", "..", "Inputs", "Articles", "doctype_map.csv"), index_col = 0)

In [5]:
#Setting maps for higher-level aggregation
#Keywords are passed through unicode-escape decoding so that escape sequences written
#literally in the CSV become the characters they stand for
decoded_keywords = (keyword.encode("utf-8").decode("unicode-escape") for keyword in df_entities["keyword"])
entity_map = dict(zip(decoded_keywords, df_entities["designed_entity"]))
media_map = dict(zip(df_media_map["medium_name"].values, df_media_map["mapped_medium_name"].values))
doctype_map = dict(zip(df_doctype_map["doctype"].values, df_doctype_map["mapped_doctype"].values))

In [6]:
#Setting key media
key_media = list(df_key_media["key_media"])
key_media

['Cash',
 'Blick',
 'NZZ',
 'Aargauer_Zeitung',
 '20_Minuten',
 'Basler_Zeitung',
 'SRF',
 'Tages_Anzeiger',
 'St._Galler_Tagblatt']

### Section 1.1: <a class="anchor" id="section_1_1"></a> Reading in Data

In [7]:
#Defining function to read TSV file and save as CSV
def tsv_to_csv(filepath, output_path = "2022-04-11_swissdox_goebel.csv"):
    """Convert an xz-compressed TSV export into a CSV file.

    Blank lines and lines starting with "#" are skipped. The first remaining
    line is used as the header, all following lines as data rows.

    Parameters
    ----------
    filepath : str
        Path of the ".tsv.xz" file, relative to the current working directory.
    output_path : str, optional
        Target CSV path, relative to the project root two levels above the
        current working directory. The default keeps the previous file name.

    Raises
    ------
    ValueError
        If the file contains no header row.
    """
    #Reading TSV (the context manager closes the file even if parsing fails)
    rows = []
    with lzma.open(filepath, mode = 'rt', encoding = 'utf-8') as file:
        for line in file:
            if not line.strip() or line.startswith('#'):
                continue
            #Only the line break is stripped: a plain rstrip() would also remove trailing
            #tabs and thereby drop empty fields at the end of a row
            rows.append(line.rstrip('\r\n').split('\t'))

    if not rows:
        raise ValueError(f"No header row found in {filepath}")

    #Instantiating dataframe in one step instead of appending row by row
    df_uncleaned = pd.DataFrame(data = rows[1:], columns = rows[0])

    #Saving to CSV in the project root without changing the working directory
    df_uncleaned.to_csv(os.path.join("..", "..", output_path))

In [8]:
#Reading TSV file 
#tsv_to_csv("2022-04-11_swissdox_goebel.tsv.xz")

### Section 1.2: <a class="anchor" id="section_1_2"></a> Handling Datatypes

In [9]:
#Creating dataframe for cleaning
df_cleaned = df_uncleaned.copy()

In [10]:
#Checking datatypes
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86990 entries, 0 to 86989
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   86990 non-null  int64 
 1   pubtime              86990 non-null  object
 2   medium_code          86990 non-null  object
 3   medium_name          86990 non-null  object
 4   rubric               55072 non-null  object
 5   regional             4672 non-null   object
 6   doctype              86990 non-null  object
 7   doctype_description  86990 non-null  object
 8   language             86990 non-null  object
 9   char_count           86990 non-null  int64 
 10  dateline             21944 non-null  object
 11  head                 86990 non-null  object
 12  subhead              4581 non-null   object
 13  content_id           86990 non-null  object
 14  content              86990 non-null  object
dtypes: int64(2), object(13)
memory usage: 10.6+ MB


In [11]:
#Changing datatypes
df_cleaned["pubtime"] = pd.to_datetime(df_cleaned["pubtime"], infer_datetime_format = True, utc = True)

#Converting to int64 without a forced detour through float, which cannot represent
#integers above 2**53 exactly and would silently alter large ids
for int_col in ["id", "char_count"]:
    df_cleaned[int_col] = pd.to_numeric(df_cleaned[int_col]).astype("int64")

### Section 1.3: <a class="anchor" id="section_1_3"></a> Handling Missing Values

In [12]:
#Replacing "" with np.nan
df_cleaned = df_cleaned.replace("", np.nan)

In [13]:
#Inspecting missing values
share_missing_vals_num = (df_cleaned.isnull().sum() / len(df_cleaned))*100
share_missing_vals_str = round(share_missing_vals_num,1).astype(str)+"%"
share_missing_vals_df = pd.DataFrame({"share_of_missing_values": share_missing_vals_str, "share_of_missing_values_num": share_missing_vals_num})
share_missing_vals_df = share_missing_vals_df.sort_values(by = "share_of_missing_values_num", ascending = False)
share_missing_vals_df = share_missing_vals_df.drop("share_of_missing_values_num", axis = 1)
share_missing_vals_df

Unnamed: 0,share_of_missing_values
subhead,94.7%
regional,94.6%
dateline,74.8%
rubric,36.7%
id,0.0%
pubtime,0.0%
medium_code,0.0%
medium_name,0.0%
doctype,0.0%
doctype_description,0.0%


In [14]:
#Dropping columns with too many missing values
df_cleaned.drop(["subhead", "regional", "dateline", "rubric"], axis = 1, inplace = True)

### Section 1.4: <a class="anchor" id="section_1_4"></a> Handling Impossible Data

In [15]:
#Keeping only German-language articles (all other languages are dropped)
df_cleaned = df_cleaned[df_cleaned["language"] == "de"]

#Dropping language column
df_cleaned.drop("language", axis = 1, inplace = True)

In [16]:
#Dropping duplicate articles
df_cleaned.drop_duplicates(subset = "content_id", keep = "first", inplace = True)

### Section 1.5: <a class="anchor" id="section_1_5"></a> Feature Engineering

In [17]:
#Defining function to clean content
def clean_text(col, df = None):
    """Clean the raw article text stored in column ``col`` in place.

    Parameters
    ----------
    col : str
        Name of the text column to clean. All values must be strings.
    df : pandas.DataFrame, optional
        Dataframe holding the column. Defaults to the notebook-level
        ``df_cleaned``, which is what the function always operated on before.

    Returns
    -------
    None
        The column is overwritten with the cleaned text.
    """
    if df is None:
        df = df_cleaned
    #One pass over the column instead of one pass per replacement rule
    df[col] = df[col].apply(_clean_text_string)


def _clean_text_string(text):
    """Return one article string with markup, numbers and symbols normalised."""
    #HTML markers
    #Replacing author and image source markers and all text inbetween with a " ":
    #".+?" needs at least one character, so it consumes the ">" of the opening tag
    #and only stops at the ">" of the closing tag
    for marker in ("au", "ur"):
        text = re.sub(f"<{marker}.+?>", " ", text)

    #Replacing table and other markers with a "." and link, text, paragraph, and other
    #markers with a " ". The "." group runs first because "<p.+?>" would otherwise
    #also match "<pre>" and turn it into a " "
    marker_groups = (
        (("td", "tr", "table", "th", "br", "zt", "pre"), "."),
        (("a", "tx", "p", "lg", "ka", "nt"), " "))
    for markers, substitute in marker_groups:
        for marker in markers:
            text = re.sub(f"</{marker}>", substitute, text)
            text = re.sub(f"<{marker}>", substitute, text)
            text = re.sub(f"<{marker}.+?>", substitute, text)

    #Replacing \xad and \xa0 with a " "
    for marker in ("\xad", "\xa0"):
        text = text.replace(marker, " ")

    #Replacing "+++" with a "."
    text = text.replace("+++", ".")

    #Replacing subtitle markers with [SUB]
    text = re.sub("</ld>", "[SUB]", text)
    text = re.sub("<ld>", "[SUB]", text)
    text = re.sub("<ld.+?>", "[SUB]", text)

    #Numbers
    #Removing numbers with dots
    text = re.sub(r"[0-9]\d*\.", "", text)

    #Removing numbers with commas or thousand markers; the separator is now part of the
    #pattern (previously the literal word "symbol" was searched for, so nothing matched)
    text = re.sub(r"[0-9]\d*(?:[,’'][0-9]\d*)+", "", text)

    #Removing remaining numbers
    text = re.sub(r"[0-9]\d*", "", text)

    #Symbols
    #Removing special symbols; a character class is used because joining the symbols
    #with "|" turned the backslash into an escape for the following "|", so neither
    #the backslash nor "<" was removed
    text = re.sub(r"[#\\<>@{}~+*_]", "", text)
    for symbol in ("&", "%", "/", "(", ")"):
        text = text.replace(symbol, "")

    #Replacing "-" and ";" with a " "
    for symbol in ("-", "–", ";"):
        text = text.replace(symbol, " ")

    #Replacing ":", "?" and "!" with a "."
    for symbol in (":", "?", "!"):
        text = text.replace(symbol, ".")

    #Replacing '""', '«', '»', "'" and "’" with a " "
    for symbol in ('""', "«", "»", "'", "’", "‹", "›"):
        text = text.replace(symbol, " ")

    #Beginning and end of content
    #Removing trailing spaces
    text = text.strip()

    #Sentence stops and subtitle markers
    #Removing " " before and after "."
    text = re.sub(r" +\.", ".", text)
    text = re.sub(r"\. +", ".", text)

    #Removing "." and " " before and after subtitle markers
    bracket_rules = ((r"\]\.+", "]"), (r"\.+\[", "["), (r"\] +", "]"), (r" +\[", "["))
    for pattern, substitute in bracket_rules:
        text = re.sub(pattern, substitute, text)

    #Double spaces and dots
    text = re.sub(" +", " ", text)
    text = re.sub(r"\.{2,}", ".", text)

    #Removing comma with space
    text = re.sub(" +, +", ", ", text)

    #Spelling
    #Replacing Umlaute
    for umlaut, substitute in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("é", "e")):
        text = text.replace(umlaut, substitute)

    #Ensuring consistent spelling of "taskforce"
    text = text.replace("Task-Force", "Taskforce").replace("Task Force", "Taskforce")

    #Removing trailing spaces
    return text.strip()

In [18]:
#Defining function to find nth (first or second) substring in string
def find_first_or_second_substring_index(string, substring, n):
    """Return the index of the first (n = 1) or second (n = 2) occurrence of ``substring``.

    The result is -1 when the requested occurrence does not exist.
    """
    first_hit = string.find(substring)
    #For n = 2 the search restarts one character after the first hit
    search_from = first_hit + (n - 1)
    return string.find(substring, search_from)

In [19]:
#Mapping medium name to aggregated medium name (names without a mapping are kept)
medium_names = df_cleaned["medium_name"].apply(lambda name: media_map.get(name, name))

#Applying the literal replacements one after the other, in this order
medium_replacements = [
    (" ", "_"),            #Replace spaces
    ("__", "_"),           #Replace double underscores
    (".ch", ""),           #Replace common patterns
    ("Newsnet_/_", ""),
    ("_/_MLZ", ""),
    ("ä", "ae"),           #Replace Umlaute
    ("ö", "oe"),
    ("ü", "ue")]
for old, new in medium_replacements:
    medium_names = medium_names.apply(lambda name: name.replace(old, new))

df_cleaned["medium_name_agg"] = medium_names

In [20]:
#Mapping doctype to aggregated doctype
df_cleaned["doctype_description_agg"] = df_cleaned["doctype_description"].apply(lambda x: doctype_map[x] if x in doctype_map else x)

In [21]:
#Extracting medium channel
df_cleaned["channel_description"] = df_cleaned["doctype_description"].apply(lambda x: "Online medium" if x == "Online medium" else "Offline medium")

In [22]:
#Cleaning text 
df_cleaned["content_cleaned"] = df_cleaned["content"]
df_cleaned["head_cleaned"] = df_cleaned["head"]

clean_text("content_cleaned")
clean_text("head_cleaned")

In [23]:
#Extracting subtitle (the text between the first and the second "[SUB]" marker)
def extract_subhead(text, marker = "[SUB]"):
    """Return the text enclosed by the first two markers, or "" if there are fewer than two."""
    start = find_first_or_second_substring_index(text, marker, 1)
    end = find_first_or_second_substring_index(text, marker, 2)
    #Without a second marker the end index is -1; slicing with it would return
    #everything after the first marker except the last character
    if start == -1 or end == -1:
        return ""
    return text[start + len(marker) : end]

df_cleaned["subhead_cleaned"] = df_cleaned["content_cleaned"].apply(extract_subhead)

In [24]:
#Removing subtitle, all remaining subtitle markers, and trailing spaces from content
def strip_subhead(text, marker = "[SUB]"):
    """Drop everything up to and including the second marker, then any leftover markers."""
    end = find_first_or_second_substring_index(text, marker, 2)
    #Only cut when a second marker exists; with a single marker the index is -1 and
    #slicing from -1 + 5 would chop off the first four characters of the article
    if end != -1:
        text = text[end + len(marker) :]
    return text.replace(marker, "").strip()

df_cleaned["content_cleaned"] = df_cleaned["content_cleaned"].apply(strip_subhead)

In [25]:
#Creating content_full including title, subtitle, and content
df_cleaned["content_full"] = df_cleaned.apply(lambda x: str(x["head_cleaned"]) + "." + str(x["subhead_cleaned"]) + "." + str(x["content_cleaned"]), axis = 1)
df_cleaned["content_full_lower"] = df_cleaned["content_full"].apply(lambda x : x.lower())

In [26]:
#Extracting character count 
df_cleaned["char_count_cleaned"] = df_cleaned["content_full"].apply(lambda x: len(x))

In [27]:
#Extracting publication day
df_cleaned["pubday"] = df_cleaned["pubtime"].dt.strftime('%Y-%m-%d')
df_cleaned["pubday"] = pd.to_datetime(df_cleaned["pubday"], infer_datetime_format = True)

In [28]:
#Extracting publication month
df_cleaned["pubmonth"] = df_cleaned["pubtime"].dt.strftime('%Y-%m')
df_cleaned["pubmonth"] = pd.to_datetime(df_cleaned["pubmonth"])

In [29]:
#Dropping irrelevant columns
df_cleaned.drop(["medium_code", 
                 "medium_name", 
                 "doctype", 
                 "doctype_description",
                 "char_count", 
                 "head", 
                 "content_id", 
                 "content"], axis = 1, inplace = True)

In [30]:
#Renaming columns
df_cleaned.rename(columns = {"medium_name_agg": "medium_name", 
                             "doctype_description_agg": "doctype_description", 
                             "content_cleaned": "content", 
                             "head_cleaned": "head", 
                             "subhead_cleaned": "subhead", 
                             "char_count_cleaned": "char_count"}, inplace = True)

In [32]:
#Saving to CSV
#The path is relative to this notebook's folder, so the working directory is never changed
#and a failed write cannot leave the kernel in the wrong directory
df_cleaned.to_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_data.csv"))

## Chapter 2: <a class="anchor" id="chapter2"></a> Transforming Data Structure for Sentiment Analysis

### Section 2.1: <a class="anchor" id="section_2_1"></a> Entity Recognition

In [44]:
#Reading in data
#The path is relative to this notebook's folder, so the working directory is never changed
#and a failed read cannot leave the kernel in the wrong directory
df_cleaned = pd.read_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_data.csv"), index_col = 0, parse_dates = ["pubtime", "pubday", "pubmonth"])

In [46]:
#Defining entities with proper names where capitalization is important and non-proper names where capitalization is unimportant
entities_proper = [x.encode("utf-8").decode("unicode-escape") for x in df_entities[df_entities["proper_noun"] == 1]["keyword"].values]
entities_not_proper = [x.encode("utf-8").decode("unicode-escape") for x in df_entities[df_entities["proper_noun"] == 0]["keyword"].values]

### Section 2.2: <a class="anchor" id="section_2_2"></a> Extracting Passages

In [49]:
#Defining function to filter dataframe by entity mentions
def filter_by_entity_mentions(df, entities_list, content_col, entity_col):
    """Set ``entity_col`` to 1 for every row whose ``content_col`` mentions an entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to mark; it is modified in place.
    entities_list : iterable of str
        Keywords, each interpreted as a regular expression (case-sensitive).
    content_col : str
        Column holding the text to search.
    entity_col : str
        Column that receives the value 1 for matching rows.
    """
    entities = list(entities_list)
    #An empty list would yield the pattern "", which matches every string
    if not entities:
        return
    matches_regex = "|".join(entities)
    #Missing text counts as "no mention" instead of producing NaN in the mask
    mentions = df[content_col].str.contains(matches_regex, regex = True, na = False)
    #A boolean mask marks exactly the matching rows, even if index labels repeat
    df.loc[mentions, entity_col] = 1

In [50]:
#Defining function to generate passage
def create_passage(row):
    """Build the passage around a sentence: previous, original and next sentence joined by ". ".

    The previous sentence is left out for the first sentence of an article and the
    next sentence for the last one. An article consisting of a single sentence is both
    first and last, so its passage is just that sentence; before, the first sentence of
    the following article was appended in that case.

    ``row`` must provide "start_of_article", "end_of_article", "previous_sentence",
    "original_sentence" and "next_sentence".
    """
    parts = []
    if row["start_of_article"] != 1:
        parts.append(row["previous_sentence"])
    parts.append(row["original_sentence"])
    if row["end_of_article"] != 1:
        parts.append(row["next_sentence"])
    return ". ".join(parts)

In [51]:
#Defining function to get keyword(s) from string and map them to entity
def get_entities(string, entities_list):
    """Return the entities whose keywords occur in ``string`` as a comma-separated string.

    Every keyword of ``entities_list`` is used as a regular expression; matching
    keywords are translated to their entity via ``entity_map``.
    """
    matched_keywords = []
    for keyword in entities_list:
        if re.search(keyword, string):
            matched_keywords.append(keyword)
    mapped_entities = [entity_map[keyword] for keyword in matched_keywords]

    #The list is rendered with str() and stripped of quotes and brackets
    rendered = str(mapped_entities)
    for character in ("'", "[", "]"):
        rendered = rendered.replace(character, "")
    return rendered

In [52]:
#Defining function to get keyword(s) from string
def get_keywords(string, entities_list):
    """Return the keywords of ``entities_list`` found in ``string`` as a comma-separated string.

    Every keyword is used as a regular expression. An empty string is returned
    when nothing matches.
    """
    hits = []
    for keyword in entities_list:
        if re.search(keyword, string) is not None:
            hits.append(keyword)

    #The list is rendered with str() and stripped of its list decoration
    rendered = str(hits)
    for fragment in ("['", "']", "'", "[]"):
        rendered = rendered.replace(fragment, "")
    return rendered

In [53]:
#Filtering dataframe by mentions of entity and mark relevant rows (proper names where capitalization is important)
df_cleaned["entities_proper"] = 0
filter_by_entity_mentions(df_cleaned, entities_proper, "content_full", "entities_proper")

In [54]:
#Filtering dataframe by mentions of entity and mark relevant rows (not proper names where capitalization is unimportant)
df_cleaned["entities_not_proper"] = 0
filter_by_entity_mentions(df_cleaned, entities_not_proper, "content_full_lower", "entities_not_proper")

In [55]:
#Creating joint column of mentions
df_cleaned["entities"] = df_cleaned.apply(lambda x: 1 if (x["entities_proper"] == 1) | (x["entities_not_proper"] == 1) else 0, axis = 1)

In [56]:
#Creating new dataframe, containing only articles which mention at least one entity
df = df_cleaned[df_cleaned["entities"] == 1].reset_index(drop = True)

In [57]:
#Splitting dataframe by sentence
df["content_full"] = df["content_full"].apply(lambda x: x.split("."))

In [58]:
#Exploding dataframe to sentence-level
df = df.explode("content_full").reset_index(drop = True)
df["content_full"] = df["content_full"].apply(lambda x: x.strip())

In [59]:
#Renaming columns
df.rename(columns = {"content": "original_content",
                     "content_full": "original_sentence"}, inplace = True)

In [60]:
#Dropping irrelevant columns
df.drop(["head", "subhead", "entities_proper", "entities_not_proper", "entities", "content_full_lower"], 
       axis = 1, 
       inplace = True)

In [61]:
#Dropping rows with empty content
df = df[df["original_sentence"] != ""].reset_index(drop = True)

In [62]:
#Recording previous and following sentence
df["previous_sentence"] = df["original_sentence"].shift(1, fill_value = "")
df["next_sentence"] = df["original_sentence"].shift(-1, fill_value = "")

In [63]:
#Recording whether sentence marks start or end of an article
df["start_of_article"] = np.where(df["id"] != df["id"].shift(1, fill_value = 0), 1, 0)
df["end_of_article"] = np.where(df["id"] != df["id"].shift(-1, fill_value = 0), 1, 0)

In [64]:
#Generating passage
df["original_passage"] = df.apply(lambda x: create_passage(x), axis = 1)

In [65]:
#Dropping irrelevant columns
df.drop(["previous_sentence", "next_sentence", "start_of_article", "end_of_article"], 
        axis = 1, 
        inplace = True)

In [66]:
#Creating lowercase versions of sentence and passage
df["original_sentence_lower"] = df["original_sentence"].apply(lambda x: x.lower())
df["original_passage_lower"] = df["original_passage"].apply(lambda x: x.lower())

In [67]:
#Filtering dataframe by mentions of entity and mark relevant rows (proper names where capitalization is important)
df["entities_proper"] = 0
filter_by_entity_mentions(df, entities_proper, "original_sentence", "entities_proper")

In [68]:
#Filtering dataframe by mentions of entity and mark relevant rows (not proper names where capitalization is unimportant)
df["entities_not_proper"] = 0
filter_by_entity_mentions(df, entities_not_proper, "original_sentence_lower", "entities_not_proper")

In [69]:
#Creating joint column of mentions
df["entities"] = df.apply(lambda x: 1 if (x["entities_proper"] == 1) | (x["entities_not_proper"] == 1) else 0, axis = 1)

In [70]:
#Dropping sentences without mentions of at least one entity
df = df[df["entities"] == 1].reset_index(drop = True)

In [71]:
#Adding columns with entity contained in sentence
df["entities_proper_names"] = df["original_sentence"].apply(lambda x: get_entities(x, entities_proper))
df["entities_not_proper_names"] = df["original_sentence_lower"].apply(lambda x: get_entities(x, entities_not_proper))

In [72]:
#Creating joint column of entities
df["entities_names"] = df.apply(lambda x: set(x["entities_proper_names"].split(", ")).union(set(x["entities_not_proper_names"].split(", "))), axis = 1)
df["entities_names"] = df["entities_names"].apply(lambda x: str(x).replace(", ''","").replace("{","").replace("}","").replace("'","").lstrip(", ").rstrip(", "))

In [73]:
#Adding columns with entity keywords contained in sentence
df["entities_proper_keywords"] = df["original_sentence"].apply(lambda x: get_keywords(x, entities_proper))
df["entities_not_proper_keywords"] = df["original_sentence_lower"].apply(lambda x: get_keywords(x, entities_not_proper))

In [74]:
#Creating joint column of entity keywords
df["entities_keywords"] = df.apply(lambda x: set(x["entities_proper_keywords"].split(", ")).union(set(x["entities_not_proper_keywords"].split(", "))), axis = 1)
df["entities_keywords"] = df["entities_keywords"].apply(lambda x: str(x).encode("utf-8").decode("unicode-escape").replace(", ''","").replace("'","").lstrip("{").rstrip("}").lstrip(", ").rstrip(", "))

In [75]:
#Creating column with number of entities
df["num_entities"] = df["entities_names"].apply(lambda x: len(x.split(", ")))

In [76]:
#Dropping irrelevant columns
df.drop(["entities_proper", "entities_not_proper", "entities"], 
        axis = 1, 
        inplace = True)

In [78]:
#Saving to CSV
#The path is relative to this notebook's folder, so the working directory is never changed
#and a failed write cannot leave the kernel in the wrong directory
df.to_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_partially_parsed_data.csv"))

### Section 2.3: <a class="anchor" id="section_2_3"></a> Preparing Passages for Aspect-Based Sentiment Analysis

In [79]:
#Reading in data
#The path is relative to this notebook's folder, so the working directory is never changed
#and a failed read cannot leave the kernel in the wrong directory
df = pd.read_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_partially_parsed_data.csv"), index_col = 0, parse_dates = ["pubtime", "pubday", "pubmonth"])

In [80]:
#Defining function for dependency parsing
def get_dep(string):
    """Return the dependency label (``dep_``) of every token that ``nlp`` produces for ``string``."""
    return [token.dep_ for token in nlp(string)]

In [81]:
#Defining function for PoS tagging
def get_pos(string):
    """Return the part-of-speech tag (``pos_``) of every token that ``nlp`` produces for ``string``."""
    return [token.pos_ for token in nlp(string)]

In [82]:
#Defining classes to detect clauses
verb_pos = {"VERB", "AUX"}
finite_verb_tags = {"VVFIN", "VMFIN", "VAFIN"}

class Clause:
    """A clause made of one or more token spans.

    The tokens of all spans, in order, can be reached by index or by iterating
    over the clause. Iteration keeps its position on the instance itself
    (attribute ``n``), so a clause supports one iteration at a time.
    """

    def __init__(self, spans: tp.Iterable["spacy.tokens.Span"]):
        self.spans = spans

    @property
    def __chain(self) -> tp.Iterable["spacy.tokens.Token"]:
        #Flat list of the tokens of all spans; rebuilt on every access
        return list(it.chain.from_iterable(self.spans))

    def __getitem__(self, index: int) -> "spacy.tokens.Token":
        return self.__chain[index]

    def __iter__(self) -> tp.Iterator:
        #Restart at the first token; the clause is its own iterator
        self.n = 0
        return self

    def __next__(self) -> "spacy.tokens.Token":
        position = self.n
        self.n = position + 1
        try:
            return self[position]
        except IndexError:
            raise StopIteration

    def __repr__(self) -> str:
        return " ".join(span.text for span in self.inner_spans)

    @property
    def is_subclause(self) -> bool:
        """Clause is a subclause iff its last token, ignoring one final punctuation mark, is a finite verb."""
        final_token = self[-1]
        if final_token.pos_ == "PUNCT":
            final_token = self[-2]
        return final_token.tag_ in finite_verb_tags

    @property
    def clause_type(self) -> str:
        if self.is_subclause:
            return "SUB"
        return "MAIN"

    @property
    def inner_spans(self) -> tp.List["spacy.tokens.Span"]:
        #Fresh list holding the same span objects
        return list(self.spans)


class ClausedSentence(spacy.tokens.Span):
    """A sentence span that can be split into clauses, one per finite verb.

    NOTE(review): ``token.i`` values are used directly as indices into ``self``.
    Presumably ``token.i`` is the position in the whole document while ``self[...]``
    counts from the start of the span; both agree only for a span that begins at the
    first token of the document - confirm before using this on later sentences.
    """

    @property
    def __finite_verb_indices(self) -> tp.List[int]:
        #Indices of all tokens whose tag is one of finite_verb_tags
        return [token.i for token in self if token.tag_ in finite_verb_tags]

    def progeny(
        self,
        index: int,
        stop_indices: tp.Optional[tp.List[int]] = None) -> tp.List[int]:
        """Return the sorted, de-duplicated indices of token ``index`` and its descendants.

        Children whose index is listed in ``stop_indices`` are skipped together
        with everything below them.
        """
        if stop_indices is None:
            stop_indices = []

        progeny = [index]

        for child in self[index].children:
            if child.i in stop_indices:
                continue

            #Recursing into the child collects its own descendants as well
            progeny += [child.i] + self.progeny(child.i, stop_indices)

        return sorted(list(set(progeny)))

    @property
    def clauses(self) -> tp.Generator["Clause", None, None]:
        """Yield one Clause per finite verb, built from the verb and its descendants.

        Other finite verbs act as stop indices, so a clause does not extend into
        the subtree of another finite verb.
        """
        for verb_index in self.__finite_verb_indices:
            clause_tokens = [
                self[index]
                for index in self.progeny(
                    index=verb_index, stop_indices = self.__finite_verb_indices)]

            spans = []

            #Position in the list minus token index is constant within a run of
            #consecutive indices, so each group is one contiguous stretch of tokens
            for _, group in it.groupby(
                enumerate(clause_tokens),
                lambda index_token: index_token[0] - index_token[1].i):
                tokens = [item[1] for item in group]
                spans.append(self[tokens[0].i : tokens[-1].i + 1])

            yield Clause(spans)

In [83]:
#Defining function to get subclause for given entity
def get_subclause(string, entity):
    """Return the clauses of the first sentence of ``string`` that mention ``entity``.

    Parameters
    ----------
    string : str
        Text to parse with ``nlp``; only its first sentence is analysed.
    entity : str
        Entity to look for, used as a regular expression and extended by any
        surrounding word characters.

    Returns
    -------
    list of str or str
        The matching clauses with commas and dots removed and repeated spaces
        collapsed. If anything goes wrong (for example no sentence is found or
        ``entity`` is not a string), ``string`` itself is returned as a fallback.
    """
    tokens = nlp(string)
    try:
        sentence = next(tokens.sents)
        claused_sentence = ClausedSentence(sentence.doc, sentence.start, sentence.end)
        relevant_clauses = []
        for clause in list(claused_sentence.clauses):
            clause_text = str(clause)
            if re.search(r"\w*" + entity + r"\w*", clause_text):
                clause_text = clause_text.replace(",", "")
                clause_text = re.sub(" +", " ", clause_text)
                clause_text = clause_text.replace(".", "")
                relevant_clauses.append(clause_text)
        return relevant_clauses
    #Exception instead of a bare except: the fallback stays best-effort, but
    #KeyboardInterrupt can still stop a long-running apply over the dataframe
    except Exception:
        return string

In [84]:
#Defining function to get most related words for a given entity
def get_related_words(string, entity, window = 3, max_distance = 0.2, max_modifier_distance = 0.3):
    """Return the words of ``string`` that are most closely related to ``entity``.

    A word is kept if it lies within ``window`` tokens of the entity, or if its
    distance to the entity in the dependency network, divided by the number of
    tokens, is at most ``max_distance`` (``max_modifier_distance`` for adverbs
    and adjectives). Words without any connection to the entity get distance 0
    and are therefore kept.

    Parameters
    ----------
    string : str
        Sentence to analyse with ``nlp``.
    entity : str
        Entity to look for, used as a regular expression and extended by any
        surrounding word characters.
    window, max_distance, max_modifier_distance : optional
        Thresholds described above; the defaults are the values used so far.

    Returns
    -------
    list of str or str
        The related words in sentence order. If no token contains the entity,
        ``string`` itself is returned as a fallback.
    """
    #Get tokens
    tokens = nlp(string)
    tokens_text = [str(token) for token in tokens]

    #Get index of the first token containing the entity
    entity_regex = re.compile(r"\w*" + entity + r"\w*")
    entity_idx = None
    for idx, token_text in enumerate(tokens_text):
        if entity_regex.search(token_text):
            entity_idx = idx
            break
    if entity_idx is None:
        return string

    #Get document length and the window of surrounding words on each side
    doc_len = len(tokens)
    idx_range_min = max(0, entity_idx - window)
    idx_range_max = min(entity_idx + window, doc_len - 1)

    #Create network from parent-child relations; nodes are token positions rather than
    #token texts, so that a word occurring twice is not merged into a single node
    edges = [(token.i, child.i) for token in tokens for child in token.children]
    graph = nx.Graph(edges)

    #Extract relevant tokens
    relevant_tokens = []
    for token_idx, (token, token_text) in enumerate(zip(tokens, tokens_text)):
        #If token in immediate surrounding of entity, add token to relevant tokens
        if idx_range_min <= token_idx <= idx_range_max:
            relevant_tokens.append(token_text)

        #Calculate path distance between entity and token
        try:
            path_length = nx.shortest_path_length(graph, source = entity_idx, target = token_idx)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            path_length = 0
        path_length_normalized = path_length / doc_len

        #If token within short distance of entity in network, add token to relevant tokens
        if token_text in relevant_tokens:
            continue
        is_modifier = token.pos_ in ("ADV", "ADJ")
        if path_length_normalized <= max_distance:
            relevant_tokens.append(token_text)
        elif is_modifier and path_length_normalized <= max_modifier_distance:
            relevant_tokens.append(token_text)

    return relevant_tokens

In [85]:
#Defining function to replace entity keywords with entities in sentence
def replace_entity_mentions(string, designed_entity, entity_mapping = None):
    """Replace every keyword that stands for ``designed_entity`` with the entity name.

    Parameters
    ----------
    string : str
        Text in which the keywords are replaced.
    designed_entity : str
        Entity whose keywords are looked up; it is also the replacement text.
    entity_mapping : dict, optional
        Mapping from keyword (used as a regular expression) to entity.
        Defaults to the notebook-level ``entity_map``.

    Returns
    -------
    str
        The text with all keyword matches replaced, or the unchanged text if
        the entity has no keywords.
    """
    if entity_mapping is None:
        entity_mapping = entity_map
    entity_keywords = [keyword for keyword, entity in entity_mapping.items() if entity == designed_entity]

    #Without keywords the joined pattern would be "", which matches at every position
    #and would insert the entity between all characters
    if not entity_keywords:
        return string

    matches_regex = "|".join(entity_keywords)
    #A function as replacement inserts the entity literally, without interpreting backslashes
    return re.sub(matches_regex, lambda match: designed_entity, string)

In [86]:
#Instantiating nlp
nlp = spacy.load("de_core_news_sm")

In [87]:
#Defining custom sentence boundaries for clause detector
#nlp.remove_pipe("custom_boundaries")

@Language.component("custom_boundaries")
def custom_boundaries(doc):
    """Mark a token as sentence start only if it directly follows a "." token.

    All other tokens, including the first one, are marked as not starting a
    sentence. A "." at the very end of the text has no following token and is
    simply ignored; before, it raised an IndexError.
    """
    for token in doc:
        token.is_sent_start = token.i > 0 and doc[token.i - 1].text == "."
    return doc

#Language.factory("custom_boundaries", func = custom_boundaries)

nlp.add_pipe("custom_boundaries", before = "parser")

<function __main__.custom_boundaries(doc)>

In [88]:
#Splitting dataframe on entities
df["entities_keywords"] = df["entities_keywords"].apply(lambda x: x.split(", "))

In [89]:
#Exploding dataframe to entity-level
df = df.explode("entities_keywords").reset_index(drop = True)

In [90]:
#Renaming columns 
df.rename(columns = {"entities_names": "entity_name", 
                     "entities_keywords": "entity_keyword"}, inplace = True)

In [91]:
#Mapping keyword to entity
df["entity_name"] = df["entity_keyword"].apply(lambda x: entity_map[x.encode("utf-8").decode("unicode-escape")])

In [92]:
#Replacing entity keywords with entities
df["sentence_ABSA"] = df.apply(lambda x: replace_entity_mentions(x["original_sentence_lower"], x["entity_name"]) \
                               if x["entity_keyword"].islower() else \
                               replace_entity_mentions(x["original_sentence"], x["entity_name"]),
                               axis = 1)
df["passage_ABSA"] = df.apply(lambda x: replace_entity_mentions(x["original_passage_lower"], x["entity_name"]) \
                              if x["entity_keyword"].islower() else \
                              replace_entity_mentions(x["original_passage"], x["entity_name"]),
                              axis = 1)

In [93]:
#Getting related keywords per entity
df["sentence_ABSA_rel_keywords"] = df.progress_apply(lambda x: get_related_words(x["sentence_ABSA"], x["entity_name"]), axis = 1)

100%|██████████████████████████████████| 271577/271577 [21:16<00:00, 212.75it/s]


In [94]:
#Turning back into string
#get_related_words returns the untouched sentence (a string) when the entity was not found;
#joining a string would put a space between all of its characters, so strings are kept as they are
df["sentence_ABSA_rel_keywords"] = df["sentence_ABSA_rel_keywords"].apply(lambda x: x if isinstance(x, str) else " ".join(x))

In [95]:
#Getting subclauses per entity
#The entity name is passed exactly as it was inserted into sentence_ABSA. Before, the method
#x["entity_name"].lower itself was passed (call parentheses missing), so building the search
#pattern failed and get_subclause fell back to returning the whole sentence
def get_subclause_text(sentence, entity):
    """Return the clauses of ``sentence`` that mention ``entity`` as one space-separated string."""
    found = get_subclause(sentence, entity)
    #get_subclause returns a list of clauses, or the sentence itself as a fallback
    return found if isinstance(found, str) else " ".join(found)

df["sentence_ABSA_subclause"] = df.progress_apply(lambda x: get_subclause_text(x["sentence_ABSA"], x["entity_name"]), axis = 1)

100%|██████████████████████████████████| 271577/271577 [19:57<00:00, 226.80it/s]


In [96]:
#Turning back into string
#Lists of clauses are joined with spaces (an empty list becomes ""); converting a list with
#str() would leave brackets and quotes in the text. Strings are kept as they are
df["sentence_ABSA_subclause"] = df["sentence_ABSA_subclause"].apply(lambda x: x if isinstance(x, str) else " ".join(x))

In [97]:
#Lowercasing
columns = ["sentence_ABSA", "passage_ABSA", "sentence_ABSA_rel_keywords", "sentence_ABSA_subclause"]

for column in columns:
    df[column] = df[column].apply(lambda x: x.lower())

In [98]:
#Removing polarized entity keywords from sentence
polarized_entities = ["Befuerworter", "Gegner", "Skeptiker", "Kritiker", "Opposition", "Demonstranten"]
matches_regex = "|".join(polarized_entities)
columns = ["sentence_ABSA", "passage_ABSA", "sentence_ABSA_rel_keywords", "sentence_ABSA_subclause"]

#The columns have already been lowercased, so the capitalized keywords can only match
#when the case is ignored
for column in columns:
    df[column] = df[column].apply(lambda x: re.sub(matches_regex, "[NEG_ENT]", x, flags = re.IGNORECASE))

In [99]:
#Dropping rows where BAG infoline number is indicated
df = df[~df["sentence_ABSA"].str.contains("bag infoline", regex = False)].reset_index(drop = True)

In [100]:
#Defining function to create relevant clause
def create_clause(number_entities, sentence, subclause):
    """Pick the text to analyse: the subclause if more than one entity is mentioned
    and the subclause is longer than 5 characters, the full sentence otherwise"""
    several_entities = number_entities > 1
    subclause_long_enough = len(subclause) > 5
    return subclause if (several_entities & subclause_long_enough) else sentence

In [101]:
#Creating new column with relevant clause
df["clause_ABSA"] = df.apply(lambda x: create_clause(x["num_entities"], 
                                                     x["sentence_ABSA"], 
                                                     x["sentence_ABSA_subclause"]), 
                             axis = 1)

In [102]:
#Dropping irrelevant columns
df.drop(["original_sentence_lower", "original_passage_lower",
         "entities_proper_names", "entities_not_proper_names", 
         "entities_proper_keywords", "entities_not_proper_keywords"], 
        axis = 1, 
        inplace = True)

In [104]:
#Saving to CSV
#The path is relative to this notebook's folder, so the working directory is never changed
#and a failed write cannot leave the kernel in the wrong directory
df.to_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_parsed_data.csv"))