In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import operator
import csv

In [None]:
PATH_TO_TAGGED_STATEMENTS = "all_tagged.csv"
PATH_SAMPLE_FOLDER = "samples/"

Import the .csv file containing all political statements together with their POS-tagged versions. The RFTagger was used for tagging.

In [None]:
df = pd.read_csv(PATH_TO_TAGGED_STATEMENTS)

This notebook aims at reimplementing the approach followed by Othman et al. in the paper **Using NLP Approach for Opinion Types Classifier**.

Four different categories are defined in the paper:
 - Non-Opinionated Statement
 - Comparative Opinionated Statement
 - Superlative Opinionated Statement 
 - Opinionated Statement
 
The table below shows which tags, if any, are relevant for a category:
 
| Sentimental Category | POS Tags  |
| --- | --- |
| Non-Opinionated Statement |  -- |
| Comparative Opinionated Statement |  JJR, RBR |
| Superlative Opinionated Statement |  JJS, RBS |
| Opinionated Statement |  JJ |

The next table shows what each tag stands for in the tagger that was used in the paper:

| POS Tag | Description  | Example |
| --- | --- | --- |
| JJ | Adjective  | Big |
| JJR | Adjective comparative  | Bigger |
| JJS | Adjective superlative  | Biggest |
| RBR | Adverb comparative  | Better |
| RBS | Adverb superlative  | Best |

The RFTagger provides the following tags:

| POS Tag (Paper) | POS Tag (RFTagger)  | 
| --- | --- |
| JJ | ADJA, ADJD  |
| JJR | ADJA.Comp, ADJD.Comp  |
| JJS | ADJA.Sup, ADJD.Sup | 
| RBR | Closest match: ADJD.Comp | 
| RBS | Closest match: ADJD.Sup |

In [None]:
df.shape[0]

In [None]:
def contains_tag(sentence, tag):
    """Return True if ``tag`` occurs in ``sentence``.

    This is a plain ``in`` check. For a string ``sentence`` it is a substring
    test, so a short tag such as "ADJA" also matches longer tags that contain
    it (e.g. "ADJA.Comp").
    """
    return tag in sentence

def analyze_dataframe_for_tags(df, tags, mode="and"):
    """Add one boolean column to ``df`` flagging rows whose "tagged" text contains the tags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "tagged" column. The result column is added in place.
    tags : list of str
        Tags to look for (see ``contains_tag`` for the matching rule).
    mode : {"and", "or"}
        "and": a row is flagged when every tag is present.
        "or": a row is flagged when at least one tag is present.

    Returns
    -------
    pandas.DataFrame or None
        The same ``df`` object with a new column named ``"<tag1>_<tag2>_..._<mode>"``.
        For an unknown ``mode`` a hint is printed and None is returned.

    Notes
    -----
    With an empty ``tags`` list the new column is all True for either mode.
    """
    if mode not in ("and", "or"):
        print("Please use either 'and' or 'or' as mode")
        return

    # A single tag behaves the same for both modes, so it is handled by the AND path.
    combine_with_or = mode == "or" and len(tags) > 1

    # Neutral start value of the reduction: all False for OR, all True for AND.
    if combine_with_or:
        final_truth_values = np.zeros(df.shape[0], dtype=bool)
    else:
        final_truth_values = np.ones(df.shape[0], dtype=bool)

    for tag in tags:
        # The per-tag mask is kept in a local array instead of a scratch column on df,
        # so no existing column of the caller's frame can be overwritten or dropped.
        # ``tag=tag`` binds the current tag to the lambda explicitly.
        tag_mask = df["tagged"].apply(
            lambda text, tag=tag: contains_tag(text, tag)).to_numpy()

        if combine_with_or:
            final_truth_values = final_truth_values | tag_mask
        else:
            final_truth_values = final_truth_values & tag_mask

    final_column_name = "".join(tag + "_" for tag in tags) + mode

    df[final_column_name] = final_truth_values

    return df

In [None]:
# Each call adds one boolean "<tags>_or" column to df in place.
analyze_dataframe_for_tags(df,["ADJA.Comp", "ADJD.Comp"],mode="or")
analyze_dataframe_for_tags(df,["ADJA.Sup", "ADJD.Sup"],mode="or")
# NOTE(review): contains_tag uses `in`; assuming "tagged" holds strings, "ADJA"/"ADJD" also
# match the ".Comp"/".Sup" variants, which is why later cells exclude those columns explicitly.
analyze_dataframe_for_tags(df,["ADJA", "ADJD"],mode="or")
# print() is the last statement, so the frame returned above is not displayed as cell output.
print()

Amount of statements for each of the tag combinations added above

In [None]:
df.shape[0]

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["ADJA.Comp_ADJD.Comp_or"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER + "samples_pos_comp.csv")

In [None]:
df[df["ADJA.Sup_ADJD.Sup_or"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["ADJA.Sup_ADJD.Sup_or"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER + "samples_pos_sup.csv")

In [None]:
df[df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER + "samples_pos.csv")

In [None]:
df[df["ADJA_ADJD_or"]].shape[0]

In [None]:
df[~df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

### Evaluation of opinionated statements

First we define some functions that are used to analyze the words which were tagged with certain tags by the RFTagger

In [None]:
def get_value_counts_of_tagged_words_by_postag(df_tagged, postag):
    """Count how often each word carries ``postag`` in ``df_tagged["tagged"]``.

    Every text of the "tagged" column is scanned with ``find_desired_words``;
    the per-text results are flattened into one list of words. The words are
    put into a one-column frame (column name: ``postag`` in upper case) and
    the result of ``DataFrame.value_counts()`` on that frame is returned.
    """
    words_per_text = df_tagged["tagged"].apply(
        lambda text: find_desired_words(text, postag))

    all_words = flatten_list_of_lists(list(words_per_text))

    counts_frame = pd.DataFrame(all_words, columns=[postag.upper()])
    return counts_frame.value_counts()
    

def flatten_list_of_lists(list_of_lists):
    """Concatenate the items of all sub-iterables into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

def find_desired_words(sentence, tag):
    """Return the words of a tagged sentence whose token contains ``tag``.

    The sentence is split on single spaces. A token is selected when ``tag``
    occurs anywhere in it (substring test); the part of the token before the
    first tab character is returned as the word.
    """
    return [
        token.split("\t")[0]
        for token in sentence.split(" ")
        if tag in token
    ]

def sentence_only_contains_words_that_should_be_ignored(sentence, tag, ignore_set):
    """Return True if every word tagged with ``tag`` in ``sentence`` is in ``ignore_set``.

    The tagged words are collected with ``find_desired_words``. If the sentence
    has no word with that tag the result is True as well, because ``all`` of an
    empty sequence is True.
    """
    return all(
        word in ignore_set
        for word in find_desired_words(sentence, tag)
    )


#### First we want to display the words which were tagged with the ADJA and ADJD tags the most often

In [None]:
df_adja_adjd = df[df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]]

###### Analyze the words containing the ADJA tag

The words 'geehrte', 'letzten', 'geehrter', 'Geschätzte', 'Liebe', 'Hohes', 'Werte', 'Geschätzter' and 'geschätzte'  are going to be added to the set of ignored words, as they might only have to do with greetings

In [None]:
value_counts_adja = get_value_counts_of_tagged_words_by_postag(df_adja_adjd,"ADJA")

In [None]:
value_counts_adja.head(5)

##### Analyze words containing the ADJD tag

Here we cannot see a word that should clearly be added to the set of ignored words

In [None]:
value_counts_adjd = get_value_counts_of_tagged_words_by_postag(df_adja_adjd,"ADJD")

In [None]:
value_counts_adjd.head(5)

Defining the set of words that should be ignored 

In [None]:
# Words that do not count as evidence for an opinionated statement when they are the
# only tagged adjectives of a text. Every entry needs its own trailing comma: two
# adjacent string literals without a comma are silently concatenated into one entry.
adja_set_of_words_to_ignore = {
    'geehrte',
    'letzten',
    'geehrter',
    'Geschätzte',
    'Liebe',
    'Hohes',
    'Werte',
    'Geschätzter',
    'geschätzte',
    'geehrten',
    '„',
}

Create new columns that are used to filter out texts with words that should be ignored, where no other ADJA or ADJD tag is present

In [None]:
df["adja_to_ignore"] = df_adja_adjd["tagged"].apply(lambda sentence: 
            sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJA', adja_set_of_words_to_ignore))

In [None]:
df["adjd_to_ignore"] = df_adja_adjd["tagged"].apply(lambda sentence: 
            sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJD', adja_set_of_words_to_ignore))

Replacing NaN with True, as True is the value given to the texts that should be ignored

In [None]:
# Rows that were not part of the analysed subset are NaN after index alignment; they are
# treated as "ignore" (True). The explicit cast guarantees a real bool column: depending on
# the pandas version fillna can leave the column as object dtype, and `~` on an object
# column of Python bools yields -2/-1 instead of False/True.
df["adja_to_ignore"] = df["adja_to_ignore"].fillna(True).astype(bool)
df["adjd_to_ignore"] = df["adjd_to_ignore"].fillna(True).astype(bool)

Total amount of statements left after filtering out words that should be ignored

In [None]:
df[~df["adja_to_ignore"] | ~df["adjd_to_ignore"]].shape[0]

#### Now the same approach as above is implemented for ADJA.Comp and ADJD.Comp

In [None]:
df_comp = df[df["ADJA.Comp_ADJD.Comp_or"]]

'weitere', 'weiteren', 'weiterer' and 'weiteres' would not be classified as comparative according to https://de.wiktionary.org/wiki/weiter - They are added to the set of ignored words

In [None]:
value_counts_adja_comp = get_value_counts_of_tagged_words_by_postag(df_comp,"ADJA.Comp")

In [None]:
value_counts_adja_comp.head(10)

In [None]:
adja_comp_set_of_words_to_ignore = set(['weitere', 'weiteren', 'weiterer','weiteres'])

Now the same is done for the ADJD.Comp tag

Here we cannot see a word that should clearly be added to the set of ignored words

In [None]:
value_counts_adjd_comp = get_value_counts_of_tagged_words_by_postag(df_comp,"ADJD.Comp")

In [None]:
value_counts_adjd_comp.head(5)

In [None]:
df["adja_comp_to_ignore"] = df_comp["tagged"].apply(lambda sentence: sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJA.Comp', adja_comp_set_of_words_to_ignore))

In [None]:
df["adjd_comp_to_ignore"] = df_comp["tagged"].apply(lambda sentence: sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJD.Comp', adja_comp_set_of_words_to_ignore))

In [None]:
# Rows that were not part of the analysed subset are NaN after index alignment; they are
# treated as "ignore" (True). The explicit cast guarantees a real bool column: depending on
# the pandas version fillna can leave the column as object dtype, and `~` on an object
# column of Python bools yields -2/-1 instead of False/True.
df["adja_comp_to_ignore"] = df["adja_comp_to_ignore"].fillna(True).astype(bool)
df["adjd_comp_to_ignore"] = df["adjd_comp_to_ignore"].fillna(True).astype(bool)

In [None]:
df[~df["adjd_comp_to_ignore"] | ~df["adja_comp_to_ignore"]].shape[0]

#### Now the same approach as above is implemented for ADJA.Sup and ADJD.Sup

In [None]:
df_sup = df[df["ADJA.Sup_ADJD.Sup_or"]]

In [None]:
value_counts_adja_sup = get_value_counts_of_tagged_words_by_postag(df_sup,"ADJA.Sup")

In [None]:
value_counts_adja_sup.head(5)

In [None]:
value_counts_adjd_sup = get_value_counts_of_tagged_words_by_postag(df_sup, "ADJD.Sup")

In [None]:
value_counts_adjd_sup.head(5)

In [None]:
# Words that are not counted as real superlatives even when they carry a superlative tag.
superlatives_to_ignore = {
    'nendsten', 'herumzutesten', 'freitesten', 'dritthöchste',
    'übernächsten', 'übernächstes',
    'nächster', 'nächstes', 'nächste', 'nächsten',
    'Nächstes', 'Nächste', 'Nächster',
    'Obersten', 'Oberstes', 'Oberste',
    'Letztes',
    'Allerehrenwerteste', 'Hochgeschätzter', 'Wertgeschätzter',
    'Liebsten',
    'Bedarfsorientierten', 'Ex-Innenminister', 'Gespielte',
}

In [None]:
df["adja_sup_to_ignore"] = df_sup["tagged"].apply(lambda sentence: 
        sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJA.Sup', superlatives_to_ignore))

In [None]:
df["adjd_sup_to_ignore"] = df_sup["tagged"].apply(lambda sentence: 
        sentence_only_contains_words_that_should_be_ignored(sentence, 'ADJD.Sup', superlatives_to_ignore))

In [None]:
# Rows that were not part of the analysed subset are NaN after index alignment; they are
# treated as "ignore" (True). The explicit cast guarantees a real bool column: depending on
# the pandas version fillna can leave the column as object dtype, and `~` on an object
# column of Python bools yields -2/-1 instead of False/True.
df["adja_sup_to_ignore"] = df["adja_sup_to_ignore"].fillna(True).astype(bool)
df["adjd_sup_to_ignore"] = df["adjd_sup_to_ignore"].fillna(True).astype(bool)

In [None]:
df[~df["adjd_sup_to_ignore"] | ~df["adja_sup_to_ignore"]].shape[0]

Finding the number of non-opinionated statements after the cleaning process with the sets of words that should be ignored in each case

In [None]:
df[df["adjd_sup_to_ignore"] & df["adja_sup_to_ignore"] & df["adjd_comp_to_ignore"] 
   & df["adja_comp_to_ignore"] & df["adjd_to_ignore"] & df["adja_to_ignore"]].shape[0]

### Trying additional methods for finding sentences containing adjectives

The following dataset was extracted from Wiktionary and contains nearly all adjectives listed there, including the comparative and superlative forms where available. Some special cases were not included, as they are not relevant to our analysis.

In [None]:
PATH_TO_ADJECTIVES_DATASET = "adjektive/nearly_all.csv"

In [None]:
df_adjectives = pd.read_csv(PATH_TO_ADJECTIVES_DATASET)

Creating a set containing all the positive adjectives from the dataset above

In [None]:
set_positives = set(df_adjectives["positiv"].dropna().to_list())

In [None]:
def add_deklination(set_adjectives):
    """Return a set with every given word plus its variants ending in -e, -er, -en, -em and -es.

    The endings are appended mechanically to each word; the unchanged word is
    part of the result as well.
    """
    endings = ("", "e", "er", "en", "em", "es")
    return {word + ending for word in set_adjectives for ending in endings}

In [None]:
set_positives_with_deklination = add_deklination(set_positives)

In [None]:
len(set_positives)

In [None]:
len(set_positives.difference(adja_set_of_words_to_ignore))

In [None]:
set_positives_with_deklination = set_positives_with_deklination.difference(adja_set_of_words_to_ignore)

In [None]:
len(set_positives_with_deklination)

In [None]:
set_positives_with_deklination.difference(adja_set_of_words_to_ignore)

In [None]:
def contains_positive(sentence, set_positives):
    """Return True if at least one token of ``sentence`` is contained in ``set_positives``.

    The sentence is tokenized with NLTK's German word tokenizer; the comparison
    is an exact, case-sensitive membership test per token.
    """
    tokens = nltk.tokenize.word_tokenize(sentence, language='german')
    return any(token in set_positives for token in tokens)

In [None]:
df["contains_positive_with_set"] = df["speech"].apply(lambda sentence: contains_positive(sentence, set_positives))

In [None]:
df["contains_positive_with_deklination_set"] = df["speech"].apply(lambda sentence: contains_positive(sentence, set_positives_with_deklination))

In [None]:
df[df["contains_positive_with_set"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["contains_positive_with_set"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER + "sample_datadriven_positive.csv")

In [None]:
df[df["contains_positive_with_deklination_set"]].shape[0]

In [None]:
df[df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

In [None]:
df[(df["ADJA_ADJD_or"] & ~df["ADJA.Sup_ADJD.Sup_or"] & ~df["ADJA.Comp_ADJD.Comp_or"]) | df["contains_positive_with_set"]].shape[0]

### Trying additional methods for finding sentences containing comparative statements

Creating a set containing all the comparative forms from the dataset. A clean-up is performed so that special characters are removed from the set

In [None]:
# Missing comparative forms (NaN) are dropped first; len() below would fail on them.
set_comparative = set(df_adjectives["komparativ"].dropna().to_list())

# Entries shorter than three characters are placeholders/special characters, not words.
words_to_remove = [word for word in set_comparative if len(word) < 3]

set_comparative.difference_update(words_to_remove)

In [None]:
set_comparative_and_deklinations = add_deklination(set_comparative)

In [None]:
len(set_comparative)

In [None]:
set_comparative_and_deklinations = set_comparative_and_deklinations.difference(adja_comp_set_of_words_to_ignore)

In [None]:
len(set_comparative_and_deklinations)

Defining a function which is then used to return True if the text contains a comparative statement or False otherwise

In [None]:
def contains_comparative(sentence, comparative_words = None, execute_set_based_approach=False):
    """Return True if ``sentence`` matches at least one rule for comparative statements.

    Rules (regular expressions applied to the raw text):
      1. "so ... wie"
      2. "nicht so ... wie"
      3. "immer <word ending in -er>"
      4. "<word ending in -er> als <word>"
      5. "je ... desto" / "je ... umso"
      6. optional: any token of the sentence is contained in ``comparative_words``

    Parameters
    ----------
    sentence : str
        Text to check.
    comparative_words : set of str, optional
        Known comparative forms. When given, rules 3 and 4 only count if the
        matched "-er" word is in this set. Every occurrence of the pattern is
        checked, not only the first one, so "immer wieder ... immer besser" is
        recognised through "besser".
    execute_set_based_approach : bool
        Enables rule 6 (requires ``comparative_words``). The sentence is then
        tokenized with NLTK's German word tokenizer.

    Returns
    -------
    bool
    """
    def er_word_rule_applies(pattern, word_position):
        # Without a word set any match counts; with a word set at least one of the
        # matches must have a known comparative at ``word_position`` of the match.
        matches = list(re.finditer(pattern, sentence))
        if not matches:
            return False
        if comparative_words is None:
            return True
        return any(match.group(0).split(" ")[word_position] in comparative_words
                   for match in matches)

    # First case: so ... wie
    if re.search('(So|so) [a-zäöüß ]* wie', sentence):
        return True

    # Second case: nicht so ... wie
    if re.search('(Nicht|nicht) so [a-zäöüß ]* wie', sentence):
        return True

    # Third case: immer ... (the candidate word is the second word of the match)
    if er_word_rule_applies('(Immer|immer) [a-zäöüß]{2,60}er', 1):
        return True

    # Fourth case: als (the candidate word is the first word of the match)
    if er_word_rule_applies('[A-ZÄÖÜßa-zäöüß]+er als [a-zäöüß]+', 0):
        return True

    # Fifth case: je ... desto or je ... umso
    if re.search('(Je|je) [a-zäöüß ,]+(desto|umso)', sentence):
        return True

    # Sixth case: text contains any word that is specified in the set of comparative words
    # optional, as the set of comparative words is an optional parameter
    if comparative_words is not None and execute_set_based_approach:
        tokens = nltk.tokenize.word_tokenize(sentence, language='german')
        return any(token in comparative_words for token in tokens)

    return False

def contains_comparative_set_only(sentence, comparative_words):
    """Return True if at least one token of ``sentence`` is contained in ``comparative_words``.

    Only the word set is used, no pattern rules. The sentence is tokenized
    with NLTK's German word tokenizer.
    """
    tokens = nltk.tokenize.word_tokenize(sentence, language='german')
    return any(token in comparative_words for token in tokens)

Applying the function defined above to our dataframe. Once without the set of comparative adjectives and once including the set 

In [None]:
df["comparative_set_only"] = df["speech"].apply(lambda row: contains_comparative_set_only(row, set_comparative))

In [None]:
df[df["comparative_set_only"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["comparative_set_only"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER+"sample_datadriven_comp.csv")

In [None]:
df["contains_comparative"] = df["speech"].apply(lambda row: contains_comparative(row))

In [None]:
df["contains_comparative_with_set"] = df["speech"].apply(lambda row: contains_comparative(row,set_comparative))

In [None]:
df["contains_comparative_with_deklination_set"] = df["speech"].apply(lambda row: contains_comparative(row,set_comparative_and_deklinations))

In [None]:
df["contains_comparative_new_no_set"] = df["speech"].apply(lambda row: contains_comparative(row))

In [None]:
df["contains_comparative_new_with_set"] = df["speech"].apply(lambda row: contains_comparative(row,set_comparative))

In [None]:
df["contains_comparative_new_with_declension_set"] = df["speech"].apply(lambda row: contains_comparative(row,set_comparative_and_deklinations))

In [None]:
df["contains_comparative_new_with_declension_set_and_approach"] = df["speech"].apply(lambda row: contains_comparative(row,set_comparative_and_deklinations,True))

In [None]:
df[df["contains_comparative"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["contains_comparative"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER + "sample_rule_comp.csv")

In [None]:
df[df["contains_comparative_new_with_declension_set"]].shape[0]

In [None]:
df[df["contains_comparative_new_with_set"]].shape[0]

In [None]:
df[df["contains_comparative_new_with_declension_set_and_approach"]].shape[0]

In [None]:
df[df["contains_comparative_new_with_declension_set"] & df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

In [None]:
df[df["contains_comparative"]].iloc[26]["speech"]

In [None]:
sentence = 'Auch in meiner Zeit als Finanzminister wurden öffentliche Informationen in Form von Zeitungsinseraten vorgenommen und auch immer wieder Studien beauftragt, vor allem um aktuelle Maßnahmen zu bewerten, internationale Vergleichbarkeit zu erzielen und Handlungsnotwendigkeiten abzuleiten. Alle Studien in meiner Amtszeit sind übrigens öffentlich einsehbar.'
re.search('(Immer|immer) [a-zäöüß]{2,60}er',sentence).group(0).split(" ")[1]

In [None]:
sentence

##### The different methods are compared per amount of statements marked by each method

Amount of statements marked by tags

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

Amount of statements marked using comparative rules without a set of comparative adjectives

In [None]:
df[df["contains_comparative"]].shape[0]

Amount of statements marked using comparative rules and the set of comparative adjectives

In [None]:
df[df["contains_comparative_with_set"]].shape[0]

Amount of statements marked using comparative rules and the set of comparative adjectives (including their declined forms)

In [None]:
df[df["contains_comparative_with_deklination_set"]].shape[0]

Amount of statements which are marked by all three methods at the same time

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"] & 
   df["contains_comparative"] & 
   df["contains_comparative_with_set"]
].shape[0]

Amount of statements which are marked by at least one of the three methods

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"] | 
   df["contains_comparative"] | 
   df["contains_comparative_with_set"]
].shape[0]

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"] | 
   df["contains_comparative"] | 
   df["contains_comparative_with_set"] |
   df["contains_comparative_with_deklination_set"]
].shape[0]

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"] & 
   df["contains_comparative_with_deklination_set"]
].shape[0]

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"] | 
   df["contains_comparative_with_set"]
].shape[0]

In [None]:
df[df["ADJA.Comp_ADJD.Comp_or"]].shape[0]

#### Trying additional methods for finding sentences containing superlative statements

In this first method a simple regular expression is used to detect the patterns "am ....sten" and "am ....ßten", examples are "am besten", "am höchsten" or "am größten"

In [None]:
def contains_superlative(sentence):
    """Return True if ``sentence`` contains the pattern "am ...sten" or "am ...ßten".

    Examples: "am besten", "am höchsten", "am größten". The word boundaries make
    sure that "am" is a word of its own (so "Programm kosten" is not a hit) and
    that the second word really ends in "-sten"/"-ßten". A capitalised "Am" at
    the start of a sentence is accepted as well.
    """
    res = re.search(r'\b[Aa]m [a-zäöüß]+(?:s|ß)ten\b', sentence)

    return bool(res)

In [None]:
df["contains_superlative"] = df["speech"].apply(lambda row: contains_superlative(row))

In [None]:
df[df["ADJA.Sup_ADJD.Sup_or"]].shape[0]

Here we can see, that the amount of detected superlatives using the simple regular expression method alone leads to less than a tenth of cases that we had with the tagging method (477 vs 5259)

In [None]:
df[df["contains_superlative"]].shape[0]

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["contains_superlative"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER+"sample_rule_sup.csv")

Only 383 statements contain superlatives if both methods (regular expression and tagging) need to detect a superlative for it to count

In [None]:
# NOTE(review): `|` counts rows flagged by EITHER method (union). The text above this cell
# describes statements detected by BOTH methods, which would be `&` - confirm which is meant.
df[df["contains_superlative"] | 
   df["ADJA.Sup_ADJD.Sup_or"]
].shape[0]

In [None]:
df[~df["contains_superlative"] & 
   df["ADJA.Sup_ADJD.Sup_or"]
].shape[0]

The next section is dedicated to finding the words that are most commonly tagged as a superlative. This is helpful as it might show cases that lead to wrong classifications, for example 'nächste', which might be date-related in many cases and therefore should not lead to the statement being classified as an opinionated statement with a superlative

In [None]:
# NOTE(review): this repeats the find_desired_words definition from the
# "Evaluation of opinionated statements" section and rebinds the same name.
def find_desired_words(sentence, tag):
    """Return the words of a tagged sentence whose token contains ``tag``.

    The sentence is split on single spaces; for every token that contains
    ``tag`` as a substring, the part before the first tab is collected.
    """
    words = sentence.split(" ")
    res = []
    
    for word in words:
        if tag in word:
            res.append(word.split("\t")[0])
    
    return res

In [None]:
test_df = df[~df["contains_superlative"] & 
   df["ADJA.Sup_ADJD.Sup_or"]
]["tagged"]

In [None]:
# One list of tagged words per text: first for every text with the ADJA.Sup tag,
# then for every text again with the ADJD.Sup tag.
all_res = [
    find_desired_words(elem, sup_tag)
    for sup_tag in ("ADJA.Sup", "ADJD.Sup")
    for elem in test_df.values
]

In [None]:
# Frequency of every tagged word across all collected result lists.
res_dict = {}

for tagged_words in all_res:
    for word in tagged_words:
        res_dict[word] = res_dict.get(word, 0) + 1

The following line of code creates a sorted (descending order) dictionary where the keys are sorted according to the values. The result is used to get a list of the most common superlatives which are found using the tagging method 

In [None]:
sorted_d = dict( sorted(res_dict.items(), key=operator.itemgetter(1),reverse=True))

In [None]:
list(sorted_d.keys())[0:10]

In [None]:
test_df_2 = df[~df["contains_superlative"] & 
   df["ADJA.Sup_ADJD.Sup_or"]
]["speech"]

In [None]:
# Number of speeches that contain the phrase "der größten".
count = sum(1 for value in test_df_2.values if "der größten" in value)
count

In [None]:
naechsten_ind = test_df_2.apply(lambda row: "nächsten" in row)

In [None]:
test_df_2[naechsten_ind]

In [None]:
df.loc[19]["ADJA_ADJD_or"]

In the next block the following steps are implemented:

* Create a set of superlative words that should be ignored, for example 'nächste'
* Find statements where at least one of the tagged superlatives is not part of the set of words that should be ignored

In [None]:
# NOTE(review): same entries as the superlatives_to_ignore set defined in the
# "Evaluation of opinionated statements" section; this cell rebinds the name.
superlatives_to_ignore = set([
'nendsten',
'herumzutesten',
'freitesten',
'dritthöchste',
'übernächsten',
'übernächstes',
'nächster',
'nächstes',
'nächste',
'nächsten',
'Nächstes',
'Nächste',
'Obersten',
'Oberstes',
'Nächster',
'Oberste',
'Letztes',
'Allerehrenwerteste',
'Hochgeschätzter',
'Wertgeschätzter',
'Liebsten',
'Bedarfsorientierten',
'Ex-Innenminister',
'Gespielte'
])

Definition of two regular expressions which are used to find the tagged superlatives in the tagged texts.

In [None]:
# One pattern per tag is used instead of a combined (ADJA|ADJD) group. Presumably the
# combined form "did not work" because re.findall returns only the group text when the
# pattern has a capturing group; a non-capturing group (?:ADJA|ADJD) would avoid that.
# The dot of the tag is escaped so that it only matches a literal ".".
regx_adja = r'[A-Za-zÖÄÜßöäü]{2,100}\tADJA\.Sup[.A-Za-z]*'
regx_adjd = r'[A-Za-zÖÄÜßöäü]{2,100}\tADJD\.Sup[.A-Za-z]*'

In [None]:
regx_list = [regx_adja, regx_adjd]

In [None]:
def all_tagged_superlatives_should_be_ignored(text, regx_list, ignore_set):
    """Return True if every tagged superlative found in ``text`` is in ``ignore_set``.

    The tagged superlatives are collected with ``get_all_tagged_superlatives``;
    the word is the part of each hit before the first tab. A text without any
    hit yields True as well (``all`` of an empty sequence), which is the
    desired result here.
    """
    return all(
        tagged.split("\t")[0] in ignore_set
        for tagged in get_all_tagged_superlatives(text, regx_list)
    )

def get_all_tagged_superlatives(text, regx_list):
    """Return all ``re.findall`` hits in ``text``, pattern by pattern in the order of ``regx_list``."""
    return [hit for regx in regx_list for hit in re.findall(regx, text)]

In [None]:
df["has_no_real_superlative"] = df["tagged"].apply(lambda text: all_tagged_superlatives_should_be_ignored(text, regx_list, superlatives_to_ignore))

In [None]:
df[~df["has_no_real_superlative"]]

Make use of the superlatives from the wiktionary dataset

In [None]:
set_superlative = set(df_adjectives["superlativ"].to_list())

In [None]:
def clean_superlatives(input_set):
    """Return a cleaned list of the superlative entries in ``input_set``.

    Clean-up steps:
      * missing values (NaN / non-string entries) are dropped,
      * entries listing two forms ("am X ... am Y") are reduced to the first form "am X",
      * the special entry "am allerliebsten allerliebst" becomes "am allerliebsten",
      * the special entry "most stupid stupidest" is dropped.

    Entries are filtered while the list is built, so nothing is removed when the
    special entry is absent and an empty input simply gives an empty list.
    The order of the result follows the iteration order of ``input_set``.
    """
    # regex to find entries with two superlatives at once
    # this is the case if there are multiple forms
    two_forms = re.compile("am [A-Za-zöäßüÖÄÜ]+[ A-Za-zöäüßÖÄÜ]*am [A-Za-zöäüßÖÄÜ]+")

    cleaned_list = []

    for entry in input_set:
        # remove nan
        if not isinstance(entry, str) or entry == 'nan':
            continue

        if two_forms.match(entry):
            # keep the first form only; strip() removes the blank that separated
            # it from the second "am"
            entry = "am " + entry.split("am ")[1].strip()
        # special case
        elif entry == "am allerliebsten allerliebst":
            entry = "am allerliebsten"
        # special case: not a usable entry
        elif entry == "most stupid stupidest":
            continue

        cleaned_list.append(entry)

    return cleaned_list

In [None]:
cleaned_superlatives = clean_superlatives(set_superlative)

In [None]:
len(cleaned_superlatives)

In [None]:
[x for x in cleaned_superlatives if not x.startswith("am")]

In [None]:
# `superlatives_without_am` is only created further down, so this check is derived
# from `cleaned_superlatives` directly to keep the cell runnable on a fresh kernel.
[x for x in (s[len("am "):] if s.startswith("am ") else s for s in cleaned_superlatives) if len(x) == 1]

The following approach removes the 'am' from all the superlatives to simplify the computational effort and to get additional results when looking for these superlatives in the texts.

It simplifies the computation because we only need to split the text into words and then check whether each word is in the set of superlatives without 'am' at the beginning. This would not be possible if the strings had a structure like 'am besten': they would contain a whitespace, which the logic for splitting the text into words does not take into account. We would therefore need a solution with multiple loops, which would be far less efficient.

In [None]:
# Strip the leading "am " from every entry. The prefix is tested including the blank, so
# words that merely begin with the letters "am" are kept unchanged instead of being split.
superlatives_without_am = [
    elem[len("am "):] if elem.startswith("am ") else elem
    for elem in cleaned_superlatives
]

There are some additional superlatives with whitespace in them (which did not have 'am' at the beginning) - These will be removed using the list below

In [None]:
to_remove = [x for x in superlatives_without_am if len(x.split(" ")) > 1]

In this block all the superlatives that we want to ignore are removed from the previously created list

In [None]:
# Build a new list instead of calling remove() while iterating over the same list:
# removing during iteration skips the element that follows every removed one.
multiword_entries = set(to_remove)
superlatives_without_am = [
    elem for elem in superlatives_without_am
    if elem not in multiword_entries
    and elem not in superlatives_to_ignore
    and len(elem) != 1
]
    

The final set which is used for the search of superlatives in the texts

In [None]:
superlatives_without_am = set(superlatives_without_am)

In [None]:
def contains_superlative_word(sentence, superlatives_without_am):
    """Return True if at least one token of ``sentence`` is contained in ``superlatives_without_am``.

    The sentence is tokenized with NLTK's German word tokenizer; the comparison
    is an exact, case-sensitive membership test per token.
    """
    tokens = nltk.tokenize.word_tokenize(sentence, language='german')
    return any(token in superlatives_without_am for token in tokens)

In [None]:
df["has_superlative_word"] = df["speech"].apply(lambda sentence: contains_superlative_word(sentence, superlatives_without_am))

In [None]:
len(superlatives_without_am)

At the beginning there were around 3.700 cases, after the removal of the words that were defined as 'superlatives to ignore' only around 2.600 cases were left. After some final dataset clean-up (removal of '-') 1973 cases with superlatives were found using this method 

In [None]:
# Fixed random_state so the exported sample is identical on every run of the notebook.
df[df["has_superlative_word"]].sample(200, random_state=42).to_csv(PATH_SAMPLE_FOLDER+"sample_datadriven_sup.csv")

In [None]:
df[df["has_superlative_word"] | df["ADJA.Sup_ADJD.Sup_or"]].shape[0]

In [None]:
df.columns

### Evaluation

In [None]:
#df["ADJA_SUP_LEN"] = df[df["ADJA.Sup_ADJD.Sup_or"]]["speech"].apply(lambda text: len(text))

In [None]:
# Fixed random_state so the evaluation sample is identical on every run of the notebook.
sup_index = df[df["ADJA.Sup_ADJD.Sup_or"]].sample(50, random_state=42).index

In [None]:
# sup_index holds index labels (it was taken from .index), so the rows are selected with
# .loc; .iloc would read the labels as row positions, which only coincides with the labels
# while df has a default RangeIndex.
df.loc[sup_index, ["speech","ADJA.Sup_ADJD.Sup_or"]].to_csv("sup_eval_data.csv",sep=";",quoting=csv.QUOTE_ALL,index=False)

In [None]:
df_unique_speakers = pd.Series(df["speaker"].unique()).to_frame()
df_unique_speakers.columns = ["speaker"]
df_unique_speakers.to_csv("speakers.csv", index=False)