In [None]:
import pandas as pd
import re
import requests
import nltk
import sys
sys.path.append("wikipedia_wordclouds/")
sys.path.append("inverted_index/")
from wikipedia_wordclouds.wikipedia_corpus_creator import WikiCorpusCreator
from inverted_index.inverted_index import InvertedIndex

In [None]:
# --- Inputs ---
# CSV that is joined on its 'speaker' column to attach the 'party' column.
PATH_TO_POLITICIANS_PARTY_MAPPING = "../../data/speaker_party_unique.csv"
# CSV with the tagged statements; joined with the mapping above via 'speaker'.
PATH_TO_ALL_TAGGED_STATEMENTS = "../../data/protocol_obtainment/political_statements_thesis.csv"
# Unigram frequency list; read below as a tab-separated, header-less Latin-1 file.
PATH_TO_DEWAC_CORPUS = "../../big_datasets/sorted.de.word.unigrams"

# --- Outputs ---
# Destination of the filtered frequency list produced in this notebook.
TARGET_PATH_FREQUENCY_LIST = "../../data/research_question_1/preprocessed_frequency_list.csv"
# Destination of the statements that have a party; also the input of the inverted index.
TARGET_PATH_TAGGED_WITH_PARTY = "../../data/tagged_with_party.csv"

First, the two datasets are joined so that each statement gets the unique speaker name and the speaker's party attached. Only statements for which a party could be found are kept. This step is important because many speakers appear in several slightly different spellings.

In [None]:
df_party_mapping = pd.read_csv(PATH_TO_POLITICIANS_PARTY_MAPPING)

In [None]:
df_all_tagged = pd.read_csv(PATH_TO_ALL_TAGGED_STATEMENTS)

In [None]:
df_joined = df_all_tagged.join(df_party_mapping.set_index('speaker'), on='speaker')

In [None]:
# Keep only the statements for which the join found a party.
df_preprocessed = df_joined[df_joined["party"].notna()]

In [None]:
df_preprocessed.to_csv(TARGET_PATH_TAGGED_WITH_PARTY,index = False)

In [None]:
df_preprocessed.head(5)

# Preparation

The following blocks show the preparation of the frequency list and a simple example using the approach which is based on the frequency list. The input is always the article from Wikipedia. 

Get the article text from Wikipedia

In [None]:
wcc = WikiCorpusCreator("Klimawandel",exclude_stopwords=False)

In [None]:
text = wcc.get_text()

Create the inverted index

In [None]:
invert_index = InvertedIndex(TARGET_PATH_TAGGED_WITH_PARTY, "speech")

In [None]:
invert_index.create()

In [None]:
def find_all_documents_containing_words_in_corpus(inverted_index, word_set):
    """Look up every word of ``word_set`` in ``inverted_index`` and collect the hits.

    Args:
        inverted_index: Object exposing ``search(word)``. The method is expected
            to return a collection of hashable hits, or ``None`` when the word
            is not contained in the index.
        word_set: Iterable of words to look up.

    Returns:
        A tuple ``(hits, results_per_word)`` where ``hits`` is the set of all
        hits over all words and ``results_per_word`` maps each word to the
        number of hits found for it (``0`` when the search returned ``None``).
    """
    hits = []
    results_per_word = {}

    for word in word_set:
        # Query the index that was passed in, not a notebook-level global,
        # so the function works with any index instance.
        results = inverted_index.search(word)

        if results is None:
            results_per_word[word] = 0
            continue

        hits.extend(results)
        results_per_word[word] = len(results)

    return set(hits), results_per_word

### Individual approach using a large frequency list

https://wacky.sslmit.unibo.it/doku.php?id=frequency_lists



https://www.sketchengine.eu/dewac-german-corpus/

Preprocessing of the deWaC corpus file (both steps modify the file in place):
* replace every whitespace character with a tab in vim using ```:%s/\s/\t/g```
* remove the entry `"` (line 7), as it caused problems while reading the file

In [None]:
# Rewrites the corpus file in place: every whitespace character becomes a tab.
# The path is hard-coded here and has to match PATH_TO_DEWAC_CORPUS.
!vim -c "%s/\s/\t/g | wq" ../../big_datasets/sorted.de.word.unigrams

In [None]:
# Deletes line 7 of the corpus file in place and saves the file.
# NOTE(review): not idempotent - every execution removes whatever is on line 7
# at that moment, so run this cell only once per fresh copy of the file.
!vim -c "7d | wq" ../../big_datasets/sorted.de.word.unigrams

In [None]:
# Read the tab-separated, header-less unigram list (Latin-1 encoded).
# NOTE(review): with pandas' default NA handling, tokens such as "null" or "NA"
# are presumably parsed as NaN - confirm, and consider keep_default_na=False.
df_frequencies = pd.read_csv(PATH_TO_DEWAC_CORPUS,encoding="Latin-1",sep="\t", header = None)

In [None]:
df_frequencies.columns = ["amount","word"]

In [None]:
df_frequencies

Preprocessing of the unigrams file

In [None]:
def string_is_word(input_string):
    """Tell whether ``input_string`` looks like a regular German word.

    A string qualifies when it is not written entirely in upper case and
    consists only of the letters A-Z, a-z, the German umlauts and 'ß'.

    Returns:
        ``True`` if the string qualifies, ``False`` otherwise.
    """
    # The upper-case check runs first; the regex is only evaluated afterwards.
    return (not input_string.isupper()) and (
        re.match("^[A-Za-zÖÄÜöäüß]+$", input_string) is not None
    )

In [None]:
# Cast every entry to str so the string-based checks below work on all rows.
df_frequencies["word"] = df_frequencies["word"].apply(str)

In [None]:
# Flag the rows whose entry passes string_is_word.
df_frequencies["is_word"] = df_frequencies["word"].apply(string_is_word)

In [None]:
# Number of characters per entry.
df_frequencies["word_length"] = df_frequencies["word"].apply(len)

In [None]:
# Number of distinct characters per entry, ignoring case.
df_frequencies["amount_unique_characters"] = df_frequencies["word"].apply(
    lambda token: len(set(token.lower()))
)

In [None]:
# Keep entries that are words, longer than 3 characters and built from
# more than 2 distinct characters.
df_frequencies_preprocessed = df_frequencies.loc[
    df_frequencies["is_word"]
    & df_frequencies["word_length"].gt(3)
    & df_frequencies["amount_unique_characters"].gt(2)
]

Keep only the words that appear more than 10 times

In [None]:
df_preprocessed_filtered = df_frequencies_preprocessed[(df_frequencies_preprocessed["amount"] > 10)]

In [None]:
# Share of each word in the total count of all remaining words.
# `assign` returns a new frame, so no column is written into a filtered slice
# (which would raise a SettingWithCopyWarning) and the cell can be re-run safely.
df_preprocessed_filtered = df_preprocessed_filtered.assign(
    percentage=df_preprocessed_filtered["amount"] / df_preprocessed_filtered["amount"].sum()
)

In [None]:
df_preprocessed_filtered[["word","amount","word_length","amount_unique_characters","percentage"]].to_csv(TARGET_PATH_FREQUENCY_LIST,index=False)

Create a dict that maps each word to its percentage; it is used for the classification later

In [None]:
word_percentage_dict = pd.Series(df_preprocessed_filtered.percentage.values,index=df_preprocessed_filtered.word).to_dict()

In [None]:
mean_percentage = float(df_preprocessed_filtered["percentage"].mean())

Define functions that are used for finding specific words in a text

In [None]:
def get_specific_words(text, word_dict, treshold, consider_words = None):
    """Extract the capitalised tokens of ``text`` that are classified as specific.

    The text is tokenised with NLTK's German tokenizer. A token is kept when
    its first character is upper case and
    ``classify_word_as_specific_word`` returns ``True`` for it.

    Args:
        text: Text to analyse.
        word_dict: Mapping from word to its relative frequency.
        treshold: Frequency limit handed to the classifier. The parameter name
            keeps its original spelling so keyword callers keep working.
        consider_words: Optional list of substrings; handed to the classifier
            unchanged.

    Returns:
        Set of the tokens that were kept.
    """
    tokens = nltk.word_tokenize(text, language='german')

    # The classifier receives the arguments of this call, not notebook-level
    # globals, so a different frequency dict or threshold takes effect.
    # It handles consider_words=None itself, so one call covers both cases.
    return {
        token
        for token in tokens
        if token[:1].isupper()
        and classify_word_as_specific_word(token, word_dict, treshold, consider_words)
    }
    

def classify_word_as_specific_word(word, word_dict, threshold, consider_words = None):
    """Decide whether ``word`` counts as a specific word.

    Rules, checked in this order:
      1. The word contains one of ``consider_words`` as a substring -> True.
      2. The word has fewer than 4 characters -> False.
      3. The word is missing from ``word_dict`` -> True.
      4. Otherwise True only if its value in ``word_dict`` is below ``threshold``.

    Args:
        word: Word to classify.
        word_dict: Mapping from word to a comparable frequency value.
        threshold: Upper limit (exclusive) for a known word to be specific.
        consider_words: Optional iterable of substrings that force a True result.

    Returns:
        ``True`` if the word is classified as specific, ``False`` otherwise.
    """
    if consider_words is not None and any(marker in word for marker in consider_words):
        return True

    if len(word) < 4:
        return False

    if word not in word_dict:
        return True

    return bool(word_dict[word] < threshold)

def get_documents_per_term_ordered(documents,reverse=True):
    """Return the per-term counts of ``documents`` ordered by count.

    Args:
        documents: Sequence whose second element is a dict mapping a term to
            its count.
        reverse: Sort descending when ``True`` (default), ascending otherwise.

    Returns:
        A new dict with the same items, inserted in sorted order.
    """
    term_counts = list(documents[1].items())
    term_counts.sort(key=lambda pair: pair[1], reverse=reverse)
    return dict(term_counts)


def get_indices_from_documents(documents):
    """Collect the first element of every entry in ``documents[0]``.

    Args:
        documents: Sequence whose first element is an iterable of indexable
            entries.

    Returns:
        List with ``entry[0]`` for each entry, in iteration order.
    """
    return [entry[0] for entry in documents[0]]

The topic-related terms are extracted and stored in a set

In [None]:
word_set_climate = get_specific_words(text, 
                                      word_percentage_dict, 
                                      mean_percentage, 
                                      consider_words=["Klima","Treibhaus"])

The documents are retrieved and the number of hits per topic-related term is counted; the indices are stored in a list further below

In [None]:
documents_climate = find_all_documents_containing_words_in_corpus(invert_index, word_set_climate)

Shows the number of documents that were found per topic-related term

In [None]:
get_documents_per_term_ordered(documents_climate)

All the indices (from the dataframe) are stored in a list

In [None]:
indices_climate = get_indices_from_documents(documents_climate)

The speeches are extracted from the dataframe

In [None]:
df_climate_speeches = df_preprocessed.iloc[indices_climate]

In [None]:
df_climate_speeches.head(5)