In [None]:
import pandas as pd
import re
import requests
import nltk
import sys
sys.path.append("wikipedia_wordclouds/")
sys.path.append("inverted_index/")
from wikipedia_wordclouds.wikipedia_corpus_creator import WikiCorpusCreator
from inverted_index.inverted_index import InvertedIndex
#import spacy
#import pytextrank
#from keybert import KeyBERT
#from keyphrase_vectorizers import KeyphraseCountVectorizer

In [None]:
# CSV with the speaker -> party mapping; it is joined on its 'speaker' column below.
PATH_TO_POLITICIANS_PARTY_MAPPING = "speaker_party.csv"
# CSV with the tagged statements; it must also provide a 'speaker' column for the join.
PATH_TO_ALL_TAGGED_STATEMENTS = "all_tagged.csv"

First, the two datasets are joined so that each statement is enriched with the speaker's party. Only statements whose speaker could be mapped to a party are kept.

In [None]:
# Load the speaker/party mapping table.
df_party_mapping = pd.read_csv(PATH_TO_POLITICIANS_PARTY_MAPPING)

In [None]:
# Load the tagged statements.
df_all_tagged = pd.read_csv(PATH_TO_ALL_TAGGED_STATEMENTS)

In [None]:
# Left-join the mapping onto the statements via 'speaker'; statements whose
# speaker is missing from the mapping get NaN in the mapped columns.
# NOTE(review): a speaker listed more than once in the mapping would duplicate
# that speaker's statements - confirm the mapping is unique per speaker.
df_joined = df_all_tagged.join(df_party_mapping.set_index('speaker'), on='speaker')

In [None]:
# Keep only the statements for which a party could be resolved.
df_preprocessed = df_joined.loc[df_joined["party"].notna()]

In [None]:
# Persist the filtered statements without the index column; the inverted index
# further below is built from this file.
df_preprocessed.to_csv("tagged_with_party.csv",index = False)

### Now different approaches for creating corpora using text from Wikipedia are used to find statements regarding specific topics

Get the text from Wikipedia

In [None]:
# Corpus creator for the topic "Klimawandel"; presumably stop words are kept
# because exclude_stopwords=False - TODO confirm against WikiCorpusCreator.
wcc = WikiCorpusCreator("Klimawandel",exclude_stopwords=False)

In [None]:
# Text returned by the corpus creator; it is tokenized further below to extract
# the topic-specific words.
text = wcc.get_text()

Create the inverted index

In [None]:
# Index over the CSV written above; "speech" is presumably the name of the
# column that gets indexed - TODO confirm against InvertedIndex.
invert_index = InvertedIndex("tagged_with_party.csv", "speech")

In [None]:
# Presumably builds the index so that invert_index.search(...) can be used below.
invert_index.create()

In [None]:
def find_all_documents_containing_words_in_corpus(inverted_index, word_set):
    """Look up every word of ``word_set`` in ``inverted_index``.

    Parameters
    ----------
    inverted_index
        Object exposing ``search(word)``, which returns a collection of
        hashable hits, or ``None`` when nothing was found.
    word_set
        Iterable of words to look up.

    Returns
    -------
    tuple
        ``(documents, results_per_word)``: ``documents`` is the set of all
        hits over all words, ``results_per_word`` maps each word to the number
        of hits it produced (0 when ``search`` returned ``None``).
    """
    documents = set()
    results_per_word = dict()

    for word in word_set:
        # Query the index handed in by the caller instead of relying on a
        # notebook-global variable, so the function works with any index.
        results = inverted_index.search(word)

        if results is None:
            results_per_word[word] = 0
            continue

        documents.update(results)
        results_per_word[word] = len(results)

    return documents, results_per_word

### Spacy with textrank

execute in shell:
```python -m spacy download de_core_news_lg```

In [None]:
#model = spacy.load("de_core_news_lg")

In [None]:
#model.add_pipe("textrank")

In [None]:
#doc = model(text)

In [None]:
#stop_words = ["Jahr","Million","Teil","Milliarde","Beginn","Folge","Zeit","Sprache","Bereich","Beispiel"]

In [None]:
#for phrase in doc._.phrases[:300]:
#    current = phrase.text
#    if len(current) > 3 and ' ' not in current and '„' not in current:
#        
#        should_be_handled = True
#        
#        for word in stop_words:
#            if word in current:
#                should_be_handled = False
#        
#        if should_be_handled:
#            print(phrase.text)

### KeyBert

In [None]:
#nlp = spacy.load("de_core_news_lg", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

In [None]:
#kw_model = KeyBERT(model=nlp)

In [None]:
#keywords = kw_model.extract_keywords(text)

In [None]:
#keywords

### Using a vectorizer in combination with spacy

In [None]:
#vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_lg', pos_pattern='<ADJ.*>*<N.*>+', stop_words='german')

In [None]:
#vectorizer.fit([text])

In [None]:
#keyphrases = vectorizer.get_feature_names_out()

In [None]:
#for phrase in keyphrases:
#    if ' ' not in phrase:
#        print(phrase)

### Individual approach using a large frequency list

https://wacky.sslmit.unibo.it/doku.php?id=frequency_lists



https://www.sketchengine.eu/dewac-german-corpus/

Add an approach which is based on a frequency list

* replaced space with tab in vim using :%s/\s/\t/g
* removed the entry " as it caused problems while reading the file

In [None]:
# Read the unigram frequency list: tab-separated, Latin-1 encoded, no header row.
# NOTE(review): read_csv's default NA handling turns entries such as "null" or
# "NA" into NaN - confirm that this is acceptable for a word list.
df_frequencies = pd.read_csv("sorted.de.word.unigrams",encoding="Latin-1",sep="\t", header = None)

In [None]:
# Name the two columns of the headerless file: first the count, then the word.
df_frequencies.columns = ["amount","word"]

Preprocessing of the unigrams file

In [None]:
def string_is_word(input_string):
    """Return True if ``input_string`` consists only of German letters.

    Strings that are entirely upper case are rejected. Every other string is
    accepted only if all of its characters are ASCII letters, umlauts or 'ß'.

    Parameters
    ----------
    input_string : str
        Candidate string.

    Returns
    -------
    bool
    """
    if input_string.isupper():
        return False

    # fullmatch instead of match with "$": "$" also matches right before a
    # trailing newline, which would let "Wort\n" pass as a word.
    return bool(re.fullmatch("[A-Za-zÖÄÜöäüß]+", input_string))

In [None]:
# Make sure every entry of the word column is a Python string.
df_frequencies["word"] = df_frequencies["word"].apply(str)

In [None]:
# Flag the entries that string_is_word accepts.
df_frequencies["is_word"] = df_frequencies["word"].apply(string_is_word)

In [None]:
# Number of characters per entry.
df_frequencies["word_length"] = df_frequencies["word"].apply(len)

In [None]:
# Number of distinct characters per entry, ignoring case.
df_frequencies["amount_unique_characters"] = df_frequencies["word"].apply(
    lambda word: len(set(word.lower()))
)

In [None]:
# Keep entries that are words, are longer than 3 characters and contain more
# than 2 distinct characters.
df_frequencies_preprocessed = df_frequencies.loc[
    df_frequencies["is_word"]
    & df_frequencies["word_length"].gt(3)
    & df_frequencies["amount_unique_characters"].gt(2)
]

Keep only the words that appear more than 10 times

In [None]:
# Drop the rare entries: only rows with an amount above 10 remain.
df_preprocessed_filtered = df_frequencies_preprocessed.loc[
    df_frequencies_preprocessed["amount"].gt(10)
]

In [None]:
# Share of each word in the total count of the remaining words.
# .assign returns a new frame instead of writing a column into a frame that was
# obtained by boolean slicing; this avoids pandas' SettingWithCopyWarning and
# keeps the cell idempotent when it is re-run.
df_preprocessed_filtered = df_preprocessed_filtered.assign(
    percentage=df_preprocessed_filtered["amount"] / df_preprocessed_filtered["amount"].sum()
)

Create a dict that maps each word to its percentage; it is used for the classification later

In [None]:
# Lookup table word -> percentage.
word_percentage_dict = df_preprocessed_filtered.set_index("word")["percentage"].to_dict()

In [None]:
# Average percentage over all remaining words; it is passed as the threshold to
# the classification functions below.
mean_percentage = float(df_preprocessed_filtered["percentage"].mean())

Define functions that are used for finding specific words in a text

In [None]:
def get_specific_words(text, word_dict, treshold, consider_words = None):
    """Collect the capitalised tokens of ``text`` that are classified as specific.

    The text is tokenized with NLTK's German settings. Every token that starts
    with an upper-case character is passed to
    ``classify_word_as_specific_word``; accepted tokens are returned.

    Parameters
    ----------
    text : str
        Text to analyse.
    word_dict : dict
        Mapping word -> relative frequency, forwarded to the classifier.
    treshold : float
        Frequency threshold forwarded to the classifier. The misspelt
        parameter name is kept so that existing keyword callers keep working.
    consider_words : iterable of str, optional
        Fragments forwarded to the classifier; ``None`` disables them.

    Returns
    -------
    set of str
        The distinct accepted tokens.
    """
    tokens = nltk.word_tokenize(text, language='german')

    # Forward the arguments of this call instead of reading notebook-global
    # variables, so the function honours the dictionary and threshold it is
    # given. The classifier already treats consider_words=None as "no
    # fragments", so no separate code path is needed.
    return {
        token
        for token in tokens
        if token[0].isupper()
        and classify_word_as_specific_word(token, word_dict, treshold, consider_words)
    }
    

def classify_word_as_specific_word(word, word_dict, threshold, consider_words = None):
    """Decide whether ``word`` counts as a specific word.

    Rules, checked in this order:

    1. If ``consider_words`` is given and any of its entries is a substring
       of ``word``, the word is accepted.
    2. Words shorter than 4 characters are rejected.
    3. Words missing from ``word_dict`` are accepted.
    4. Otherwise the word is accepted only if its value in ``word_dict`` is
       below ``threshold``.

    Returns
    -------
    bool
    """
    if consider_words is not None and any(fragment in word for fragment in consider_words):
        return True

    if len(word) < 4:
        return False

    if word not in word_dict:
        return True

    return bool(word_dict[word] < threshold)

In [None]:
# Example: "Klimaschutz" contains the fragment "Klima", so it is accepted
# regardless of the frequency list.
classify_word_as_specific_word("Klimaschutz",word_percentage_dict,mean_percentage,consider_words=["Klima","Treibhaus"])

In [None]:
# Same word without fragments: the result now depends only on the word length
# and the frequency list.
classify_word_as_specific_word("Klimaschutz",word_percentage_dict,mean_percentage)

In [None]:
# Extract the specific words from the Wikipedia text; tokens containing "Klima"
# or "Treibhaus" are always accepted.
word_set_climate = get_specific_words(text, 
                                      word_percentage_dict, 
                                      mean_percentage, 
                                      consider_words=["Klima","Treibhaus"])

In [None]:
# Tuple of (set of all hits, dict word -> number of hits) for the climate words.
documents_climate = find_all_documents_containing_words_in_corpus(invert_index, word_set_climate)

898

In [None]:
# Number of distinct hits over all climate words.
len(documents_climate[0])

In [None]:
# Hits per word, ordered from most to fewest hits.
dict(sorted(documents_climate[1].items(), key=lambda item: item[1],reverse=True))

In [None]:
# Inspect one hit as an example. NOTE(review): the hits are stored in a set, so
# the element at position 201 is arbitrary and may differ between kernel runs.
list(documents_climate[0])[201]

In [None]:
def get_indices_from_documents(documents):
    """Return the first component of every hit stored in ``documents[0]``.

    ``documents`` is expected to be indexable, with ``documents[0]`` an
    iterable of indexable hits (for example tuples). The result is a list
    in iteration order of ``documents[0]``.
    """
    return [hit[0] for hit in documents[0]]

In [None]:
# First component of every hit; used below as row positions.
indices_climate = get_indices_from_documents(documents_climate)

In [None]:
# Select the matching statements by position.
# NOTE(review): assumes the first component of a hit is the row position in
# "tagged_with_party.csv", which was written from df_preprocessed without its
# index - confirm against InvertedIndex.
df_climate_speeches = df_preprocessed.iloc[indices_climate]

In [None]:
# Number of selected statements per party.
df_climate_speeches["party"].value_counts()

In [None]:
# Number of statements per party in the full dataset, for comparison.
df_preprocessed["party"].value_counts()

In [None]:
# Number of selected statements per speaker.
df_climate_speeches["speaker"].value_counts()

Now the sentiment analysis is executed