In [None]:
import pandas as pd
import re
import requests
import nltk
import sys
sys.path.append("wikipedia_wordclouds/")
sys.path.append("inverted_index/")
from wikipedia_wordclouds.wikipedia_corpus_creator import WikiCorpusCreator
from inverted_index.inverted_index import InvertedIndex

In [None]:
# Relative path of the data folder; every path below is built on top of it.
PATH_TO_DATA_FOLDER = "../../data/"

# Inputs: the full CSV of political statements and the word-frequency list.
PATH_TO_FULL_CSV = f"{PATH_TO_DATA_FOLDER}protocol_obtainment/political_statements_thesis.csv"
PATH_TO_FREQUENCY_LIST = f"{PATH_TO_DATA_FOLDER}research_question_1/thesis/preprocessed_frequency_list.csv"

# Outputs: one CSV of technical terms per topic.
PATH_TO_RESULTS_TERMS_ASYL = f"{PATH_TO_DATA_FOLDER}research_question_1/technical_terms_asyl.csv"
PATH_TO_RESULTS_TERMS_CLIMATE = f"{PATH_TO_DATA_FOLDER}research_question_1/technical_terms_climate.csv"
PATH_TO_RESULTS_TERMS_FEMINISM = f"{PATH_TO_DATA_FOLDER}research_question_1/technical_terms_feminism.csv"

# Outputs: one CSV of selected speeches per topic.
PATH_TO_RESULTS_SPEECHES_ASYL = f"{PATH_TO_DATA_FOLDER}research_question_1/asyl_speeches.csv"
PATH_TO_RESULTS_SPEECHES_CLIMATE = f"{PATH_TO_DATA_FOLDER}research_question_1/climate_speeches.csv"
PATH_TO_RESULTS_SPEECHES_FEMINISM = f"{PATH_TO_DATA_FOLDER}research_question_1/feminism_speeches.csv"

In [None]:
# Load the full CSV of political statements into a DataFrame.
df_full = pd.read_csv(PATH_TO_FULL_CSV)

Implementation of the TechnicalTermsFinder

In [None]:
class TechnicalTermsFinder:
    """Pick out topic-specific ("technical") words from German text.

    A capitalised token counts as specific when it contains one of the
    caller-supplied keywords, or when it has at least four characters and is
    either missing from the unigram frequency list or listed there with a
    percentage below the list's mean percentage.
    """

    def __init__(self, unigrams_file):
        """Read the frequency list and prepare the lookup structures.

        Args:
            unigrams_file: Path of a CSV file readable by ``pandas.read_csv``
                that provides a ``word`` and a ``percentage`` column.
        """
        self.unigrams_file = unigrams_file
        self.df_unigrams = pd.read_csv(unigrams_file)

        self.setup()

    def setup(self):
        """Build the word -> percentage mapping and the mean percentage."""
        print("Setting up...")
        frequencies = self.df_unigrams
        percentage_by_word = pd.Series(frequencies["percentage"].values,
                                       index=frequencies["word"])
        self.word_percentage_dict = percentage_by_word.to_dict()

        self.mean_percentage = float(frequencies["percentage"].mean())
        print("Setup is done")

    def get_specific_words(self, text, consider_words = None):
        """Return the set of specific words found in ``text``.

        Args:
            text: Text to analyse; it is tokenised with NLTK's German
                tokenizer.
            consider_words: Optional iterable of keywords. Any capitalised
                token containing one of them is always accepted.

        Returns:
            A set with every token that starts with an upper-case character
            and is classified as specific.
        """
        tokens = nltk.word_tokenize(text, language='german')

        # Only tokens starting with an upper-case character are candidates.
        return {
            token
            for token in tokens
            if token[0].isupper()
            and self._classify_word_as_specific_word(token, consider_words)
        }

    def _classify_word_as_specific_word(self, word, consider_words = None):
        """Decide whether a single word is specific.

        Args:
            word: The token to classify.
            consider_words: Optional iterable of keywords; a word containing
                any of them is accepted regardless of the other rules.

        Returns:
            ``True`` if the word is specific, ``False`` otherwise.
        """
        # Keyword matches win before any other rule is evaluated.
        if consider_words is not None and any(
                keyword in word for keyword in consider_words):
            return True

        # Words with fewer than four characters are rejected.
        if len(word) < 4:
            return False

        # Words absent from the frequency list are treated as specific.
        if word not in self.word_percentage_dict:
            return True

        # Listed words are specific only when rarer than the average word.
        return bool(self.word_percentage_dict[word] < self.mean_percentage)

    def _string_is_word(self, input_string):
        """Tell whether a string is a purely alphabetic, not all-caps word.

        Args:
            input_string: The string to check.

        Returns:
            ``False`` for strings for which ``str.isupper`` is true,
            otherwise whether the string consists solely of ASCII letters,
            German umlauts and ``ß``.
        """
        if input_string.isupper():
            return False

        return re.match("^[A-Za-zÖÄÜöäüß]+$", input_string) is not None

Get the text from Wikipedia for a certain topic

In [None]:
# Corpus creator for the topic "Klimawandel", with stop-word exclusion switched off.
wcc_climate = WikiCorpusCreator("Klimawandel",exclude_stopwords=False)

In [None]:
# Text for the climate topic; it is the input of the technical-term extraction.
text_climate = wcc_climate.get_text()

In [None]:
# Corpus creator for the topic "Feminismus", with stop-word exclusion switched off.
wcc_feminism = WikiCorpusCreator("Feminismus",exclude_stopwords=False)

In [None]:
# Text for the feminism topic; it is the input of the technical-term extraction.
text_feminism = wcc_feminism.get_text()

In [None]:
# Corpus creator for the topic "Flüchtlingskrise_in_Europa_2015/2016",
# with stop-word exclusion switched off.
wcc_asyl = WikiCorpusCreator("Flüchtlingskrise_in_Europa_2015/2016",
                             exclude_stopwords=False)

In [None]:
# Text for the migrant-crisis topic; it is the input of the technical-term extraction.
text_asyl = wcc_asyl.get_text()

Set up the TechnicalTermsFinder

In [None]:
# Create the finder from the frequency list; the constructor reads the CSV
# and runs setup(), which prints two progress messages.
ttf = TechnicalTermsFinder(PATH_TO_FREQUENCY_LIST)

Get the technical terms from a certain text

Climate Change

In [None]:
# Specific words of the climate text; any capitalised token containing
# "Klima" or "Treibhaus" is always accepted.
technical_terms_climate = ttf.get_specific_words(
    text_climate,
    consider_words=["Klima", "Treibhaus"],
)

In [None]:
# Persist the climate terms. The terms are sorted before writing because the
# iteration order of a set of strings changes between interpreter runs,
# which would make the written file differ from run to run.
pd.Series(sorted(technical_terms_climate)).to_csv(
    PATH_TO_RESULTS_TERMS_CLIMATE)

Feminism

In [None]:
# Specific words of the feminism text; no extra keywords are supplied here.
technical_terms_feminism = ttf.get_specific_words(text_feminism)

In [None]:
# Persist the feminism terms. The terms are sorted before writing because the
# iteration order of a set of strings changes between interpreter runs,
# which would make the written file differ from run to run.
pd.Series(sorted(technical_terms_feminism)).to_csv(
    PATH_TO_RESULTS_TERMS_FEMINISM)

European Migrant Crisis

In [None]:
# Specific words of the migrant-crisis text; any capitalised token containing
# one of the listed keywords is always accepted.
technical_terms_asyl = ttf.get_specific_words(
    text_asyl,
    consider_words=["Asyl", "Flüchtling", "Flüchtlings"],
)

In [None]:
# Persist the migrant-crisis terms. The terms are sorted before writing
# because the iteration order of a set of strings changes between interpreter
# runs, which would make the written file differ from run to run.
pd.Series(sorted(technical_terms_asyl)).to_csv(
    PATH_TO_RESULTS_TERMS_ASYL)

Create the inverted index

In [None]:
# Build the inverted index over the full CSV. "speech" is presumably the name
# of the text column to index -- confirm against InvertedIndex.
invert_index = InvertedIndex(PATH_TO_FULL_CSV, "speech")
invert_index.create()

In [None]:
def find_all_documents_containing_words_in_corpus(inverted_index, word_set):
    """Look up every word in the inverted index and merge the matches.

    Args:
        inverted_index: Object exposing ``search(word)``; the call may return
            ``None``, which is counted as zero matches.
        word_set: Iterable of words to look up.

    Returns:
        A tuple ``(documents, results_per_word)``: the set of all matches
        over all words, and a dict mapping each word to the number of
        matches ``search`` returned for it.
    """
    documents = set()
    results_per_word = {}

    for word in word_set:
        # Query the index passed in as argument rather than a notebook-level
        # global, so the function works with any index object.
        results = inverted_index.search(word)

        if results is None:
            results_per_word[word] = 0
            continue

        documents.update(results)
        results_per_word[word] = len(results)

    return documents, results_per_word

def get_indices_from_documents(documents):
    """Extract the row indices from a search result.

    Args:
        documents: The tuple returned by
            ``find_all_documents_containing_words_in_corpus``. Only its first
            element (the collection of matches) is read, and the first item
            of every match is taken as the index.

    Returns:
        A sorted list of the distinct indices. Removing duplicates prevents
        a document that appears in several distinct matches from being
        selected more than once; sorting makes the result independent of
        the iteration order of the match collection.
    """
    matches = documents[0]
    return sorted({entry[0] for entry in matches})

In [None]:
def get_documents_for_topic(df, topic, technical_terms_finder, inverted_index, additional_words=None):
    """Select the rows of ``df`` that match the technical terms of a topic.

    The topic text is obtained through ``WikiCorpusCreator``, its specific
    words are extracted with ``technical_terms_finder``, those words are
    looked up in ``inverted_index``, and the resulting indices are used to
    pick rows from ``df`` by position.

    Args:
        df: DataFrame to select rows from.
        topic: Topic handed to ``WikiCorpusCreator``.
        technical_terms_finder: Object providing
            ``get_specific_words(text, consider_words=...)``.
        inverted_index: Index handed to
            ``find_all_documents_containing_words_in_corpus``.
        additional_words: Optional keywords forwarded as ``consider_words``.

    Returns:
        ``df.iloc[...]`` restricted to the indices found for the topic.
    """
    topic_text = WikiCorpusCreator(topic, exclude_stopwords=False).get_text()

    # Passing consider_words=None is the same as omitting the argument,
    # so no branching on additional_words is needed.
    technical_terms = technical_terms_finder.get_specific_words(
        topic_text, consider_words=additional_words)

    search_result = find_all_documents_containing_words_in_corpus(
        inverted_index, technical_terms)

    return df.iloc[get_indices_from_documents(search_result)]

Climate Change

In [None]:
# Speeches matching the technical terms of the topic "Klimawandel".
df_climate_large = get_documents_for_topic(
    df=df_full,
    topic="Klimawandel",
    technical_terms_finder=ttf,
    inverted_index=invert_index,
    additional_words=["Klima", "Treibhaus"],
)

In [None]:
# Write the selected climate speeches to CSV without the index column.
df_climate_large.to_csv(PATH_TO_RESULTS_SPEECHES_CLIMATE,index=False)

Feminism

In [None]:
# Speeches matching the technical terms of the topic "Feminismus";
# no additional keywords are supplied.
df_feminism = get_documents_for_topic(
    df=df_full,
    topic="Feminismus",
    technical_terms_finder=ttf,
    inverted_index=invert_index,
)

In [None]:
# Write the selected feminism speeches to CSV without the index column.
df_feminism.to_csv(PATH_TO_RESULTS_SPEECHES_FEMINISM,index=False)

European Migrant Crisis

In [None]:
# Speeches matching the technical terms of the migrant-crisis topic.
df_asyl = get_documents_for_topic(
    df=df_full,
    topic="Flüchtlingskrise_in_Europa_2015/2016",
    technical_terms_finder=ttf,
    inverted_index=invert_index,
    additional_words=["Asyl", "Flüchtling", "Flüchtlings"],
)

In [None]:
# Write the selected migrant-crisis speeches to CSV without the index column.
df_asyl.to_csv(PATH_TO_RESULTS_SPEECHES_ASYL,index=False)