In [None]:
import os
import pickle
import re
import unidecode
from tqdm import tqdm
from collections import Counter
import pandas as pd

import gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases

import spacy
nlp = spacy.load('de_core_news_lg')  # requires python3 -m spacy download de_core_news_lg in terminal

### Setting paths

In [None]:
# Step up one directory exactly once (presumably from the notebook folder to the project
# root). The guard keeps this cell idempotent: re-running it in a live kernel no longer
# climbs one directory further each time, which would silently repoint both paths below.
if '_project_root' not in globals():
    os.chdir("..")
    _project_root = os.path.abspath(os.curdir)

# Absolute locations of the prepared input corpus and of the preprocessing output folder.
data_path = os.path.join(_project_root, 'corpus', 'prepared', 'corpus.csv')
result_path = os.path.join(_project_root, 'corpus', 'preprocessed')

### Loading prepared dataset

In [None]:
# Read the prepared corpus and order the rows by 'id' (ascending); the time slice list
# is created following the same order.
df = pd.read_csv(data_path).sort_values(by='id', ascending=True)

### Preprocessing the plenary speeches for LDA and DTM

In [None]:
def save_terms_to_disk(df):
    '''
    Pickle the electoral terms covered by the dataset; they are needed as input for DTM.

    Two files are written to <result_path>/electoralTerms (the folder is created if missing):
    "electoralTerms_list.pkl" holds the 'electoralTerm' value of every row in row order,
    "electoralTerms_count.pkl" holds the value counts of that column.

    :param df: dataframe containing speeches and an 'electoralTerm' column
    :return: None
    '''
    target_dir = os.path.join(result_path, "electoralTerms")
    os.makedirs(target_dir, exist_ok=True)

    terms = df["electoralTerm"]
    artefacts = {
        "electoralTerms_list.pkl": terms.tolist(),
        "electoralTerms_count.pkl": terms.value_counts(),
    }
    for pickle_name, payload in artefacts.items():
        with open(os.path.join(target_dir, pickle_name), 'wb') as handle:
            pickle.dump(payload, handle)

# Write the electoral-term pickles for the dataframe loaded above.
save_terms_to_disk(df)

#### Initial data cleaning

In [None]:
def data_cleaning(df):
    '''
    Lowercase the plenary speeches and strip punctuation.

    :param df: dataframe prepared during the first preprocessing step; its 'speechContent'
               column must hold the speeches as strings
    :return: pandas Series (same index as df) with the lowercased speeches in which every
             non-word character is replaced by a blank and runs of whitespace are collapsed
             to a single blank
    '''
    # Raw string: "\W" inside a plain string literal is an invalid escape sequence that
    # newer Python versions warn about.
    non_word = re.compile(r"[\W]")

    def _clean(text):
        # Whitespace is itself a non-word character, so one split/join after the
        # substitution normalises spacing for the lowercasing and the punctuation step alike.
        return ' '.join(non_word.sub(" ", text.lower()).split())

    return df['speechContent'].apply(_clean)

# Store the cleaned speeches as a new column next to the raw 'speechContent' text.
df['speechContent_cleaned'] = data_cleaning(df=df)

#### Tokenization of plenary speeches

In [None]:
def tokenize_speeches(df, min_len=2, max_len=15):
    '''
    Lazily tokenize the cleaned speeches with gensim's simple_preprocess.

    :param df: dataframe with cleaned speech in a column 'speechContent_cleaned'
    :param min_len: minimum token length handed to simple_preprocess (default 2, as before)
    :param max_len: maximum token length handed to simple_preprocess. The default of 15 is
                    the limit gensim applied implicitly before this parameter was exposed;
                    raise it to keep long German compounds that would otherwise be dropped
    :return: generator yielding one list of tokens per speech, in dataframe row order
    '''
    speeches = df['speechContent_cleaned'].values.tolist()
    for speech in tqdm(speeches, unit="speeches", desc="Extract tokens"):
        # str() guards against non-string cells; accents/umlauts are kept (deacc=False).
        yield gensim.utils.simple_preprocess(str(speech),
                                             deacc=False,
                                             min_len=min_len,
                                             max_len=max_len)

# tokenize_speeches() is a generator; materialise it once so later cells can reuse the tokens.
tokenized_speeches = list(tokenize_speeches(df))
print("Total number of speeches:", len(tokenized_speeches))

#### Removing general and custom stopwords

In [None]:
def remove_general_stopwords(tokenized_speeches):
    '''
    Remove the spaCy default stopwords of the loaded pipeline ('de_core_news_lg') from every speech.

    :param tokenized_speeches: list of list with tokenized speeches
    :return: tuple of (speeches without default stopwords as list of list,
             the stopword set itself, i.e. the very object stored in nlp.Defaults.stop_words)
    '''
    general_stopwords = nlp.Defaults.stop_words
    filtered_speeches = []
    for speech in tqdm(tokenized_speeches):
        filtered_speeches.append([word for word in speech if word not in general_stopwords])
    return filtered_speeches, general_stopwords

# Strip the spaCy default stopwords and report the size of that stopword set.
tokenized_speeches, stopwords = remove_general_stopwords(tokenized_speeches)
print("Number of stopwords: ", len(stopwords))

In [None]:
def get_word_frequency(tokenized_speeches):
    '''
    Count how often each token occurs across all speeches.

    :param tokenized_speeches: list of list with tokenized speeches after general stopwords are removed
    :return: dataframe with the columns 'word' and 'count', ordered by descending count
    '''
    token_counts = Counter(token for speech in tokenized_speeches for token in speech)
    frequency_table = pd.DataFrame.from_dict(dict(token_counts), orient='index', columns=['count'])
    # Sort first, then turn the token index into a regular column.
    frequency_table = frequency_table.sort_values(by='count', ascending=False).reset_index()
    frequency_table.columns = ['word', 'count']
    return frequency_table

# Word counts over the speeches as they stand after the general stopword step.
word_frequency = get_word_frequency(tokenized_speeches)

In [None]:
def create_initial_custom_stopword_set(word_frequency, number_of_stopwords):
    '''
    Collect the first rows of the frequency table as stopword candidates.

    :param word_frequency: dataframe containing the word frequencies, with a column 'word' that
                           contains one word per row (expected to be ordered by descending count)
    :param number_of_stopwords: number of leading rows, i.e. most frequent words, to take
    :return: set with the words of the first number_of_stopwords rows
    '''
    leading_words = word_frequency['word'].iloc[:number_of_stopwords]
    return set(leading_words)

# Take the 1500 most frequent words as stopword candidates.
# NOTE(review): 1500 is hard-coded here; consider a named constant in a configuration cell.
initial_custom_stopword_set = create_initial_custom_stopword_set(word_frequency, number_of_stopwords = 1500)

In [None]:
def create_final_custom_stopword_set(initial_custom_stopword_set, words_to_keep):
    '''
    Take the relevant words back out of the stopword candidates.

    :param initial_custom_stopword_set: set of stopword candidates created earlier
    :param words_to_keep: set of relevant words (collected in a txt file after analysing the
                          candidates) that must not be treated as stopwords
    :return: new set holding the candidates that are not in words_to_keep
    '''
    return initial_custom_stopword_set - words_to_keep

# One word per line; read inside a with-block so the file handle is closed again
# (the bare open() used before left it open for the lifetime of the kernel).
with open(os.path.join(result_path, "stopwords", "words_to_keep.txt")) as handle:
    words_to_keep = set(line.strip() for line in handle)
custom_stopword_set = create_final_custom_stopword_set(initial_custom_stopword_set, words_to_keep)

In [None]:
def remove_custom_stopwords(tokenized_speeches, custom_stopword_set):
    '''
    Remove the custom stopwords (together with the spaCy default stopwords) from every speech.

    Side effect: custom_stopword_set is merged in place into nlp.Defaults.stop_words, so the
    shared spaCy stopword set stays extended after this call.

    :param tokenized_speeches: list of list with tokenized speeches after general stopwords are removed
    :param custom_stopword_set: set of custom stopwords to add to the spaCy default stopwords
    :return: tuple of (list of list with tokenized speeches without stopwords,
             combined set of spaCy default and custom stopwords)
    '''
    nlp.Defaults.stop_words |= custom_stopword_set
    stopwords_custom = nlp.Defaults.stop_words
    print("Number of custom stopwords: ", len(stopwords_custom))

    speeches_tokens = [[token for token in speech if token not in stopwords_custom]
                       for speech in tqdm(tokenized_speeches)]
    # Hand back the filtered speeches. Returning the untouched input instead turned this
    # whole step into a no-op for every downstream cell.
    return speeches_tokens, stopwords_custom

# Apply the custom stopword step, then recount word frequencies on its result.
tokenized_speeches, stopwords_custom = remove_custom_stopwords(tokenized_speeches, custom_stopword_set)
word_frequency_after_custom_stopwords = get_word_frequency(tokenized_speeches)

#### Removing umlauts

In [None]:
def remove_umlaute(tokenized_speeches):
    '''
    Pass every token through unidecode.unidecode to replace umlauts and accented characters.

    :param tokenized_speeches: list of list with tokenized speeches without stopwords
    :return: new list of list with the transliterated tokens (same nesting and order as the input)
    '''
    transliterated_speeches = []
    for speech in tqdm(tokenized_speeches):
        transliterated_speeches.append([unidecode.unidecode(word) for word in speech])
    return transliterated_speeches

# NOTE(review): the transliterated speeches are bound to speeches_tokens, but none of the
# later cells visible here reads that name (lemmatize() is called with tokenized_speeches).
# Confirm whether the umlaut-free tokens are meant to feed into lemmatization.
speeches_tokens = remove_umlaute(tokenized_speeches)

#### Lemmatizing plenary speeches 

In [None]:
def lemmatize(tokenized_speeches, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    '''
    Lemmatize the speeches with the loaded spaCy pipeline and keep only selected parts of speech.

    Each speech is joined back into one blank-separated string, analysed by spaCy, and the
    lemma of every token whose coarse POS tag is in allowed_postags is kept.

    :param tokenized_speeches: list of list with tokenized speeches after general and custom stopwords are removed
    :param allowed_postags: iterable of POS tags to keep, default 'NOUN', 'ADJ', 'VERB', 'ADV'
                            (an immutable tuple, so the default cannot be altered by accident)
    :return: list of list with the lemmas of the kept tokens, one inner list per speech
    '''
    keep_pos = set(allowed_postags)  # constant-time membership test per token
    # tqdm cannot infer a length from the document stream, so pass it on when it is known.
    total = len(tokenized_speeches) if hasattr(tokenized_speeches, '__len__') else None
    texts = (' '.join(speech) for speech in tokenized_speeches)
    # nlp.pipe processes the texts in batches and yields the documents in input order.
    documents = tqdm(nlp.pipe(texts),
                     total=total,
                     unit="speech",
                     desc="Creating lemmas")
    return [[item.lemma_ for item in document if item.pos_ in keep_pos] for document in documents]


# NOTE(review): this lemmatizes tokenized_speeches, not the speeches_tokens returned by
# remove_umlaute() in the previous cell — confirm which input is intended.
lemmatized_speeches = lemmatize(tokenized_speeches)

#### Adding bigrams to the lemmatized plenary speeches

In [None]:
def get_bigrams(lemmatized_speeches, name, min_count=400, threshold=30):
    '''
    Train a bigram model on the speeches, append the detected bigrams to each speech and
    save the model to <result_path>/ngram_models/<name>.

    The input is left untouched: every speech is copied before the bigram tokens (those
    containing '_') are appended. This keeps the function safe to re-run, whereas appending
    in place would add the same bigrams again on every call.

    :param lemmatized_speeches: lemmatized speeches as list of list
    :param name: filename for the bigram model
    :param min_count: min_count handed to gensim's Phrases (default 400, the value used so far)
    :param threshold: threshold handed to gensim's Phrases (default 30, the value used so far)
    :return: new list of list; each speech consists of its original lemmas followed by its bigrams
    '''
    bigram = gensim.models.Phrases(lemmatized_speeches, min_count=min_count, threshold=threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    speeches_with_bigrams = []
    for speech in tqdm(lemmatized_speeches):
        detected = [token for token in bigram_mod[speech] if '_' in token]
        speeches_with_bigrams.append(list(speech) + detected)

    # Make sure the target folder exists before saving the model into it.
    model_dir = os.path.join(result_path, 'ngram_models')
    os.makedirs(model_dir, exist_ok=True)
    bigram_mod.save(os.path.join(model_dir, name))
    return speeches_with_bigrams

# File name under which get_bigrams() stores the bigram model inside the 'ngram_models' folder.
name = 'bigram_model.pkl'
speeches_bigrams = get_bigrams(lemmatized_speeches=lemmatized_speeches, name=name)

#### Create a dictionary and corpus from the lemmatized data

In [None]:
# Build the gensim dictionary from the speeches (lemmas plus bigrams) and encode every
# speech as a bag-of-words against that dictionary. Both are created before any pruning.
dictionary = Dictionary(speeches_bigrams)
corpus_full = [dictionary.doc2bow(speech) for speech in tqdm(speeches_bigrams)]

In [None]:
def filter_extremes(dictionary, no_below, no_above):
    '''
    Prune rare and very common tokens from the dictionary. The dictionary is modified in
    place and handed back for convenience.

    :param dictionary: full dictionary
    :param no_below: (int) – Keep tokens which are contained in at least no_below documents.
    :param no_above: (float) – Keep tokens which are contained in no more than no_above documents
                     (fraction of total corpus size, not an absolute number)
    :return: the same dictionary object after pruning
    '''
    thresholds = {'no_below': no_below, 'no_above': no_above}
    dictionary.filter_extremes(**thresholds)
    dictionary.compactify()
    return dictionary

# Prune the dictionary in place (the return value is not needed here).
filter_extremes(dictionary, no_below=10, no_above=0.5)


# corpus_full was encoded in the previous cell, i.e. against the dictionary as it was before
# this pruning. NOTE(review): if pruning reassigns token ids, the ids stored in
# corpus_full.pkl will not match the pruned dictionary — confirm before mapping them back.
path = os.path.join(result_path, "corpus")
os.makedirs(path, exist_ok=True)

file_name = os.path.join(path, "corpus_full.pkl")
with open(file_name, 'wb') as handle:
    pickle.dump(corpus_full, handle)

In [None]:
def prune_lemmas_with_pruned_dict(dict, speech_tokens):
    '''
    Keep only those tokens of every speech that are part of the dictionary's vocabulary.

    :param dict: pruned dictionary; only its token2id mapping is used
                 (the parameter name shadows the builtin but is kept for existing callers)
    :param speech_tokens: list of list with the tokens of every speech
    :return: pruned lemmatized tokens (list of list), which can be used as input for the
             coherencemodel later on
    '''
    # Test membership against the token2id mapping itself: a hash lookup per token instead
    # of scanning a list of all vocabulary keys.
    vocabulary = dict.token2id
    lemmatized_filtered = [[token for token in speeches if token in vocabulary]
                           for speeches in tqdm(speech_tokens)]

    return lemmatized_filtered

# Restrict every speech to the vocabulary of the pruned dictionary.
lemmatized_filtered = prune_lemmas_with_pruned_dict(dictionary,speeches_bigrams)

# Pickle the pruned lemmas to <result_path>/lemmas/lemmatized_preprocessed.pkl.
path = os.path.join(result_path, "lemmas")
os.makedirs(path, exist_ok=True)

file_name = os.path.join(path, "lemmatized_preprocessed.pkl")
with open(file_name, 'wb') as handle:
    pickle.dump(lemmatized_filtered, handle)

In [None]:
# Create a filtered corpus: encode the speeches against the pruned dictionary.
# NOTE(review): the input is speeches_bigrams rather than lemmatized_filtered; presumably
# doc2bow ignores tokens missing from the dictionary, which would make both equivalent — confirm.
corpus_preprocessed = [dictionary.doc2bow(speech) for speech in tqdm(speeches_bigrams)]    

# Pickle the filtered corpus to <result_path>/corpus/corpus_preprocessed.pkl.
path = os.path.join(result_path, "corpus")
os.makedirs(path, exist_ok=True)

file_name = os.path.join(path, "corpus_preprocessed.pkl")
with open(file_name, 'wb') as handle:
    pickle.dump(corpus_preprocessed, handle)

#### Update the preprocessed dictionary with additional stopwords 

In [None]:
# Load the initial (pruned) dictionary from disk.
path = os.path.join(result_path, "dictionary")
dictionary_file = os.path.join(path, "dictionary_preprocessed.dict")
if not os.path.exists(dictionary_file):
    # No earlier cell writes this file, so a fresh "Restart & Run All" would fail right here.
    # Persist the pruned in-memory dictionary first so the notebook runs top to bottom.
    os.makedirs(path, exist_ok=True)
    dictionary.save(dictionary_file)
loaded_dict = Dictionary.load(dictionary_file)

# Load additional stopwords that were identified during initial attempts at LDA modeling
# (one word per line); the with-block closes the file handle again.
with open(os.path.join(result_path, "stopwords", "additional_stopwords.txt")) as handle:
    additional_stopwords = [line.strip() for line in handle]

In [None]:
def delete_additional_stopwords_from_dict(dict, additional_stopwords):
    '''
    Remove the additional stopwords from the dictionary and print a check line per word.

    The dictionary is modified in place. After the removal every stopword is printed together
    with 'yes' if it is still present in the dictionary's token2id mapping and 'no' otherwise.

    :param dict: Initially created dictionary (loaded from disk)
    :param additional_stopwords: iterable of words to delete; words that are not in the
                                 dictionary are looked up as None
    :return: the same dictionary object after the removal
    '''
    ids_to_drop = [dict.token2id.get(word) for word in additional_stopwords]
    dict.filter_tokens(bad_ids=ids_to_drop)
    dict.compactify()

    for word in additional_stopwords:
        print(word, 'yes' if word in dict.token2id else 'no')
    return dict

# The function edits loaded_dict in place and returns it, so dictionary_updated and
# loaded_dict refer to the same object afterwards.
dictionary_updated = delete_additional_stopwords_from_dict (dict=loaded_dict, additional_stopwords=additional_stopwords)

#### Update corpus after removing additional stopwords from dictionary

In [None]:
# Reload the pruned lemmas from the folder they are written to ('lemmas'). The former
# 'preprocessed_lemmas' folder is not created anywhere in this notebook, so reading from it
# fails on a fresh run.
texts = pd.read_pickle(os.path.join(result_path, 'lemmas', 'lemmatized_preprocessed.pkl'))

# Drop the additional stopwords from every speech, then re-encode against the updated dictionary.
lemmatized_speeches_pruned = prune_lemmas_with_pruned_dict(dictionary_updated,texts)

corpus_updated = [dictionary_updated.doc2bow(speech) for speech in tqdm(lemmatized_speeches_pruned)]

#### Save updated lemmas, corpus, and dictionary to disk

In [None]:
# Updated lemmas (additional stopwords removed).
path = os.path.join(result_path, 'lemmas')
os.makedirs(path, exist_ok=True)
file_name = os.path.join(path, "lemmatized_preprocessed.pkl")
with open(file_name, 'wb') as handle:
    pickle.dump(lemmatized_speeches_pruned, handle)

# Updated corpus, encoded against dictionary_updated.
path = os.path.join(result_path, "corpus")
os.makedirs(path, exist_ok=True)
file_name = os.path.join(path, "corpus_preprocessed.pkl")
with open(file_name, 'wb') as handle:
    pickle.dump(corpus_updated, handle)

# Save the dictionary that the updated corpus was encoded with. Saving the earlier
# `dictionary` object here would store a vocabulary that does not belong to corpus_updated.
path = os.path.join(result_path, "dictionary")
os.makedirs(path, exist_ok=True)
dictionary_updated.save(os.path.join(path, "dictionary_preprocessed.dict"))
dictionary_updated.save_as_text(os.path.join(path, "dictionary_preprocessed.txt"))