Before creating the LDA model, we need to filter the dictionary to remove words that appear too frequently or too rarely to be informative.
This notebook lists the most (and least) frequent words and asks the user to rate, word by word, whether each one is relevant.

In [None]:
import os
# Run the shared preamble script unless the PREAMBLE_RUN environment variable is set.
# Any non-empty value of PREAMBLE_RUN (even "0" or "false") is truthy and skips the run.
# `%run -i` executes the script in this notebook's interactive namespace; the trailing
# `2` is passed to the script as an argument.
# NOTE(review): what preamble.py does with that argument is not visible here — confirm.
if not os.environ.get("PREAMBLE_RUN", False): 
    %run -i '../preamble.py' 2

In [None]:
import json
from gensim import corpora

In [4]:
# Location of the preprocessed texts (JSON) used to build the dictionary.
PATH_PREPROCESSED = "data/lda/preprocessed_texts_all_translated.json"

# Read the file through a context manager so the handle is closed deterministically,
# and decode explicitly as UTF-8 (the JSON default) instead of the platform encoding;
# the vocabulary contains non-ASCII tokens.
with open(PATH_PREPROCESSED, encoding="utf-8") as preprocessed_file:
    preprocessed_data = json.load(preprocessed_file)

# Build the gensim dictionary (token <-> id mapping plus frequency statistics)
# from the loaded data.
dictionary = corpora.Dictionary(preprocessed_data)

First, print examples of the most and least frequent words

In [6]:
def revdict(d):
    """Return a new dict with the keys and values of ``d`` swapped.

    ``d`` may be anything accepted by ``dict()``. If several keys share the
    same value, the key that comes last in iteration order wins.
    """
    inverted = {}
    for key, value in dict(d).items():
        inverted[value] = key
    return inverted
# Reverse lookup: integer token id -> token string.
id2token = revdict(dictionary.token2id)

# (token id, document frequency) pairs, most frequent first.
# `dictionary.dfs` maps a token id to the number of documents containing that token.
counts = sorted(dictionary.dfs.items(), key=lambda pair: pair[1], reverse=True)

# How many examples to show from each end of the frequency ranking.
N_EXAMPLES = 10

print("token | frequency (abs) | frequency (rel)")
print("="*100)

# Most frequent tokens. Slicing (rather than indexing 0..9) keeps this working when
# the dictionary holds fewer than N_EXAMPLES tokens, and unpacking into `token_id`
# avoids rebinding the builtin `id` in the kernel namespace.
for token_id, count in counts[:N_EXAMPLES]:
    print(id2token[token_id], count, (count / dictionary.num_docs))

print("="*100)

# Least frequent tokens, rarest first (walks the ranking backwards from its end).
for token_id, count in counts[:-(N_EXAMPLES + 1):-1]:
    print(id2token[token_id], count, (count / dictionary.num_docs))

token | frequency (abs) | frequency (rel)
european 144090 0.6084573416887658
union 88287 0.3728147222269142
president 85599 0.3614639460838133
vote 84499 0.35681891120382414
state 82712 0.3492728409033326
report 75391 0.31835802239751365
support 73009 0.30829941050284615
member 71340 0.301251625762208
eu 71057 0.30005658497035625
country 63273 0.26718662905596
governmentalism 1 4.222758981808355e-06
alternativen 1 4.222758981808355e-06
braucht 1 4.222758981808355e-06
demokratie 1 4.222758981808355e-06
elegir 1 4.222758981808355e-06
archimedes 1 4.222758981808355e-06
gauze 1 4.222758981808355e-06
italophile 1 4.222758981808355e-06
coraggio 1 4.222758981808355e-06
auguri 1 4.222758981808355e-06


## Find relevance threshold manually

In [7]:
def get_keep():
    """Prompt the user about the token just shown and report the decision.

    Returns:
        bool: ``False`` when the user just presses enter (omit the token),
        ``True`` for any non-empty input (keep the token).
    """
    response = input("press enter to omit, write anything to keep")
    return response != ""
    
def get_threshold(counts, skip_same_counts = True, num_docs = None, token_lookup = None):
    """Interactively walk through ``counts`` until the user decides to keep a token.

    Each token is printed with its absolute count and its rate
    (``count / num_docs``); ``get_keep()`` then asks whether to keep it.
    The walk stops at the first token the user keeps.

    Args:
        counts: sequence of ``(token_id, count)`` pairs in the order in which
            they should be presented (e.g. most frequent first).
        skip_same_counts: if true, do not ask about a token whose count equals
            the count of the most recently omitted token.
        num_docs: number of documents used to turn counts into rates.
            Defaults to the notebook-level ``dictionary.num_docs``.
        token_lookup: mapping from token id to token string.
            Defaults to the notebook-level ``id2token``.

    Returns:
        tuple: ``(rate_of_first_kept_token, rate_of_last_omitted_token)``.
        The second element is ``0`` when the very first token was kept.
        When the list is exhausted without keeping any token, the first
        element is ``None``.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged.
    if num_docs is None:
        num_docs = dictionary.num_docs
    if token_lookup is None:
        token_lookup = id2token

    # Initialised up front: previously the "word" was only bound after the first
    # omission, so keeping the very first token raised UnboundLocalError.
    last_omitted_word = None
    last_omitted_rate = 0
    last_omitted_count = 0

    # Iterating directly (instead of an unbounded index loop) means running off the
    # end of `counts` ends the walk cleanly rather than raising IndexError.
    for token_id, count in counts:
        if skip_same_counts and count == last_omitted_count:
            continue

        occurrence_rate = count / num_docs
        token = token_lookup[token_id]
        print(f"{token} ({count}, {'%.4f' % occurrence_rate})")

        if get_keep():
            if last_omitted_word is None:
                print("No tokens were omitted.")
            else:
                # NOTE(review): "count >" is only loosely accurate — the omitted tokens
                # include the one with exactly this count, and for an ascending list
                # they have smaller counts. Wording kept to match earlier runs.
                print(f"Omitted all tokens until '{last_omitted_word}' with count > {last_omitted_count} ({'%.4f' % last_omitted_rate})")
            print(f"First entry to keep: '{token}' with count: {count} ({'%.4f' % occurrence_rate})")
            return occurrence_rate, last_omitted_rate

        last_omitted_count = count
        last_omitted_rate = occurrence_rate
        last_omitted_word = token

    print("Reached the end of the list without keeping any token.")
    return None, last_omitted_rate

In [None]:
# Walk down the ranking from the most frequent token; the first token the user keeps
# marks the upper frequency threshold. The returned pair of rates is the cell output.
print("Filtering most frequent words")
get_threshold(counts, skip_same_counts=True)

Filtering most frequent words
european (144090, 0.6085)
union (88287, 0.3728)
president (85599, 0.3615)
vote (84499, 0.3568)
state (82712, 0.3493)
report (75391, 0.3184)
support (73009, 0.3083)
member (71340, 0.3013)
eu (71057, 0.3001)
country (63273, 0.2672)
commission (60677, 0.2562)
need (58824, 0.2484)
right (56042, 0.2367)
parliament (55298, 0.2335)
mr (55050, 0.2325)
europe (52809, 0.2230)
year (52213, 0.2205)
people (49434, 0.2087)
important (46316, 0.1956)
time (42958, 0.1814)
new (42366, 0.1789)
policy (41703, 0.1761)
work (39496, 0.1668)
madam (39066, 0.1650)
citizen (38464, 0.1624)
like (36308, 0.1533)
economic (35784, 0.1511)
Omitted all tokens until 'like' with count > 36308 (0.1533)
First entry to keep: 'economic' with count: 35784 (0.1511)


(0.15110720740503014, 0.15331993311149772)

=> the first relevant word, i.e. one that is related to an actual debate topic (according to our judgement), is "economic", occurring in 15.11% of the documents

In [None]:
# Walk up the ranking from the rarest token (reversed order); the first token the user
# keeps marks the lower frequency threshold. The banner previously said "most frequent"
# (copy-paste from the cell above), which mislabelled this run.
print("Filtering least frequent words")
get_threshold(counts[::-1])

Filtering most frequent words
governmentalism (1, 0.0000)
foreskin (2, 0.0000)
lita (3, 0.0000)
mediatize (4, 0.0000)
yahia (5, 0.0000)
nouri (6, 0.0000)
whirpool (7, 0.0000)
jewelery (8, 0.0000)
reparatii (9, 0.0000)
câmpia (10, 0.0000)
Omitted all tokens until 'reparatii' with count > 9 (0.0000)
First entry to keep: 'câmpia' with count: 10 (0.0000)


(4.222758981808354e-05, 3.800483083627519e-05)

=> this is less principled; we remove all words that occur in fewer than 10 speeches

In [5]:
# Number of entries in `dictionary`, i.e. the vocabulary size.
# NOTE(review): no filtering is applied to `dictionary` in the cells shown, so this is
# presumably the unfiltered vocabulary size — confirm.
len(dictionary)

59173