In [None]:
import os 
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
import spacy
import logging
from typing import List
from tqdm import tqdm
import json 
import pandas as pd

In [None]:
def preprocess_documents(documents: List[str], custom_stopwords=None, test_first_k = None) -> List[List[str]]:
    """Tokenise, lemmatise and stopword-filter a list of raw documents.

    Each document is tokenised with gensim's ``simple_preprocess``, re-joined with
    single spaces and run through spaCy's ``en_core_web_sm`` pipeline (parser and NER
    disabled). The lemma of every token is kept unless spaCy flags the token as a
    stopword or the lemma is listed in ``custom_stopwords``.

    Args:
        documents: Raw document texts.
        custom_stopwords: Optional iterable of lemmas to drop in addition to spaCy's
            stopwords. ``None`` (the default) means no extra stopwords.
        test_first_k: If truthy, only the first ``test_first_k`` documents are
            processed (handy for quick trial runs). ``None`` or ``0`` processes all.

    Returns:
        One list of lemmas per processed document, in input order.

    Side effects:
        Configures the root logger's format and sets its level to ``WARN``.
    """
    logging.basicConfig(format ='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level = logging.WARN)
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # Freeze the stopwords into a set once: avoids a shared mutable default argument
    # and makes the per-token membership test O(1) instead of a list scan.
    stopword_lemmas = frozenset(custom_stopwords) if custom_stopwords is not None else frozenset()

    def preprocess_document(document):
        # tokenize using gensim's default preprocessing
        tokens = simple_preprocess(document)
        parsed = nlp(" ".join(tokens))
        # lemmatize and remove stopwords
        return [
            token.lemma_
            for token in parsed
            if not token.is_stop and token.lemma_ not in stopword_lemmas
        ]

    if test_first_k:
        documents = documents[:test_first_k]

    return [preprocess_document(doc) for doc in tqdm(documents, "preprocessing")]
    

In [None]:
def evaluate_model(lda_model, n_topics, k_words, preprocessed_data, dictionary, search_term = "migration", compute_coherence=True): 
    """Report how prominently ``search_term`` features in the topics of an LDA model.

    Args:
        lda_model: Fitted model exposing ``show_topics`` (e.g. gensim ``LdaMulticore``).
        n_topics: Number of topics to request from ``show_topics``.
        k_words: A topic counts as "relevant" when the search term is among its
            ``k_words`` most likely words.
        preprocessed_data: Tokenised documents; only used for the coherence score.
        dictionary: gensim dictionary; only used for the coherence score.
        search_term: Word to look for in the topics.
        compute_coherence: If False, the coherence computation is skipped.

    Returns:
        Tuple ``(coherence_score, search_term_max_prob, search_term_highest_pos,
        indices_relevant_topics)``:

        - ``coherence_score``: result of gensim's ``CoherenceModel`` with the ``c_v``
          measure, or ``None`` when ``compute_coherence`` is False.
        - ``search_term_max_prob``: highest probability of the search term over all
          inspected topics (``-inf`` if it never appears among the inspected words).
        - ``search_term_highest_pos``: best (smallest, 0-based) rank of the search term
          in any topic's word list (``inf`` if it never appears).
        - ``indices_relevant_topics``: indices of topics where the search term ranks
          within the first ``k_words`` words.

    Only the top ``max(k_words, 10)`` words of each topic are inspected, so the
    probability and rank refer to that window rather than to the full vocabulary.
    """
    coherence_score = None
    if compute_coherence:
        print("Computing coherence")
        coherence_model = CoherenceModel(
            model=lda_model,
            texts=preprocessed_data,
            dictionary=dictionary,
            coherence='c_v'  # most common coherence measure
        )
        coherence_score = coherence_model.get_coherence()

    search_term_max_prob = float("-inf")
    search_term_highest_pos = float("inf")
    indices_relevant_topics = []

    # show_topics returns only 10 words per topic by default; request at least k_words
    # so that a rank between 10 and k_words can actually be detected.
    num_words = max(k_words, 10)

    for topic_index, topic in lda_model.show_topics(formatted=False, num_topics=n_topics, num_words=num_words):
        topic_words, topic_probs = zip(*topic)
        if search_term not in topic_words:
            continue

        # words are ordered by likelihood, so the index is the rank of the search term
        idx = topic_words.index(search_term)
        search_term_max_prob = max(topic_probs[idx], search_term_max_prob)
        search_term_highest_pos = min(idx, search_term_highest_pos)

        if idx < k_words:
            indices_relevant_topics.append(topic_index)
            label = ", ".join(f"{word} ({prob:.2f})" for word, prob in topic[:k_words])
            # report the topic's own index so it matches the returned list
            print(f"Possibly relevant topic {topic_index}: {label}")

    print("."*30)
    print("Coherence:", coherence_score)
    print(f"Highest probability of {search_term}: {search_term_max_prob}")
    print(f"Most likely position of {search_term}: {search_term_highest_pos}")
    print(f"Relevant topics: {indices_relevant_topics} (n: {len(indices_relevant_topics)})")
    return coherence_score, search_term_max_prob, search_term_highest_pos, indices_relevant_topics

In [None]:
# Load the speeches table; the path is relative to the notebook's working directory
df_file_path = "data/parllaw/speech_translated.csv"
df = pd.read_csv(filepath_or_buffer=df_file_path)

In [None]:
# Keep only the speeches relevant for the project:
#   1. the speaker has a party entry other than "-" (presumably "-" marks non-members)
#   2. the translated text, converted to a string, is longer than 50 characters
# The index is renumbered afterwards so positions and index labels coincide.
is_party_member = ~(df["party"] == "-")
df_party_members = df[is_party_member]
translated_text_length = df_party_members["translatedText"].map(str).map(len)
df_party_members = df_party_members[translated_text_length > 50].reset_index(drop=True)

## Preprocessing the data

Because most of the data had already been preprocessed earlier (before the missing translations were created), we only preprocess here the speeches whose translations were made using Gemini. We then merge the previously preprocessed data with this newly preprocessed data.

In [None]:
# Preprocessing is expensive, so the tokenised speeches are cached as JSON and
# reused whenever the combined cache file already exists.
preprocessed_full_path = "data/lda/preprocessed_texts_all_translated.json"
if os.path.exists(preprocessed_full_path): 
    print("Loading preprocessed data")
    with open(preprocessed_full_path) as cache_file:
        preprocessed_data = json.load(cache_file)
else: 
    # TODO: actually this is not needed anymore, streamline it
    print("COULD NOT FIND PREPROCESSED DATA; ASSUMING IT HAS TO BE MERGED WITH PREVIOUSLY PREPROCESSED DATA FIRST")
    preprocessed_gemini_path = "data/lda/preprocessed_texts_gemini_translated.json"
    preprocessed_parllaw_path = "data/lda/preprocessed_texts_parllaw_translated.json"

    # Select the Gemini-translated speeches up front: their row indices are needed
    # for the merge below even when the tokens themselves come from the cache file.
    df_gemini_translated = df_party_members[df_party_members["translationSource"].isin(["original_gm", "machine_gm"])]

    if os.path.exists(preprocessed_gemini_path):
        with open(preprocessed_gemini_path) as cache_file:
            preprocessed_gemini_translated = json.load(cache_file)
    else:
        # for now: only those translated by gemini
        print("Number of documents to preprocess:", len(df_gemini_translated))

        documents = df_gemini_translated["translatedText"].tolist()
        preprocessed_gemini_translated = preprocess_documents(documents)
        with open(preprocessed_gemini_path, "w") as cache_file:
            json.dump(preprocessed_gemini_translated, cache_file)

    # merge preprocessed data
    with open(preprocessed_parllaw_path) as cache_file:
        preprocessed_parllaw_translated = json.load(cache_file)

    parllaw_translated_indices = df_party_members[df_party_members["translationSource"].isin(["original_pl", "machine_pl"])].index.tolist()
    gemini_translated_indices = df_gemini_translated.index.tolist()
    all_indices = parllaw_translated_indices + gemini_translated_indices
    # sanity checks: every cached document must correspond to exactly one dataframe row
    assert len(parllaw_translated_indices) == len(preprocessed_parllaw_translated)
    assert len(gemini_translated_indices) == len(preprocessed_gemini_translated)
    # the scatter below only works if the row indices are exactly 0..n-1
    assert sorted(all_indices) == list(range(len(all_indices))), (
        "row indices of the two translation sources do not form a contiguous 0..n-1 range"
    )
    # first just append, but to keep indices aligned with the dataframe's indices, we re-order based on the dataframe's indices
    preprocessed_data_unordered = preprocessed_parllaw_translated + preprocessed_gemini_translated
    preprocessed_data = [None] * len(preprocessed_data_unordered)
    for current_index, target_index in enumerate(all_indices): 
        preprocessed_data[target_index] = preprocessed_data_unordered[current_index]

    with open(preprocessed_full_path, "w") as cache_file:
        json.dump(preprocessed_data, cache_file)

In [None]:
# Map every distinct token of the preprocessed speeches to an integer id
print("Creating dictionary")
dictionary = corpora.Dictionary(preprocessed_data)

# Prune the vocabulary:
#   no_below=10   -> keep tokens appearing in at least 10 speeches
#   no_above=0.4  -> remove tokens appearing in more than 40% of speeches
#   keep_n=100000 -> keep only the top 100k words by frequency
print("Filtering dictionary")
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

# Bag-of-words representation of every speech, in the same order as preprocessed_data
corpus = [
    dictionary.doc2bow(speech_tokens)
    for speech_tokens in tqdm(preprocessed_data, "Preparing corpus")
]

In [None]:
# Training grid: number of topics -> list of training-pass counts to try for it
n_topic_values = {    
    50: [5], 
    60: [5], 
    80: [5, 7, 10], 
    100: [5, 7, 10], 
    120: [5, 7, 10],
}

n_workers = 4   # worker processes used by LdaMulticore
k_words = 10    # number of top words per topic inspected during evaluation

# One (n_topics, n_passes, model) tuple per fitted model, consumed by the evaluation cell
runs = []
for n_topics, n_passes_values in n_topic_values.items(): 
    for n_passes in n_passes_values: 
        model_dir = f"data/lda/{n_topics}_topics/{n_passes}"
        os.makedirs(model_dir, exist_ok=True)
        out_path = f"{model_dir}/model.model"

        # n_passes comes from the grid; it must not be overwritten here, otherwise
        # every model is trained with the same number of passes and saved under a
        # directory name that does not match its configuration.
        print("Fitting model with", n_topics, "topics and", n_passes, "passes")
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            passes=n_passes,
            workers=n_workers,
        )
        lda_model.save(out_path)

        runs.append((n_topics, n_passes, lda_model))

In [None]:
# Evaluate every fitted model and keep the returned metrics so the configurations
# can be compared side by side. Nothing is written to disk here, so no output
# directory or model path is needed.
evaluation_records = []
for n_topics, n_passes, lda_model in runs: 
    print("Evaluating model with", n_topics, "topics and", n_passes, "n_passes")
    coherence_score, max_prob, best_position, relevant_topics = evaluate_model(
        lda_model, n_topics, k_words, preprocessed_data, dictionary
    )
    evaluation_records.append({
        "n_topics": n_topics,
        "n_passes": n_passes,
        "coherence": coherence_score,
        "search_term_max_prob": max_prob,
        "search_term_best_position": best_position,
        "n_relevant_topics": len(relevant_topics),
        "relevant_topics": relevant_topics,
    })

# One row per model; shown as the cell output
evaluation_summary = pd.DataFrame(evaluation_records)
evaluation_summary