In [1]:
import os 
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
from gensim.utils import simple_preprocess
import spacy
import logging
from typing import List
from tqdm import tqdm
import json 
import pandas as pd

In [10]:
def preprocess_documents(documents: List[str], custom_stopwords=None, test_first_k = None): 
    """Tokenize, lemmatize and stopword-filter a list of raw text documents.

    Each document is tokenized with gensim's ``simple_preprocess``, the tokens
    are re-joined and run through the spaCy ``en_core_web_sm`` pipeline (parser
    and NER disabled), and the lemma of every token is kept unless spaCy flags
    the token as a stop word or its lemma is listed in ``custom_stopwords``.

    Args:
        documents: Raw document strings.
        custom_stopwords: Optional iterable of lemmas to drop in addition to
            spaCy's stop words. ``None`` (the default) means no extra stopwords.
        test_first_k: If truthy, only the first ``test_first_k`` documents are
            processed; ``None`` or ``0`` processes every document.

    Returns:
        A list with one list of lemma strings per processed document, in input
        order.
    """
    logging.basicConfig(format ='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level = logging.WARN)
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # A None default avoids sharing one mutable list between calls; a set makes
    # the per-token membership test constant-time instead of a list scan.
    stopword_set = set(custom_stopwords) if custom_stopwords else set()

    def preprocess_document(document):    
        # tokenize using gensim's default preprocessing
        tokens = simple_preprocess(document)
        parsed = nlp(" ".join(tokens))
        # lemmatize and remove stopwords 
        return [
            token.lemma_
            for token in parsed
            if not token.is_stop and token.lemma_ not in stopword_set
        ]

    if test_first_k: 
        documents = documents[:test_first_k]

    return [preprocess_document(doc) for doc in tqdm(documents, "preprocessing")]
    

In [2]:
# Read the translated data set from CSV into the working DataFrame `df`.
# The path is relative, so it resolves against the kernel's working directory.
df_file_path = "data/csv/df_translated.csv"
df = pd.read_csv(df_file_path)


In [3]:
# Show the distinct values of the translationSource column (missing values included).
df["translationSource"].unique()

array([nan, 'machine_gm', 'orginial_gm', 'original_pl', 'machine_pl'],
      dtype=object)

In [5]:
# Normalise the misspelled source label "orginial_gm" to "original_gm" in place.
misspelled_source_rows = df["translationSource"] == "orginial_gm"
df.loc[misspelled_source_rows, "translationSource"] = "original_gm"


In [6]:
# Show the distinct translationSource values again to confirm the label correction took effect.
df["translationSource"].unique()

array([nan, 'machine_gm', 'original_gm', 'original_pl', 'machine_pl'],
      dtype=object)

In [7]:
# Keep rows whose party is not the "-" placeholder and whose translatedText,
# rendered as a string, is longer than 50 characters; then renumber the rows
# 0..n-1 so positional and label indices coincide.
party_is_set = ~(df["party"] == "-")
translated_text_length = df["translatedText"].map(str).map(len)
df_party_members = df[party_is_set & (translated_text_length > 50)].reset_index(drop=True)

In [12]:
# preprocess data once 
preprocessed_path = "data/lda/preprocessed_texts_gemini_translated.json"

# for now: only those translated by gemini: 
# only those with party affiliation
# The selection happens outside the cache branch so that df_gemini_translated
# exists for later cells even when the cached JSON is loaded on a fresh kernel.
df_gemini_translated = df_party_members[df_party_members["translationSource"].isin(["original_gm", "machine_gm"])]

if os.path.exists(preprocessed_path):
    # Cache hit: reuse the stored token lists; `with` closes the file handle.
    with open(preprocessed_path) as cache_file:
        preprocessed_gemini_translated = json.load(cache_file)
else:
    print("Number of documents to preprocess:", len(df_gemini_translated))
    
    documents = df_gemini_translated["translatedText"].tolist()
    preprocessed_gemini_translated = preprocess_documents(documents)
    # Closing the file via `with` guarantees the cache is fully flushed to disk.
    with open(preprocessed_path, "w") as cache_file:
        json.dump(preprocessed_gemini_translated, cache_file)

Number of documents to preprocess: 52293


preprocessing: 100%|██████████| 52293/52293 [11:08<00:00, 78.23it/s] 


In [13]:
# merge preprocessed data 
# `with` closes the file handle once the cached token lists are loaded.
with open("data/lda/preprocessed_texts_parllaw_translated.json") as parllaw_file:
    preprocessed_parllaw_translated = json.load(parllaw_file)

# Row labels of df_party_members belonging to each translation group. Both are
# derived directly from df_party_members so this cell does not depend on a
# variable that an earlier cell may only create on a cache miss.
parllaw_translated_indices = df_party_members[df_party_members["translationSource"].isin(["original_pl", "machine_pl"])].index.tolist()
gemini_translated_indices = df_party_members[df_party_members["translationSource"].isin(["original_gm", "machine_gm"])].index.tolist()
# Same order as the token lists are concatenated later: parllaw first, then gemini.
all_indices = parllaw_translated_indices + gemini_translated_indices

array(['machine_gm', 'orginial_gm', 'original_pl', 'machine_pl'],
      dtype=object)

In [14]:
# Each cached token list must line up one-to-one with the rows it was built from.
assert len(parllaw_translated_indices) == len(preprocessed_parllaw_translated), \
    "parllaw cache size does not match the number of parllaw-translated rows"
assert len(gemini_translated_indices) == len(preprocessed_gemini_translated), \
    "gemini cache size does not match the number of gemini-translated rows"
# The reordering step writes each document to its row position, so the combined
# indices must hit every position 0..n-1 exactly once; otherwise rows would be
# left as None or overwritten silently.
assert sorted(all_indices) == list(range(len(df_party_members))), \
    "translation groups do not cover every row of df_party_members exactly once"

print(len(preprocessed_parllaw_translated), "+", len(preprocessed_gemini_translated))
print(len(df_party_members))

453412 + 52293
505705


In [None]:
# Concatenate both token-list collections in the same order as all_indices
# (parllaw first, then gemini), then place every document at the row position
# recorded for it so preprocessed_data follows the row order of df_party_members.
preprocessed_data_unordered = preprocessed_parllaw_translated + preprocessed_gemini_translated
preprocessed_data = [None for _ in preprocessed_data_unordered]
for source_position, row_position in enumerate(all_indices):
    document_tokens = preprocessed_data_unordered[source_position]
    preprocessed_data[row_position] = document_tokens

In [19]:
# Report how many documents the merged, reordered list holds.
print(len(preprocessed_data))

505705


In [18]:
# Persist the merged token lists; the context manager closes the file so the
# JSON is completely flushed to disk before any later step reads it.
with open("data/lda/preprocessed_texts_all_translated.json", "w") as merged_file:
    json.dump(preprocessed_data, merged_file)

In [16]:
# Build the token -> id mapping over all preprocessed documents.
print("creating dictionary")
dictionary = corpora.Dictionary(preprocessed_data)

# Prune the vocabulary: drop tokens found in fewer than 10 documents or in more
# than 40% of documents, then keep at most the 100k most frequent tokens.
print("filtering dictionary")
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

# Convert every document into its bag-of-words representation.
corpus = []
for document_tokens in tqdm(preprocessed_data, "Preparing corpus"):
    corpus.append(dictionary.doc2bow(document_tokens))

creating dictionary
filtering dictionary


Preparing corpus: 100%|██████████| 505705/505705 [00:28<00:00, 17995.23it/s]


In [17]:
# Topic counts to fit (one model per value) and the number of worker processes.
n_topic_values = [80]
n_workers = 8
# Fixed seed passed to the model so repeated runs start from the same random state.
# NOTE(review): with several workers results may still vary between runs — confirm
# against the gensim documentation if exact reproducibility is required.
random_seed = 42

for n_topics in n_topic_values: 
    # Each topic count gets its own output directory holding the saved model.
    model_dir = f"lda/{n_topics}_topics"
    os.makedirs(model_dir, exist_ok=True)
    out_path = f"{model_dir}/model.model"
    num_topics = n_topics
    n_passes = 5
    workers = n_workers

    print("Fitting model with", num_topics, "topics and", n_passes, "passes")
    lda_model = LdaMulticore(
        corpus = corpus,
        id2word=dictionary,
        num_topics = num_topics,
        passes = n_passes,
        workers=workers,
        random_state=random_seed,
    )
    lda_model.save(out_path)

    # Evaluate model
    # TODO: evaluate_model is not defined in the visible cells; enable once it exists.
    # evaluate_model(lda_model, dictionary)

Fitting model with 80 topics and 5 passes
