In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd 
# from scripts import lda

In [None]:
filename = "data/parlaw/speech_output.csv"

df = pd.read_csv(filename)
# the first four characters of the date string are the year
df["year"] = df["date"].str[:4].astype(int)
# agenda titles alone are not unique, so the date is appended to form a key
df["uq_agenda"] = df["agenda"] + df["date"]

## Use LDA to find clusters of speeches on the same topics

In [None]:
# Keep a speech only if
#   * the speaker is associated with a party ("-" marks no party), and
#   * a translated text is available (for now: speeches given in English
#     or with a machine translation).
has_party = df["party"] != "-"
has_translation = df["translatedText"].notna()
df_party_members = df[has_party & has_translation].reset_index(drop=True)

In [None]:
# documents = df_party_members["translatedText"].dropna()
# documents = documents[documents.map(len) > 50]
# model = lda.process_texts(documents, custom_stopwords=[], num_topics=30, n_passes=10)

In [None]:
import json 
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from tqdm import tqdm
from collections import Counter

# for comparison: filter agenda items with keywords
# Character classes are used instead of capture groups: the matched strings are
# the same, but pandas' str.contains no longer warns about unused match groups.
# NOTE(review): "third country national" (with a space) is NOT matched - confirm
# whether that spelling should be included.
keywords = ["[Mm]igration", "[Mm]igrant", "[Rr]efugee", "[Aa]sylum", "[Tt]hird-?country national"]
# na=False: a missing agenda key counts as "no keyword" instead of putting NaN
# into the boolean mask
indices_keyword_identified_speeches = df_party_members.index[
    df_party_members["uq_agenda"].str.contains("|".join(keywords), na=False)
].tolist()

# in a separate step had already preprocessed (tokenized, lemmatized, ...) data
# (context manager so the file handle is closed right after loading)
with open("lda/preprocessed_texts.json", encoding="utf-8") as preprocessed_file:
    preprocessed_data = json.load(preprocessed_file)
# TODO: remove unnecessary tokens like "european", "union", "mr", "president" ?

In [None]:
def evaluate_model(lda_model, dictionary, compute_coherence=True):
    """Print diagnostics on how well ``lda_model`` captures migration speeches.

    Reads the notebook globals ``preprocessed_data`` (token lists) and
    ``indices_keyword_identified_speeches`` (positions into that list).
    NOTE(review): those positions come from ``df_party_members``; this only
    works if ``preprocessed_data`` is row-aligned with it - confirm.

    Args:
        lda_model: LDA model offering ``id2word``, ``get_term_topics`` and
            ``get_document_topics``.
        dictionary: dictionary used to convert token lists to bag-of-words.
        compute_coherence: when True, also compute the ``c_v`` coherence.

    Returns:
        ``(n_high_topics, coherence_score, missmatches)`` where
        ``n_high_topics`` is the number of topics in which "migration" has a
        probability above 0.05, ``coherence_score`` is ``None`` when coherence
        was not computed, and ``missmatches`` counts keyword-identified
        speeches whose most probable topic is not a migration-related topic.

    Raises:
        KeyError: if the token "migration" is not in the model vocabulary.
        IndexError: if an index exceeds ``len(preprocessed_data)``.
    """
    coherence_score = None
    if compute_coherence:
        print("Computing coherence")
        coherence_model = CoherenceModel(
            model=lda_model,
            texts=preprocessed_data,
            dictionary=dictionary,
            coherence='c_v'  # most common coherence measure
        )
        coherence_score = coherence_model.get_coherence()
        print("Coherence:", coherence_score)

    # compute which topics are related to migration
    migration_topic_indices = lda_model.get_term_topics(
        lda_model.id2word.token2id["migration"], minimum_probability=0
    )
    print("Topics related to 'migration':", len(migration_topic_indices))
    # hoisted out of the loop below: ids only, for O(1) membership tests
    migration_topic_ids = {tid for tid, _ in migration_topic_indices}

    # check if any migration-related topic has probability > 0.05 (arbitrary threshold)
    threshold = 0.05
    high_topics = [(tid, prob) for tid, prob in migration_topic_indices if prob > threshold]
    if len(high_topics) > 0:
        print(f"Topics with probability > {threshold}: {high_topics}")
    else:
        print(f"No migration-related topic has probability > {threshold}")

    # for each speech with migration-keyworded agenda, get most probable topic
    topic_counts = Counter()
    topic_probabilities = []
    missmatches = 0
    skipped = 0
    for idx in indices_keyword_identified_speeches:
        bow = dictionary.doc2bow(preprocessed_data[idx])
        topic_distribution = lda_model.get_document_topics(bow)
        if not topic_distribution:
            # every topic fell below the model's minimum probability (e.g. an
            # empty document); there is no meaningful "most probable" topic
            skipped += 1
            continue
        top_topic, top_prob = max(topic_distribution, key=lambda x: x[1])
        topic_counts[top_topic] += 1
        topic_probabilities.append(top_prob)

        if top_topic not in migration_topic_ids:
            missmatches += 1  # most probable topic is not a migration-related topic

    print("Most common topics for migration-related speeches:", topic_counts.most_common(len(migration_topic_indices)))
    if skipped:
        print("Speeches skipped (empty topic distribution):", skipped)

    n_scored = len(topic_probabilities)
    if n_scored:
        print("Average probability of most probable topic for migration-related speeches:", sum(topic_probabilities) / n_scored)
        print("missmatches (most probable topic not migration-related):", missmatches, f"{missmatches / n_scored:.2%}")
    else:
        # avoids the ZeroDivisionError of averaging over zero speeches
        print("No keyword-identified speech could be scored; averages not available")

    return len(high_topics), coherence_score, missmatches

In [None]:
# Sanity check: the preprocessed token lists should line up one-to-one with the
# rows of df_party_members, because other cells index one with positions taken
# from the other.
len(preprocessed_data) == len(df_party_members)

# TODO: find out why the two lengths differ (presumably the check above is
# False) before trusting any position-based lookup between them.

In [None]:
print("creating dictionary")
dictionary = corpora.Dictionary(preprocessed_data)

print("filtering dictionary")
# no_below=10    -> keep tokens appearing in at least 10 docs
# no_above=0.4   -> remove tokens appearing in more than 40% of docs
# keep_n=100000  -> keep only the top 100k words by frequency
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

# bag-of-words representation of every preprocessed document
corpus = list(map(dictionary.doc2bow, tqdm(preprocessed_data, "Preparing corpus")))

In [None]:
def print_topics(model, n_topics, k_words=5):
    """Print one line per topic listing its ``k_words`` top words.

    Args:
        model: topic model exposing ``show_topics``.
        n_topics: number of topics to request from ``show_topics``.
        k_words: number of top words shown per topic.

    Note:
        The printed label is ``topic id + 1``. Add or subtract one when moving
        between this output and an id passed to the model, e.g. ``topicid=`` of
        ``show_topic``.
    """
    # num_words must be passed explicitly: show_topics otherwise returns only
    # its default number of words, which silently capped larger k_words values.
    topics = model.show_topics(formatted=False, num_topics=n_topics, num_words=k_words)
    for idx, topic in topics:
        label = ", ".join(word for word, _ in topic[:k_words])
        print(f"Topic {idx + 1}: {label}")
        

In [None]:
def assign_topics(lda_model, corpus):
    """Return the topic distribution of every document in ``corpus``.

    Each entry of the result is whatever
    ``lda_model.get_document_topics(bow, minimum_probability=0)`` returns for
    the corresponding bag-of-words document; the order follows ``corpus``.
    """
    progress = tqdm(corpus, desc="Assigning most probable topic to each doc")
    return [
        lda_model.get_document_topics(doc_bow, minimum_probability=0)
        for doc_bow in progress
    ]

In [None]:
def count_topic_assignments(corpus_topics, topic_id, prob_threshold=0.0):
    """Count how often ``topic_id`` is assigned to the documents.

    Args:
        corpus_topics: one list of ``(topic_id, probability)`` pairs per
            document; empty entries are skipped.
        topic_id: the topic to count.
        prob_threshold: a document counts towards ``at_least_prob_count`` when
            the probability of ``topic_id`` is ``>= prob_threshold`` (a topic
            missing from a document's list counts as 0.0). With the default
            0.0 every non-empty document qualifies.

    Returns:
        ``(stats, most_doc_indices)``. ``stats`` is a dict with the keys
        ``total_docs``, ``most_prob_count``, ``at_least_prob_count``,
        ``most_avg_prob`` and ``at_least_avg_prob``; an average over zero
        documents is reported as 0.0. ``most_doc_indices`` lists the positions
        (in ``corpus_topics``) of documents whose most probable topic is
        ``topic_id``.
    """
    def _mean(values):
        # 0.0 instead of ZeroDivisionError when nothing matched
        return sum(values) / len(values) if values else 0.0

    total = 0
    most_doc_indices = []
    most_probabilities = []
    at_least_probabilites = []
    # tqdm wraps the sequence itself (not enumerate) so it can show the total
    for i, doc_topics in enumerate(tqdm(corpus_topics, desc="Counting topic assignments")):
        # skip empty entries
        if not doc_topics:
            continue
        total += 1

        most_topic, most_prob = max(doc_topics, key=lambda x: x[1])
        if most_topic == topic_id:
            most_probabilities.append(most_prob)
            most_doc_indices.append(i)

        # probability of the target topic (0.0 if absent)
        prob = dict(doc_topics).get(topic_id, 0.0)
        if prob >= prob_threshold:
            at_least_probabilites.append(prob)

    return ({
        "total_docs": total,
        "most_prob_count": len(most_doc_indices),
        "at_least_prob_count": len(at_least_probabilites),
        "most_avg_prob": _mean(most_probabilities),
        "at_least_avg_prob": _mean(at_least_probabilites),
    }, most_doc_indices)

In [None]:
from gensim.models import LdaModel

# candidate topic counts considered so far: [60, 80, 90, 100, 120]
n_topics = 80
model = LdaModel.load(f"lda/{n_topics}_topics/model.model")
evaluate_model(model, dictionary, compute_coherence=False)
print_topics(model, n_topics)
corpus_topics = assign_topics(model, corpus)

# Topic id as the model uses it. print_topics labels topics with id + 1, so
# this is NOT the number shown in its output.
migration_topic_index = 26
print("counting occurance of topic", migration_topic_index)
print([word for word, _ in model.show_topic(topicid=migration_topic_index, topn=10)])

# use the variable everywhere so the inspected and the counted topic cannot drift apart
counts, assigned_subset = count_topic_assignments(
    corpus_topics, topic_id=migration_topic_index, prob_threshold=0.3
)
print(counts)

# NOTE(review): assigned_subset holds positions in `corpus`; using them with
# .iloc assumes corpus and df_party_members are row-aligned - confirm.
df_assigned_topic_most_probable = df_party_members.iloc[assigned_subset]
    

In [None]:
# pyplot is the submodule conventionally aliased as plt; the top-level
# matplotlib package has no yscale(), so the old call raised AttributeError
import matplotlib.pyplot as plt

# rows: year, columns: party, values: number of speeches assigned to the topic
n_per_year = df_assigned_topic_most_probable.groupby(["year", "party"]).size().unstack()

ax = n_per_year.plot(kind="bar", stacked=False, figsize=(40, 5))
# shortened_labels = [l[:50]+"..." if len(l) > 50 else l for l in [lb.get_text() for lb in ax.get_xticklabels()]]
# _ = ax.set_xticklabels(shortened_labels)
# ax.set_ybound(upper=400)
# set the scale on the axes object that was just created
ax.set_yscale("log")

In [None]:
# number of rows of df_assigned_topic_most_probable per unique agenda item
n_per_agenda = df_assigned_topic_most_probable.groupby(["uq_agenda"]).size()

In [None]:
# how many agenda items have more than 10 assigned speeches
len(n_per_agenda[n_per_agenda > 10])

In [None]:
# how many assigned speeches belong to agenda items with more than 20 assigned speeches
int(
    df_assigned_topic_most_probable["uq_agenda"]
    .isin(n_per_agenda.index[n_per_agenda > 20])
    .sum()
)

In [None]:
# TODO: look at Topic 68: border, cooperation, crime, state, member

In [None]:
# `corpus_topics_m80` is not defined anywhere in this notebook (NameError on a
# fresh kernel); the distributions of the 80-topic model are in `corpus_topics`.
count_topic_assignments(corpus_topics, topic_id=migration_topic_index, prob_threshold=0.3)

: 

In [None]:
import os

n_topic_values = [90, 100, 120]
n_workers = 8
n_passes = 5
# Fixed seed so a re-run starts from the same random initialisation.
# NOTE(review): with several workers the fit may still not be bit-for-bit
# reproducible - confirm if exact reproducibility is required.
random_state = 42

# Fit, save and evaluate one model per candidate number of topics.
for n_topics in n_topic_values:
    model_dir = f"lda/{n_topics}_topics"
    os.makedirs(model_dir, exist_ok=True)
    out_path = f"{model_dir}/model.model"

    print("Fitting model with", n_topics, "topics and", n_passes, "passes")
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_topics,
        passes=n_passes,
        workers=n_workers,
        random_state=random_state,
    )
    lda_model.save(out_path)

    # Evaluate model
    evaluate_model(lda_model, dictionary)

## Use Keyword search to find relevant agendas / speeches

In [None]:
# Regex fragments, later joined with "|" and passed to pandas' str.contains.
# Character classes replace the former capture groups: the matched strings are
# identical, but str.contains no longer warns that the pattern has match groups.
# NOTE(review): "third country national" (with a space) is NOT matched - confirm
# whether that spelling should be included.
keywords = ["[Mm]igration", "[Mm]igrant", "[Rr]efugee", "[Aa]sylum", "[Tt]hird-?country national"]  # , "[Ff]rontex"]

# TODO: potentially find relevant keywords using word2vec over text and look for words similar to migration?

In [None]:
def filter_for_keywords(df, column="agenda", keywords=keywords, min_contributions_per_agenda_item=10):
    """Select speeches matching a keyword, restricted to well-populated agenda items.

    Args:
        df: speeches; must contain ``column`` and ``"uq_agenda"``.
        column: text column searched for the keywords.
        keywords: regex fragments, OR-ed together. The default is the
            ``keywords`` list as it existed when this function was defined.
        min_contributions_per_agenda_item: an agenda item is kept only if at
            least this many of its speeches match a keyword.

    Returns:
        The rows of ``df`` that match a keyword and belong to a kept agenda item.
    """
    # na=False: rows with a missing value count as "no match" instead of
    # putting NaN into the boolean mask
    keyword_mask = df[column].str.contains("|".join(keywords), na=False)
    agenda_items_vc = df.loc[keyword_mask, "uq_agenda"].value_counts()

    # TODO: do this filtering in the beginning because why not
    # ">=" so that an item with exactly the minimum number of speeches is kept,
    # in line with the parameter name and the "<" in the message below
    relevant_agendas = agenda_items_vc[agenda_items_vc >= min_contributions_per_agenda_item]
    n_agenda_items_after = len(relevant_agendas)
    n_dropped = len(agenda_items_vc) - n_agenda_items_after
    print(f"filtered {n_dropped} agenda items with < {min_contributions_per_agenda_item} speeches")

    selected = df[keyword_mask & df["uq_agenda"].isin(relevant_agendas.index)]

    print(f"n speeches: {len(selected)}")
    print(f"n agendas: {n_agenda_items_after}")

    return selected

In [None]:
# Planned breakdowns of the keyword-filtered speeches:
#   - agenda items per year; speeches per year
#   - parties per agenda item
#   - contributions per party per year
#   - normalize by original
#     (presumably: relative to the counts in the unfiltered data - TODO confirm)
# With the default arguments the keywords are searched in the "agenda" column.
df_filtered = filter_for_keywords(df)

In [None]:
# Despite the name, this is currently grouped by year only (the party split is disabled).
n_per_year_and_party = df_filtered.groupby(["year"]).size()  # .unstack()

ax = n_per_year_and_party.plot(kind="bar", stacked=False, figsize=(40, 5))
# cut tick labels longer than 50 characters and mark the cut with "..."
shortened_labels = [
    text if len(text) <= 50 else text[:50] + "..."
    for text in (tick.get_text() for tick in ax.get_xticklabels())
]
_ = ax.set_xticklabels(shortened_labels)

In [None]:
# number of keyword-filtered speeches per unique agenda item
vc = df_filtered["uq_agenda"].value_counts()
vc

In [None]:
# Search the speech text instead of the agenda title. The DataFrame must be
# the first argument; "text" is the column to search, not the data.
df_filtered_text = filter_for_keywords(df, column="text")

In [None]:
# keyword-matching speeches (searched in the text) per agenda item
n = df_filtered_text["uq_agenda"].value_counts()
print(n.min(), n.mean(), n.max())

# df_filtered["uq_agenda"]
print(len(n))
# agenda items with more than 4 keyword-matching speeches that were not
# already discovered through keywords in the agenda title
new_relevant_agendas = n[n.gt(4) & ~n.index.isin(df_filtered["uq_agenda"])]
new_relevant_agendas.head(20)

In [None]:
def print_agenda(search_term, n_speeches=None):
    """Print the speeches of every agenda item whose key matches ``search_term``.

    Reads the notebook-level DataFrame ``df``.

    Args:
        search_term: pattern passed to ``str.contains`` on ``df["uq_agenda"]``.
        n_speeches: print at most this many speeches per agenda item, in
            ascending ``speechnumber`` order; ``None`` (or 0) prints all.
    """
    # na=False: rows without an agenda key simply do not match
    matches = df["uq_agenda"].str.contains(search_term, na=False)
    all_agendas = df[matches]["uq_agenda"].unique()
    for agenda in all_agendas:
        speeches = df[df["uq_agenda"] == agenda].sort_values(by="speechnumber", ascending=True)
        print("Agenda:", (agenda[:30]+"..." if len(agenda) > 30 else agenda))
        print("Nr of speeches:", len(speeches))
        print("")

        if n_speeches:
            speeches = speeches[:n_speeches]
        for _, r in speeches.iterrows():
            translated = r["translatedText"]
            # A missing translation is NaN, which is truthy, so a plain `not`
            # test would print "nan"; fall back to the original text instead.
            body = r["text"] if pd.isna(translated) or not translated else translated
            # only double quotes inside the f-string: reusing the outer quote
            # is a SyntaxError before Python 3.12
            print(f'({r["speechnumber"]}) {r["speaker"]} ({r["party"]}): {body}')

In [None]:
# Other agenda items inspected earlier. The search term is handed to
# str.contains inside print_agenda.
# print_agenda("6.4. Situation of fundamental rights in the European Union in 2015", 10)
# print_agenda("7.7. The situation of women refugees and asylum seekers in the EU", 10)
# print the first 10 speeches of every agenda item matching the term
print_agenda("European Border and Coast Guard", 10)