In [14]:
import json
import string
from pathlib import Path

import gensim
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm

from unga_speeches import speech_analysis as sa

APP_DIR = Path().absolute().parent.parent / "app"
DATA_DIR = Path().absolute().parent.parent / "data"
# df_speech_url = pd.read_csv(DATA_DIR / "UN Speeches.csv")
df_speeches = sa.get_data()

In [11]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc: str) -> str:
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def get_corpus_from_file(country: str, path: Path = DATA_DIR / "2023") -> str:
    with open(path / f"{country}.json") as f:
        json_data = json.load(f)
    corpus = [x['text'] for x in json_data]
    large_corpus = ' '.join([x for x in corpus])
    return large_corpus

def filter_words(corpus: str):
    # TODO Use the same filter words for freq plots, wordcloud and topic analysis
    words_filter = ["excellency", "assembly", "president", "mr", "weve", "also", "uh"]
    filtered_corpus = ' '.join([x for x in corpus.split(' ') if x not in words_filter and len(x) > 1])
    return filtered_corpus

In [12]:
corpus = get_corpus_from_file(df_speech_url.iloc[0]['country'])
corpus = clean(corpus)
corpus = filter_words(corpus)
# corpus

In [15]:
all_text = []
for i, r in df_speeches.iterrows():
    try:
        if isinstance(
            df_speeches[df_speeches["country"] == r['country']]["start"], str
        ) and isinstance(df_speeches[df_speeches["country"] == r['country']]["end"], str):
            h_start, m_start, s_start = (
                df_speeches[df_speeches["country"] == r['country']]["start"]
                .values[0]
                .split(":")
            )
            start = int(h_start) * 60 * 60 + int(m_start) * 60 + int(s_start)
            h_end, m_end, s_end = (
                df_speeches[df_speeches["country"] == r['country']]["end"]
                .values[0]
                .split(":")
            )
            end = int(h_end) * 60 * 60 + int(m_end) * 60 + int(s_end)
            corpus = sa.get_corpus_from_file(r['country'], start=start, end=end)
        elif isinstance(
            df_speeches[df_speeches["country"] == r['country']]["start"], str
        ):
            h, m, s = (
                df_speeches[df_speeches["country"] == r['country']]["start"]
                .values[0]
                .split(":")
            )
            start = int(h) * 60 * 60 + int(m) * 60 + int(s)
            corpus = sa.get_corpus_from_file(r['country'], start=start)
        else:
            corpus = sa.get_corpus_from_file(r['country'])
        
        # corpus = get_corpus_from_file(r['country'])
        corpus = sa.clean(corpus)
        corpus = sa.filter_words(corpus)
        all_text.append(corpus)
    except:
        pass

In [17]:
doc_clean = [sa.clean(doc).split() for doc in all_text] 
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [16]:
doc_clean = [clean(doc).split() for doc in all_text] 
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

NameError: name 'clean' is not defined

In [22]:
# Taken from https://stackoverflow.com/a/64370066 to determine number of topics

import numpy as np
from gensim.models import LdaModel, CoherenceModel

num_topics = list(range(21)[1:])
num_keywords = 20

LDA_models = {}
LDA_topics = {}
for i in tqdm(num_topics):
    LDA_models[i] = LdaModel(corpus=doc_term_matrix,
                             id2word=dictionary,
                             num_topics=i,
                             update_every=1,
                             chunksize=len(doc_term_matrix),
                             passes=40,
                             alpha='auto',
                             random_state=42)

    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]


def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))

LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

coherences = [CoherenceModel(model=LDA_models[i], texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]

coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(num_keywords)[:-1]] # limit topic numbers to the number of keywords

coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]
print(f"The ideal number of topics is: {ideal_topic_num}")

100%|██████████| 20/20 [08:15<00:00, 24.75s/it]


The ideal number of topics is: 19


In [16]:
# num_topics_lda = ideal_topic_num
# # num_topics_lda = 14

# # Creating the object for LDA model using gensim library
# Lda = gensim.models.ldamodel.LdaModel

# # Running and Trainign LDA model on the document term matrix.
# ldamodel = Lda(doc_term_matrix, num_topics=num_topics_lda, id2word = dictionary, passes=50)

In [23]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# vis_data = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary)
vis_data = gensimvis.prepare(LDA_models[ideal_topic_num], doc_term_matrix, dictionary)
pyLDAvis.display(vis_data)

In [18]:
pyLDAvis.save_html(vis_data, str(APP_DIR / 'pages' / 'lda.html'))

In [19]:
# ldamodel.print_topics(num_topics=num_topics_lda, num_words=10)

In [20]:
# ldamodel.show_topics()