In [40]:
import json
import string
from pathlib import Path

import gensim
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

DATA_DIR = Path().absolute().parent.parent / "data"
df_speech_url = pd.read_csv(DATA_DIR / "UN Speeches.csv")

In [66]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc: str) -> str:
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def get_corpus_from_file(country: str, path: Path = DATA_DIR / "2023") -> str:
    with open(path / f"{country}.json") as f:
        json_data = json.load(f)
    corpus = [x['text'] for x in json_data]
    large_corpus = ' '.join([x for x in corpus])
    return large_corpus

def filter_words(corpus: str):
    words_filter = ["excellency", "assembly", "president", "mr", "weve", "also", "uh"]
    filtered_corpus = ' '.join([x for x in corpus.split(' ') if x not in words_filter and len(x) > 1])
    return filtered_corpus

In [67]:
corpus = get_corpus_from_file(df_speech_url.iloc[0]['country'])
corpus = clean(corpus)
corpus = filter_words(corpus)
# corpus

In [68]:
all_text = []
for i, r in df_speech_url.iterrows():
    try:
        corpus = get_corpus_from_file(r['country'])
        corpus = clean(corpus)
        corpus = filter_words(corpus)
        all_text.append(corpus)
    except:
        pass

In [69]:
doc_clean = [clean(doc).split() for doc in all_text] 
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [70]:
num_topics_lda = 16

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=num_topics_lda, id2word = dictionary, passes=50)

In [71]:
# ldamodel.print_topics(num_topics=num_topics_lda, num_words=10)

In [72]:
# ldamodel.show_topics()

In [73]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.display(vis_data)

In [44]:
pyLDAvis.save_html(vis_data, 'lda.html')

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = [doc for doc in all_text]

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
print()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
development global mr climate sustainable united agenda sdgs health 2030
Topic 1:
russia ukraine aggression war ukrainian crime security nuclear council grain
Topic 2:
uh um african like strengthen continent el area integration right
Topic 3:
africa african continent mr republic solidarity conflict security democratic economic
Topic 4:
need today right time want european make europe minister united
Topic 5:
island ocean climate small sea pacific global security mr japan
Topic 6:
haiti mr haitian tobago trinidad caribbean government small community crisis
Topic 7:
israel palestinian refugee occupation iran arab resolution israeli sudan blessing
Topic 8:
inequality human democracy democratic child society government development right republic
Topic 9:
security global asia region central development cooperation afghanistan member issue

Topic 0:
global united war security climate right economic today republic like
Topic 1:
human community france security republic support responsi