In [15]:
import json
import string
from pathlib import Path

import gensim
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

DATA_DIR = Path('/Users/darenasc/projects/un-speeches/data')
df_speech_url = pd.read_csv(DATA_DIR / "UN Speeches.csv")

In [16]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc: str) -> str:
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def get_corpus_from_file(country: str, path: Path = DATA_DIR / "2023") -> str:
    with open(path / f"{country}.json") as f:
        json_data = json.load(f)
    corpus = [x['text'] for x in json_data]
    large_corpus = ' '.join([x for x in corpus])
    return large_corpus

In [17]:
corpus = get_corpus_from_file(df_speech_url.iloc[0]['country'])
corpus = clean(corpus)

In [24]:
all_text = []
for i, r in df_speech_url.iterrows():
    try:
        corpus = get_corpus_from_file(r['country'])
        corpus = clean(corpus)
        all_text.append(corpus)
    except:
        pass

In [26]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."
doc_complete = [doc1, doc2, doc3, doc4, doc5]
doc_clean = [clean(doc).split() for doc in doc_complete] 

In [28]:
doc_clean = [clean(doc).split() for doc in all_text] 
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [37]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [38]:
ldamodel.print_topics(num_topics=5, num_words=10)

[(0,
  '0.010*"country" + 0.009*"international" + 0.007*"world" + 0.007*"peace" + 0.007*"people" + 0.006*"security" + 0.006*"nation" + 0.006*"president" + 0.006*"development" + 0.006*"assembly"'),
 (1,
  '0.017*"russia" + 0.012*"ukraine" + 0.008*"war" + 0.006*"must" + 0.006*"ukrainian" + 0.005*"aggression" + 0.005*"crime" + 0.004*"nation" + 0.004*"world" + 0.004*"food"'),
 (2,
  '0.012*"country" + 0.009*"world" + 0.008*"people" + 0.007*"nation" + 0.006*"u" + 0.006*"state" + 0.006*"united" + 0.005*"one" + 0.004*"international" + 0.004*"development"'),
 (3,
  '0.009*"nation" + 0.009*"country" + 0.008*"president" + 0.007*"global" + 0.006*"world" + 0.006*"international" + 0.006*"united" + 0.006*"assembly" + 0.006*"development" + 0.006*"peace"'),
 (4,
  '0.008*"ecuador" + 0.004*"malnutrition" + 0.002*"child" + 0.002*"organized" + 0.002*"infant" + 0.002*"chronic" + 0.002*"galapagus" + 0.002*"prioritized" + 0.001*"transnational" + 0.001*"figure"')]

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = [doc for doc in all_text]

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
print()
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
central asia global afghanistan development security region participant cooperation issue
Topic 1:
russia ukraine aggression war crime ukrainian security global nuclear council
Topic 2:
thing decision different time think make today right lot need
Topic 3:
africa african continent global united security council economic democratic war
Topic 4:
uh um strengthen like african union continent security area order
Topic 5:
island ocean taiwan climate global challenge change sea pacific mr
Topic 6:
sudan south education child republic arm health embargo ensure implementation
Topic 7:
trafficking drug child crime weve organized human organization phenomenon united
Topic 8:
development global mr united developing agenda sdgs sustainable support progress
Topic 9:
refugee palestinian syrian crisis majesty help security arab host solution
Topic 10:
haitian haiti net climate zero mr carbon energy republic kenya
Topic 11:
inequality need conflict multilateral civilian congo crucial crisis d