# Step 1: Data Preparation (starting with the non-truncated first)

In [1]:
## CODE ADAPTED FROM: https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [2]:
import numpy as np
import pandas as pd

In [3]:
nontruncated = pd.read_csv("english_nontruncated.csv", index_col=0)
nontruncated.head()

Unnamed: 0,organization,Text
0,Adaptation Fund Board,10 Years of the Adaptation Fund - Pioneering A...
1,Adaptation Fund Board,About 10 Years of the Adaptation Fund Stories ...
2,Adaptation Fund Board,Contact 10 Years of the Adaptation Fund Storie...
3,Adaptation Fund Board,Governance - Adaptation Fund 10 Years of the A...
4,Adaptation Fund Board,AF | Adaptation Fund 10 Years of the Adaptatio...


In [4]:
# FOR EACH ORGANIZATION, MERGE THE TEXTS CORRESPONDING TO THAT ORGANIZATION

In [8]:
unique_orgs = list(nontruncated.organization.unique())
unique_orgs

['Adaptation Fund Board',
 'African Centre of Meteorological Application for Development',
 'African Development Bank Group',
 'African Union Commission',
 'African, Caribbean and Pacific Group of States',
 'Asian Development Bank',
 'Autorité de développement intégré de la région du Liptako-Gourma',
 'Banco Centroamericano de Integración Económica',
 'CAB International',
 'Caribbean Community Climate Change Centre',
 'Caribbean Community Secretariat',
 'Center for International Forestry Research',
 'CGIAR System Organization',
 'Comisión Centroamericana de Ambiente y Desarrollo',
 'Comité permanent inter-états de lutte contre la sécheresse au Sahel',
 'Commonwealth Secretariat',
 'Convention on Wetlands of International Importance especially as Waterfowl Habitat',
 'Corporación Andina de Fomento',
 'Council of Europe',
 'Council of Europe Development Bank',
 'East African Community',
 'Economic Community of Central African States',
 'Economic Community of West African States',
 'Europ

In [9]:
text_collection = []

for org in unique_orgs:
    merged_text = ' '.join(str(x) for x in nontruncated.Text[nontruncated.organization == org])
    text_collection.append(merged_text)
    

In [10]:
print(len(unique_orgs), len(text_collection))

1091 1091


In [11]:
df = pd.DataFrame(list(zip(list(unique_orgs), text_collection)), columns=['organization', 'Text'])

In [12]:
df

Unnamed: 0,organization,Text
0,Adaptation Fund Board,10 Years of the Adaptation Fund - Pioneering A...
1,African Centre of Meteorological Application f...,African Centre of Meteorological Application f...
2,African Development Bank Group,Mission & Strategy | African Development Bank ...
3,African Union Commission,All African Union Websites | African Union Ski...
4,"African, Caribbean and Pacific Group of States","African, Caribbean, and Pacific Group of State..."
...,...,...
1086,Fresh Energy,Skip to content Log In Events Subscribe Take A...
1087,International Solar Energy Society e.V.,Log in Username Password Submit﻿﻿﻿﻿Become a me...
1088,Instituto Global Attitude,United Nations Global Compact All Participants...
1089,Earth Innovation Institute,Membership No 6-0023-12-000-00 Category Ordina...


In [13]:
import re
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

In [15]:
# REMOVE STOP WORDS
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["retrieved", "retrieve", "january","february","march","april", "may","june", "july", 
                   "august","september", "october", "november", "december","wikipedia", "facebook", 
                   "instagram", "linkedIn","used", "u", "U", "b", "lat", "lon", 
                   "archived", "archive", "download","about", "events", "careers", "contact"])

In [16]:
# Convert to list 
data = df.Text.values.tolist()  
# Remove new line characters 
data = [re.sub(r'\s+', ' ', sent) for sent in data]  
# Remove distracting single quotes 
data = [re.sub(r"\'", "", sent) for sent in data]  
pprint(data[:1])

['10 Years of the Adaptation Fund - Pioneering Adaptation Finance 10 Years of '
 'the Adaptation Fund Stories Implementing Partners Beneficiary Capsules '
 'Stakeholder Stories Adaptation Fund Stories Table of Contents Anniversary '
 'Event Materials & Resources Documents & Publications About Governance Board '
 'Secretariat Trustee Accreditation Panel Financial Status Evaluation Partners '
 '& Supporters Direct Access Timeline FAQs Careers Contact Donate Helping '
 'developing countries build resilience and adapt to climate change Projects & '
 'Programmes Project Information Projects Map View Projects Photo View '
 'Projects Table View Project Sectors Agriculture Coastal Zone Management '
 'Disaster Risk Reduction Food Security Forests Multisector Projects Rural '
 'Development Urban Development Water Management Project Performance Project '
 'Waitlist Active Pipeline Projects Proposals Under Review Accountability & '
 'Complaints Ad Hoc Complaint Handling Mechanism (ACHM) Complaints

In [17]:
# TRANSFORM TEXT TO LIST OF INDIVIDUAL WORDS

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) #deacc=True removes punctuations

data_words = list(sent_to_words(data))

# POSSIBLY REMOVE THE WORDS WITH LESS THAN 3 CHARACTERS...
print(data_words[:1])



In [18]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



In [49]:
?gensim.models.Phrases

In [50]:
?gensim.models.phrases.original_scorer

In [19]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # SpaCy documentation: https://spacy.io/usage/spacy-101

def lemmatization(texts):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [20]:
# CALL THE FUNCTIONS IN ORDER:

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams)

print(data_lemmatized[:1])

[['year', 'adaptation', 'fund', 'pioneer', 'adaptation', 'finance', 'year', 'adaptation', 'fund', 'story', 'implement', 'partner', 'beneficiary_capsules', 'stakeholder', 'story', 'adaptation', 'fund', 'story', 'table', 'content', 'anniversary', 'event', 'material', 'resource', 'document', 'publication', 'governance', 'board', 'secretariat', 'trustee', 'accreditation', 'panel', 'financial', 'status', 'evaluation', 'partner', 'supporter', 'direct', 'access', 'timeline', 'faq', 'donate', 'help', 'develop', 'country', 'build', 'resilience', 'adapt', 'climate', 'change', 'project', 'programme', 'project', 'information', 'project', 'map', 'view', 'project', 'photo', 'view', 'project', 'table', 'view', 'project', 'sector', 'agriculture', 'coastal', 'zone', 'management', 'disaster_risk', 'reduction', 'food', 'security', 'forest', 'multisector', 'project', 'rural', 'development', 'urban', 'development', 'water', 'management', 'project', 'performance', 'project', 'waitlist', 'active', 'pipeline'

In [21]:
# started 17:55 -> ended 18:00
# started 23:09 -> 23:14

In [22]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  # doc2bow = document to bag-of-words
# View 
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 41), (4, 1), (5, 1), (6, 2), (7, 7), (8, 2), (9, 6), (10, 31), (11, 4), (12, 1), (13, 2), (14, 13), (15, 8), (16, 1), (17, 11), (18, 3), (19, 5), (20, 17), (21, 152), (22, 2), (23, 1), (24, 1), (25, 3), (26, 2), (27, 6), (28, 2), (29, 1), (30, 1), (31, 3), (32, 2), (33, 4), (34, 1), (35, 3), (36, 95), (37, 7), (38, 1), (39, 5), (40, 3), (41, 1), (42, 5), (43, 1), (44, 1), (45, 8), (46, 2), (47, 1), (48, 2), (49, 2), (50, 3), (51, 1), (52, 2), (53, 5), (54, 4), (55, 1), (56, 3), (57, 1), (58, 2), (59, 10), (60, 1), (61, 1), (62, 2), (63, 10), (64, 12), (65, 3), (66, 1), (67, 2), (68, 2), (69, 1), (70, 1), (71, 10), (72, 1), (73, 2), (74, 5), (75, 1), (76, 2), (77, 4), (78, 1), (79, 5), (80, 1), (81, 1), (82, 3), (83, 9), (84, 1), (85, 7), (86, 2), (87, 1), (88, 1), (89, 1), (90, 3), (91, 3), (92, 1), (93, 3), (94, 2), (95, 6), (96, 4), (97, 6), (98, 2), (99, 1), (100, 1), (101, 44), (102, 1), (103, 1), (104, 2), (105, 11), (106, 2), (107, 10), (108, 5), (10

In [23]:
# Gensim creates unique id for each word in the document. 
# The mapping of the corpus is as follows: every pair (a, b) in fact represents (word_id, word_frequency). 

# If you want to see what word corresponds to a given id, 
# then pass the id as a key to dictionary. 
# Example: 
id2word[4]

'accessible'

In [24]:
# to see the word frequency distribution in a given document, run this:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 2),
  ('aboutafadmin', 1),
  ('accelerate', 1),
  ('access', 41),
  ('accessible', 1),
  ('accessing', 1),
  ('account', 2),
  ('accountability', 7),
  ('accountable', 2),
  ('accredit', 6),
  ('accreditation', 31),
  ('achieve', 4),
  ('achievement', 1),
  ('act', 2),
  ('action', 13),
  ('active', 8),
  ('actively', 1),
  ('activity', 11),
  ('actual', 3),
  ('ad_hoc', 5),
  ('adapt', 17),
  ('adaptation', 152),
  ('adaption', 2),
  ('adaptive', 1),
  ('adaptive_capacity', 1),
  ('addition', 3),
  ('additional', 2),
  ('address', 6),
  ('admin', 2),
  ('administrate', 1),
  ('administration', 1),
  ('administrative', 3),
  ('adopt', 2),
  ('adverse_effect', 4),
  ('adverse_impact', 1),
  ('advisory', 3),
  ('af', 95),
  ('af_terg', 7),
  ('afb', 1),
  ('afcia', 5),
  ('affect', 3),
  ('afs', 1),
  ('agenda', 5),
  ('agreement', 1),
  ('agricultural', 1),
  ('agriculture', 8),
  ('aim', 2),
  ('aissatou', 1),
  ('allocate', 2),
  ('allocation', 2),
  ('allow', 3),
  ('along

In [25]:

# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                           id2word=id2word,
#                                           num_topics=20, 
#                                           random_state=100,
#                                           update_every=1,
#                                           chunksize=100,
#                                           passes=10,
#                                           alpha='auto',
#                                           per_word_topics=True)


coherence = []
for k in range(1,51):
    print('Round: '+ str(k))
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # document corpus
                                           id2word=id2word, # dictionary
                                           num_topics=k, # number of topics
                                           random_state=0, # random seem for model reproducibility
                                           update_every=1, # updates the coefficients at every x passes of the dataset; set to 1
                                           chunksize=100, # how many texts to process at once
                                           passes=10, # number of passes through the dataset for coefficient estimation
                                           alpha='auto', # Dirichlet distribution parameter that is automatically inferred in this case
                                           per_word_topics=True) #assign each word to at least a topic; 
                                                                #if set to False, words that are not indicative are going to be omitted
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=lda_model, 
                                                     corpus = corpus,
                                                     coherence='u_mass')   
                                                
    coherence.append((k,cm.get_coherence()))

Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Round: 48
Round: 49
Round: 50


In [36]:
# ?gensim.models.coherencemodel.CoherenceModel

# started: 18:27 -> ended: one hour and a half later
# started: 23:16 -> ended: 00:47

In [None]:
?gensim.models.ldamodel.LdaModel

In [1]:
x_val = [x[0] for x in coherence]
y_val = [x[1] for x in coherence]

plt.figure(figsize=(20,10))
plt.plot(x_val,y_val)
plt.scatter(x_val,y_val)
plt.title('Non-Truncated Texts: Number of Topics vs. Coherence')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence')
plt.xticks(x_val)
#plt.show()
plt.savefig("Non-Truncated Data NTopics Coherence Crossval.png", facecolor = "white")

NameError: name 'coherence' is not defined

In [28]:
# either 15, 16 or 45

# Start with 16:
lda_model_16t = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=16, 
                                           random_state=0,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model_16t.show_topics(16, num_words=20, formatted=False)




[(0,
  [('organization', 0.011205412),
   ('international', 0.011083748),
   ('member', 0.009891477),
   ('we', 0.008730212),
   ('work', 0.008118346),
   ('world', 0.0073892577),
   ('business', 0.0068451874),
   ('community', 0.0065949876),
   ('people', 0.0063712955),
   ('information', 0.005219194),
   ('website', 0.004930619),
   ('news', 0.004864036),
   ('association', 0.0047145695),
   ('global', 0.004677394),
   ('development', 0.0044757724),
   ('help', 0.004387216),
   ('support', 0.0042619873),
   ('service', 0.0040507615),
   ('resource', 0.00403573),
   ('company', 0.0040334924)]),
 (1,
  [('de', 0.0832463),
   ('france', 0.04071489),
   ('french', 0.036442928),
   ('paris', 0.024229163),
   ('la', 0.023240536),
   ('marseille', 0.011978284),
   ('en', 0.0099555105),
   ('et', 0.00967437),
   ('montreal', 0.007974115),
   ('des', 0.0076740477),
   ('barcelona', 0.0073849964),
   ('catalonia', 0.007147149),
   ('catalan', 0.0068716346),
   ('saint', 0.0060712593),
   ('wat

In [41]:
# lda_model_16t.show_topics(16, num_words=50, formatted=False)
pyLDAvis.enable_notebook()
vis_16t = pyLDAvis.gensim_models.prepare(lda_model_16t, corpus, id2word, mds='mmds', sort_topics = False)
vis_16t
pyLDAvis.save_html(vis_16t, 'nontruncated_16topics_pyLDAvis.html')

In [38]:
lda_model_15t = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=0,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model_15t.show_topics(15, num_words=20, formatted=False)

[(0,
  [('international', 0.0106734615),
   ('organization', 0.009975259),
   ('member', 0.00927533),
   ('work', 0.007929316),
   ('we', 0.007921274),
   ('world', 0.0069110407),
   ('development', 0.006360619),
   ('business', 0.0061309636),
   ('community', 0.0059821173),
   ('people', 0.005906252),
   ('global', 0.0052281055),
   ('news', 0.0047875782),
   ('information', 0.0045617195),
   ('association', 0.004465451),
   ('support', 0.00435443),
   ('policy', 0.0043487423),
   ('website', 0.0043096687),
   ('group', 0.0041103177),
   ('help', 0.0039304574),
   ('sustainable', 0.0038329689)]),
 (1,
  [('de', 0.081114106),
   ('france', 0.03995391),
   ('french', 0.03533958),
   ('paris', 0.023623956),
   ('la', 0.022956703),
   ('ymca', 0.013153334),
   ('marseille', 0.011859853),
   ('en', 0.009964442),
   ('et', 0.00930097),
   ('montreal', 0.008735617),
   ('barcelona', 0.00729112),
   ('catalonia', 0.0071755303),
   ('des', 0.007054044),
   ('catalan', 0.0068303496),
   ('medit

In [42]:
pyLDAvis.enable_notebook()
vis_15t = pyLDAvis.gensim_models.prepare(lda_model_15t, corpus, id2word, mds='mmds', sort_topics = False)
vis_15t

In [43]:
pyLDAvis.save_html(vis_15t, 'nontruncated_15topics_pyLDAvis.html')

In [None]:
# OBSERVATIONS: SHOULD ELIMINATE WORDS LESS THAN 3 CHARACTERS LONG
# UPDATE STOP WORDS LIST WITH STOP WORDS FROM OTHER LANGUAGES (OR MANUALLY ET STOP WORDS LIKE et, el, de, du, fr, que, des, en, la etc.)
# 

In [50]:
# with open('untruncated_model_3topics.txt', "w") as outfile:
#    outfile.write("\n \n".join(str(item) for item in list(lda_model_3t.show_topics(3, num_words=25, formatted=False))))

In [44]:
lda_model_45t = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=45, 
                                           random_state=0,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model_45t.show_topics(45, num_words=20, formatted=False)

[(0,
  [('food', 0.025885396),
   ('standard', 0.021256212),
   ('certification', 0.019310227),
   ('farmer', 0.017465504),
   ('agriculture', 0.017040132),
   ('product', 0.015943034),
   ('agricultural', 0.012539813),
   ('market', 0.011795712),
   ('system', 0.010776734),
   ('organic', 0.010128439),
   ('management', 0.009910613),
   ('farm', 0.009868419),
   ('carbon', 0.009752059),
   ('sustainable', 0.009005906),
   ('certify', 0.008009002),
   ('organization', 0.006619926),
   ('submenu', 0.0066047953),
   ('trade', 0.0064717303),
   ('international', 0.0064697685),
   ('gold_standard', 0.0064451196)]),
 (1,
  [('de', 0.1318436),
   ('la', 0.030422045),
   ('en', 0.021695986),
   ('barcelona', 0.021169834),
   ('catalonia', 0.020647727),
   ('catalan', 0.019421713),
   ('international', 0.017245142),
   ('spanish', 0.015362087),
   ('spain', 0.012872547),
   ('association', 0.010455459),
   ('del', 0.009941671),
   ('mediterranean', 0.009024957),
   ('el', 0.008444904),
   ('ne

In [45]:
pyLDAvis.enable_notebook()
vis_45t = pyLDAvis.gensim_models.prepare(lda_model_45t, corpus, id2word, mds='mmds', sort_topics = False)
vis_45t

In [46]:
pyLDAvis.save_html(vis_45t, 'nontruncated_45topics_pyLDAvis.html')