In [1]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
import pyLDAvis as vis

# Data

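Merge the same sections from each document together to form a single file. (This merge step is currently commented out in the cell below; the model in this notebook is trained on the individual summary documents instead.)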

In [2]:
import os 

# directory = 'output/unprocessed/annexes/'
 
# # Merge all files in a particular section together

# data = ""
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)
#     # checking if it is a file

#     if (os.path.isfile(f) and (".DS_Store" not in f)):
#         # print(f)
#         with open(f, 'r', encoding="utf8") as fp:
#             currentfiledata = fp.read()
#         data += "\n"
#         data += currentfiledata

# with open('fulltext_annexes.txt', 'w') as fp:
#     fp.write(data)

# 2. Executive summary text

In [3]:
# Load every executive-summary file as its own document. Documents are kept
# separate (not merged) so that topics can be traced back to their source file.

directory = 'output/unprocessed/summary/'
data = []       # lower-cased text of each document
doc_names = []  # doc_names[i] is the file name that produced data[i]

# os.listdir() returns entries in arbitrary order; sort them so that document
# indices are stable across runs and machines.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)

    # Skip sub-directories and macOS Finder metadata files.
    if os.path.isfile(f) and ".DS_Store" not in f:
        with open(f, 'r', encoding="utf8") as fp:
            currentfiledata = fp.read()
        data.append(currentfiledata.lower())
        doc_names.append(filename)

In [4]:
# --- Text preprocessing ---
# Trim surrounding whitespace from every loaded document.
documents = list(map(str.strip, data))

# The preprocessor is configured with the English stop-word list. It yields
# the cleaned text (used for the bag-of-words input), the matching
# unprocessed text (used for the contextual embeddings) and the vocabulary.
sp = WhiteSpacePreprocessing(documents, stopwords_language='english')
(preprocessed_documents,
 unpreprocessed_corpus,
 vocab) = sp.preprocess()

# --- Dataset preparation (no model is trained in this cell) ---
EMBEDDING_MODEL = "paraphrase-distilroberta-base-v1"
tp = TopicModelDataPreparation(EMBEDDING_MODEL)
training_dataset = tp.fit(
    text_for_contextual=unpreprocessed_corpus,
    text_for_bow=preprocessed_documents,
)

# Peek at the first ten vocabulary entries as a sanity check.
tp.vocab[:10]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['ability',
 'able',
 'abroad',
 'absence',
 'absent',
 'abstract',
 'abstraction',
 'abundant',
 'acceptable',
 'accepted']

In [5]:
# Fit two CombinedTM topic models on the same training set: one with 20 topics
# and one with 62 topics.

# Hyper-parameters shared by both models.
# NOTE(review): contextual_size must match the embedding width produced by the
# sentence encoder chosen in the data-preparation cell -- confirm if the
# encoder is changed.
CONTEXTUAL_SIZE = 768
NUM_EPOCHS = 10
shared_kwargs = dict(
    bow_size=len(tp.vocab),
    contextual_size=CONTEXTUAL_SIZE,
    num_epochs=NUM_EPOCHS,
)

ctm_20 = CombinedTM(n_components=20, **shared_kwargs)
ctm_62 = CombinedTM(n_components=62, **shared_kwargs)

# Train each model in turn (20 topics first, then 62).
for model in (ctm_20, ctm_62):
    model.fit(training_dataset)

Epoch: [10/10]	 Seen Samples: [530/530]	Train Loss: 6846.996462264151	Time: 0:00:07.368978: : 10it [01:14,  7.46s/it]
Epoch: [10/10]	 Seen Samples: [530/530]	Train Loss: 6987.867924528302	Time: 0:00:07.471334: : 10it [01:13,  7.40s/it]


In [6]:
# Sanity-check the 20-topic model: list the top 5 words of each topic.
ctm_20.get_topic_lists(5)

[['future', 'challenge', 'sustainable', 'demand', 'emerging'],
 ['markets', 'multi', 'facilities', 'guide', 'resilience'],
 ['profile', 'flexibility', 'basic', 'planning', 'yield'],
 ['multi', 'greater', 'challenge', 'meet', 'agriculture'],
 ['accounting', 'certain', 'rayon', 'avoided', 'dealing'],
 ['instruments', 'river', 'basin', 'adoption', 'weather'],
 ['declining', 'ageing', 'refurbishment', 'banks', 'yet'],
 ['cost', 'level', 'gap', 'annual', 'policy'],
 ['recovery', 'polluter', 'regime', 'crops', 'transition'],
 ['fixed', 'cities', 'corruption', 'frequently', 'management'],
 ['willingness', 'structure', 'amount', 'resulted', 'tackle'],
 ['ordinating', 'water', 'allocation', 'factors', 'organisations'],
 ['rayon', 'demand', 'nevertheless', 'incentivise', 'rural'],
 ['resources', 'indicator', 'decision', 'introduction', 'decisions'],
 ['defining', 'especially', 'average', 'council', 'multiple'],
 ['investments', 'spatial', 'america', 'major', 'service'],
 ['compliance', 'recommen

In [12]:
# Build the pyLDAvis input data for both trained models.
# Both calls use the same number of sampling iterations.
N_LDAVIS_SAMPLES = 40

lda_vis_data_20 = ctm_20.get_ldavis_data_format(
    tp.vocab, training_dataset, n_samples=N_LDAVIS_SAMPLES
)
lda_vis_data_62 = ctm_62.get_ldavis_data_format(
    tp.vocab, training_dataset, n_samples=N_LDAVIS_SAMPLES
)

Sampling: [40/40]: : 40it [04:59,  7.48s/it]
Sampling: [40/40]: : 40it [04:58,  7.47s/it]


In [8]:
# Turn the LDAvis input data of each model into a pyLDAvis prepared object.
TERMS_PER_TOPIC = 10  # value passed as pyLDAvis' `R` argument

ctm_pd_20 = vis.prepare(R=TERMS_PER_TOPIC, **lda_vis_data_20)
ctm_pd_62 = vis.prepare(R=TERMS_PER_TOPIC, **lda_vis_data_62)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [9]:
# Show the pyLDAvis visualisation for the 20-topic model.
vis.display(ctm_pd_20)

In [10]:
# Show the pyLDAvis visualisation for the 62-topic model.
vis.display(ctm_pd_62)

In [11]:
# save models
# ctm_20.save(models_dir="models/")
# ctm_62.save(models_dir="models/")