# Data

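Merge the same sections from each document together to form a single file. (This step is currently disabled: the cell below is commented out, and the documents are kept separate in the sections that follow.)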

In [1]:
import os 

# directory = 'output/unprocessed/annexes/'
 
# # Merge all files in a particular section together

# data = ""
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)
#     # checking if it is a file

#     if (os.path.isfile(f) and (".DS_Store" not in f)):
#         # print(f)
#         with open(f, 'r', encoding="utf8") as fp:
#             currentfiledata = fp.read()
#         data += "\n"
#         data += currentfiledata

# with open('fulltext_annexes.txt', 'w') as fp:
#     fp.write(data)

In [2]:
# text_file = "fulltext_body.txt"

# 1. Body text

In [3]:
# Don't merge documents, keep them separate in the model so we can keep track of provenance of topics

directory = 'output/unprocessed/body/'
data = []            # lower-cased full text of each document
data_filenames = []  # data_filenames[i] is the file that data[i] was read from

# os.listdir() returns entries in arbitrary order; sort them so that document
# indices are stable across runs and machines.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)

    # Only read regular files, and skip any path containing ".DS_Store".
    if os.path.isfile(f) and ".DS_Store" not in f:
        with open(f, 'r', encoding="utf8") as fp:
            currentfiledata = fp.read()
        data.append(currentfiledata.lower())
        data_filenames.append(filename)

## Import topic modelling libraries

In [4]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk

## Preprocess text

In [5]:
# nltk.download('stopwords')  # presumably needed once if the NLTK stopword list is not installed -- confirm

# Trim leading/trailing whitespace from every loaded document.
documents = list(map(str.strip, data))

# Whitespace-based preprocessing with English stopwords; preprocess() is
# unpacked into the cleaned texts, the matching raw texts and the vocabulary.
sp = WhiteSpacePreprocessing(documents, stopwords_language='english')
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()



In [6]:
# preprocessed_documents[:2]

We don't discard the non-preprocessed texts, because we are going to use them as input for obtaining the contextualized document representations. 

Create BOW and obtain contextualized BERT representations of documents. This operation creates the training dataset.

Try the contextualized model "paraphrase-distilroberta-base-v1".


In [7]:
# Data-preparation helper configured with the contextualized model name.
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")

# The preprocessed texts feed the bag-of-words side; the untouched texts feed
# the contextual (embedding) side.
training_dataset = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_corpus,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Check the first ten words of the vocabulary 

In [8]:
# Display the first ten entries of the vocabulary held by the data-preparation object.
tp.vocab[:10]

['ability',
 'able',
 'absence',
 'abstracted',
 'abstraction',
 'abstractions',
 'acceptability',
 'acceptable',
 'access',
 'accessed']

## Train Combined TM

Fit the new topic model (CombinedTM). We train two models on our collection, one with 20 topics and one with 62 topics, each for 10 epochs.

In [9]:
# Build both models with identical settings apart from the number of topics.
# contextual_size=768 is presumably the embedding width of the sentence model
# chosen above -- confirm if the model is changed.
ctm_20, ctm_62 = (
    CombinedTM(
        bow_size=len(tp.vocab),
        contextual_size=768,
        n_components=n_topics,
        num_epochs=10,
    )
    for n_topics in (20, 62)
)

# Train the 20-topic model first, then the 62-topic model.
ctm_20.fit(training_dataset)
ctm_62.fit(training_dataset)

Epoch: [10/10]	 Seen Samples: [540/540]	Train Loss: 235447.09259259258	Time: 0:00:07.421097: : 10it [01:14,  7.45s/it]
Epoch: [10/10]	 Seen Samples: [540/540]	Train Loss: 236534.88888888888	Time: 0:00:07.307885: : 10it [01:13,  7.37s/it]


## Test topics

In [10]:
# Show the 5-word list for each topic of the 20-topic model.
ctm_20.get_topic_lists(5)

[['plans', 'criteria', 'permits', 'assessment', 'term'],
 ['establishing', 'marine', 'salinity', 'fields', 'reservoir'],
 ['stakeholder', 'authority', 'sewerage', 'concrete', 'instance'],
 ['department', 'owned', 'wells', 'uncertainties', 'name'],
 ['population', 'provinces', 'oblast', 'move', 'six'],
 ['costs', 'technological', 'ways', 'pilot', 'rainwater'],
 ['de', 'oecd', 'co', 'regional', 'food'],
 ['resources', 'water', 'set', 'policies', 'charges'],
 ['allocation', 'april', 'tax', 'government', 'solution'],
 ['savings', 'failure', 'designing', 'pharmaceuticals', 'affordable'],
 ['takes', 'reduces', 'issue', 'investors', 'almost'],
 ['risk', 'environmental', 'benefits', 'development', 'regional'],
 ['six', 'owned', 'oblast', 'marine', 'switzerland'],
 ['allocation', 'oecd', 'van', 'permits', 'regional'],
 ['points', 'load', 'des', 'planned', 'designing'],
 ['aires', 'processes', 'indicators', 'authority', 'housing'],
 ['future', 'scale', 'measures', 'building', 'markets'],
 ['proj

## Plot topics

In [23]:
# Export each trained model in the format pyLDAvis expects, using 20 sampling
# rounds per model (20-topic model first, then 62-topic model).
lda_vis_data_20, lda_vis_data_62 = (
    trained_model.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=20)
    for trained_model in (ctm_20, ctm_62)
)

Sampling: [20/20]: : 20it [02:27,  7.39s/it]
Sampling: [20/20]: : 20it [02:29,  7.46s/it]


In [24]:
import pyLDAvis as vis

# Prepare the pyLDAvis data for both models; R=10 is passed through to
# vis.prepare unchanged for each.
ctm_pd_20, ctm_pd_62 = (
    vis.prepare(**vis_payload, R=10)
    for vis_payload in (lda_vis_data_20, lda_vis_data_62)
)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [25]:
# Render the prepared pyLDAvis visualisation for the 20-topic model.
vis.display(ctm_pd_20)

In [26]:
# Render the prepared pyLDAvis visualisation for the 62-topic model.
vis.display(ctm_pd_62)


## Topic Predictions

Take a document and see which topic has been assigned to it.

In [15]:
# topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

In [16]:
# preprocessed_documents[0] # see the text of our preprocessed document

In [17]:
# import numpy as np
# topic_number = np.argmax(topics_predictions[0]) # get the topic id of the first document

In [18]:
# topic_number

In [19]:
# ctm.get_topic_lists(5)[18]

In [20]:
# ctm.get_topic_lists(5)[topic_number]

## Save Model

In [21]:
# ctm_20.save(models_dir="models/")
# ctm_62.save(models_dir="models/")

In [22]:
# Loading a saved model 
# ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, num_epochs=100, n_components=50)
# ctm.load("/content/contextualized_topic_model_nc_50_tpm_0.0_tpv_0.98_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99",
#                                                                                                       epoch=19)
# ctm.get_topic_lists(5)