## Contextualized Topic Models (CTMs) 

Using CTMs to produce topic models
 

## Preprocessing

to ensure the citations are stripped of html and preprocessed according to a pipeline 

In [1]:
!pip install contextualized_topic_models pyLDAvis scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)


In [2]:
from bs4 import BeautifulSoup
import string
import re

def remove_html(x):
    soup = BeautifulSoup(x, 'html.parser')
    tags = soup.find_all('cite')
    text = soup.get_text()
    text = text.replace('  ', '')
    text = text.replace(' et ', '')
    text = text.replace(' al ', '')
    text = text.replace(' et. ', '')
    text = text.replace(' al. ', '')
    return text

In [3]:
import scispacy
import spacy
import string

nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("merge_entities")

def merge_entities(x):
    doc = nlp(x)
    return " ".join([re.sub(r'\W+', '', t.text.strip().translate(str.maketrans('', '', string.punctuation)).replace(" ", "_").lower()) for t in doc])


In [4]:
import pandas as pd

citations = pd.read_csv('./example_doc_citations.csv')
citations['text'] = citations['text'].apply(
    lambda x : remove_html(x)
)
citations['ner_merged_text'] = citations['text'].apply(
    lambda x: merge_entities(x)
)

In [5]:
from gensim.corpora.dictionary import Dictionary

removal = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
remove_text = ['al', 'et', 'al.']
authors = [
    'mongeon',
    'hus'
]


unpreprocessed_corpus = citations['text']

preprocessed_documents = []
for doc in nlp.pipe(citations['ner_merged_text']):
    proj_tok = [token.lemma_.lower() for token in doc if "_" in token.text or (len(token.text) > 2 and token.text not in remove_text and token.pos_ not in removal and not token.is_stop and token.is_alpha)]
    proj_tok = [
        tok for tok in proj_tok
        if all([author not in tok for author in authors])
    ]
    preprocessed_documents.append(' '.join(proj_tok))
    
    
texts = [doc.split(' ') for doc in preprocessed_documents]
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.9)


In [6]:
preprocessed_documents = [' '.join([dictionary[token[0]].replace("-", "_").replace("®", "_") for token in dictionary.doc2bow(text)]) for text in texts]

In [7]:
from contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import torch
import random
import numpy as np

In [8]:
def fix_seeds():
    torch.manual_seed(10)
    torch.cuda.manual_seed(10)
    np.random.seed(10)
    random.seed(10)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True

In [9]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO

fix_seeds()

num_topics = [3, 5, 10, 15, 20, 25, 50, 100, 250]

corpus = [doc.split(' ') for doc in preprocessed_documents]
model_results = []
embedding_models = [
    "paraphrase-distilroberta-base-v2",
    "sentence-transformers/allenai-specter",
    "allenai/aspire-sentence-embedder",
    "allenai/aspire-contextualsentence-multim-compsci", 
]
for embedding_model in embedding_models:
    tp = TopicModelDataPreparation(embedding_model)
    training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)
    for n_components in num_topics:
        print("num topics:", n_components)
        ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, 
                     n_components=n_components, num_epochs=50)
        ctm.fit(training_dataset, n_samples=20)
        coh = CoherenceNPMI(ctm.get_topic_lists(10), [doc.split(' ') for doc in preprocessed_documents])
        coh_score = coh.score()
        print("coherence score:", coh_score)
        diversity_score = InvertedRBO(ctm.get_topic_lists(10)).score()
        print("diversity score LDA:", diversity_score)
        model_results.append({
            "coherence": coh_score,
            "diversity": diversity_score,
            "num_topics": n_components,
            "model": ctm,
            "embedding_model": embedding_model,
            "tp": tp,
            "training_dataset": training_dataset
        })



Batches:   0%|          | 0/4 [00:00<?, ?it/s]



num topics: 3


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 177.24774254800096	Time: 0:00:00.357226: : 50it [00:17,  2.83it/s]
Sampling: [20/20]: : 20it [00:06,  3.22it/s]


coherence score: -0.2631906989752105
diversity score LDA: 1.0
num topics: 5


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 172.28467846278446	Time: 0:00:00.358753: : 50it [00:18,  2.67it/s]
Sampling: [20/20]: : 20it [00:06,  3.21it/s]


coherence score: -0.11875868500459016
diversity score LDA: 0.9698818720792857
num topics: 10


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 171.84883775481984	Time: 0:00:00.362376: : 50it [00:19,  2.58it/s]
Sampling: [20/20]: : 20it [00:06,  3.17it/s]


coherence score: -0.062270815673325285
diversity score LDA: 0.9699455483157143
num topics: 15


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 172.15615864017067	Time: 0:00:00.339837: : 50it [00:17,  2.78it/s]
Sampling: [20/20]: : 20it [00:06,  3.15it/s]


coherence score: -0.10312385109917867
diversity score LDA: 0.9643189906817007
num topics: 20


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 173.78861557759166	Time: 0:00:00.355672: : 50it [00:18,  2.75it/s]
Sampling: [20/20]: : 20it [00:06,  3.14it/s]


coherence score: -0.0876926799456792
diversity score LDA: 0.93973130891297
num topics: 25


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 174.93850804460334	Time: 0:00:00.371637: : 50it [00:18,  2.64it/s]
Sampling: [20/20]: : 20it [00:06,  3.15it/s]


coherence score: -0.05637737640789706
diversity score LDA: 0.9400377133493809
num topics: 50


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 184.7961533808075	Time: 0:00:00.346081: : 50it [00:18,  2.74it/s]
Sampling: [20/20]: : 20it [00:06,  3.07it/s]


coherence score: -0.0833063531208772
diversity score LDA: 0.93895836529386
num topics: 100


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 207.1381218641356	Time: 0:00:00.363916: : 50it [00:18,  2.72it/s]
Sampling: [20/20]: : 20it [00:06,  3.02it/s]


coherence score: -0.11704717171364268
diversity score LDA: 0.9371013196203968
num topics: 250


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 267.3013787926675	Time: 0:00:00.353036: : 50it [00:18,  2.70it/s]
Sampling: [20/20]: : 20it [00:06,  2.88it/s]


coherence score: -0.1954924809457843
diversity score LDA: 0.955651476004834




Batches:   0%|          | 0/4 [00:00<?, ?it/s]



num topics: 3


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 172.94141921321904	Time: 0:00:00.365144: : 50it [00:19,  2.50it/s]
Sampling: [20/20]: : 20it [00:06,  2.94it/s]


coherence score: -0.24064952533781359
diversity score LDA: 1.0
num topics: 5


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 172.9044450260746	Time: 0:00:00.396044: : 50it [00:19,  2.58it/s]
Sampling: [20/20]: : 20it [00:06,  2.95it/s]


coherence score: -0.15725600972413495
diversity score LDA: 0.9762778513128572
num topics: 10


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 174.63030689495102	Time: 0:00:00.381443: : 50it [00:19,  2.58it/s]
Sampling: [20/20]: : 20it [00:06,  2.93it/s]


coherence score: -0.056732877122992284
diversity score LDA: 0.9156062199192063
num topics: 15


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 175.94420383612515	Time: 0:00:00.382874: : 50it [00:19,  2.59it/s]
Sampling: [20/20]: : 20it [00:07,  2.75it/s]


coherence score: -0.04725688140891185
diversity score LDA: 0.9128445448623129
num topics: 20


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 179.02070411267383	Time: 0:00:00.383067: : 50it [00:20,  2.43it/s]
Sampling: [20/20]: : 20it [00:07,  2.86it/s]


coherence score: -0.0561339236370925
diversity score LDA: 0.9195980722837218
num topics: 25


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 180.36972952552148	Time: 0:00:00.393448: : 50it [00:19,  2.52it/s]
Sampling: [20/20]: : 20it [00:07,  2.84it/s]


coherence score: -0.02887655045450726
diversity score LDA: 0.9117168991832619
num topics: 50


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 193.46307766573167	Time: 0:00:00.378858: : 50it [00:19,  2.51it/s]
Sampling: [20/20]: : 20it [00:07,  2.78it/s]


coherence score: -0.07849000696410974
diversity score LDA: 0.8837062477956094
num topics: 100


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 215.4784705623815	Time: 0:00:00.391373: : 50it [00:20,  2.50it/s]
Sampling: [20/20]: : 20it [00:07,  2.75it/s]


coherence score: -0.1543823181904119
diversity score LDA: 0.9312996349632006
num topics: 250


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 283.67177870180154	Time: 0:00:00.399798: : 50it [00:20,  2.40it/s]
Sampling: [20/20]: : 20it [00:07,  2.64it/s]


coherence score: -0.3122664871544185
diversity score LDA: 0.9760449095631807




Batches:   0%|          | 0/4 [00:00<?, ?it/s]



num topics: 3


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 173.5848371696231	Time: 0:00:00.390030: : 50it [00:20,  2.47it/s]
Sampling: [20/20]: : 20it [00:07,  2.75it/s]


coherence score: -0.14739614229477613
diversity score LDA: 1.0
num topics: 5


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 174.0824003930942	Time: 0:00:00.404822: : 50it [00:20,  2.47it/s]
Sampling: [20/20]: : 20it [00:07,  2.77it/s]


coherence score: -0.15526260480340076
diversity score LDA: 0.99564749821
num topics: 10


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 175.76144837428888	Time: 0:00:00.383354: : 50it [00:20,  2.46it/s]
Sampling: [20/20]: : 20it [00:08,  2.48it/s]


coherence score: -0.07412771666406018
diversity score LDA: 0.9442119350292063
num topics: 15


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 177.44822305131953	Time: 0:00:00.398130: : 50it [00:20,  2.44it/s]
Sampling: [20/20]: : 20it [00:07,  2.73it/s]


coherence score: -0.05885685333277651
diversity score LDA: 0.8872955236015646
num topics: 20


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 181.0981500869153	Time: 0:00:00.411657: : 50it [00:20,  2.40it/s]
Sampling: [20/20]: : 20it [00:07,  2.68it/s]


coherence score: -0.05727140409580742
diversity score LDA: 0.9072848851711278
num topics: 25


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 180.60768978152655	Time: 0:00:00.409906: : 50it [00:20,  2.41it/s]
Sampling: [20/20]: : 20it [00:07,  2.68it/s]


coherence score: -0.09652241566354144
diversity score LDA: 0.9202648294917619
num topics: 50


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 195.4235731816925	Time: 0:00:00.407855: : 50it [00:21,  2.31it/s]
Sampling: [20/20]: : 20it [00:07,  2.70it/s]


coherence score: -0.1322260383141062
diversity score LDA: 0.9259191739697726
num topics: 100


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 219.96142392936156	Time: 0:00:00.410130: : 50it [00:20,  2.40it/s]
Sampling: [20/20]: : 20it [00:08,  2.26it/s]


coherence score: -0.2679128286418246
diversity score LDA: 0.9648700301235426
num topics: 250


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 283.1849925430626	Time: 0:00:00.413594: : 50it [00:24,  2.03it/s]
Sampling: [20/20]: : 20it [00:07,  2.55it/s]


coherence score: -0.31119549567331867
diversity score LDA: 0.9781470685580108




Batches:   0%|          | 0/4 [00:00<?, ?it/s]



num topics: 3


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 173.7730001456819	Time: 0:00:00.409753: : 50it [00:21,  2.31it/s]
Sampling: [20/20]: : 20it [00:07,  2.69it/s]


coherence score: -0.16846463628813435
diversity score LDA: 1.0
num topics: 5


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 174.5211186270346	Time: 0:00:00.424659: : 50it [00:20,  2.39it/s]
Sampling: [20/20]: : 20it [00:07,  2.56it/s]


coherence score: -0.09549355227732978
diversity score LDA: 0.9401247779207142
num topics: 10


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 174.53641368224558	Time: 0:00:00.409047: : 50it [00:21,  2.32it/s]
Sampling: [20/20]: : 20it [00:07,  2.62it/s]


coherence score: -0.09223133285887493
diversity score LDA: 0.9496266916992063
num topics: 15


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 176.33253527230167	Time: 0:00:00.423592: : 50it [00:21,  2.35it/s]
Sampling: [20/20]: : 20it [00:07,  2.63it/s]


coherence score: -0.06346590866826526
diversity score LDA: 0.9007158973403402
num topics: 20


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 181.22245254227244	Time: 0:00:00.416244: : 50it [00:22,  2.24it/s]
Sampling: [20/20]: : 20it [00:07,  2.61it/s]


coherence score: -0.10074492538430906
diversity score LDA: 0.8860728335767293
num topics: 25


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 186.25960512800253	Time: 0:00:00.424078: : 50it [00:21,  2.33it/s]
Sampling: [20/20]: : 20it [00:07,  2.60it/s]


coherence score: -0.11206173748142596
diversity score LDA: 0.9217933183148095
num topics: 50


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 196.58475846436474	Time: 0:00:00.426249: : 50it [00:21,  2.29it/s]
Sampling: [20/20]: : 20it [00:07,  2.54it/s]


coherence score: -0.15200987226589485
diversity score LDA: 0.9471349979826531
num topics: 100


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 219.07160882190266	Time: 0:00:00.424581: : 50it [00:21,  2.28it/s]
Sampling: [20/20]: : 20it [00:08,  2.32it/s]


coherence score: -0.2804807101646764
diversity score LDA: 0.9628271792083262
num topics: 250


Epoch: [50/50]	 Seen Samples: [39550/39550]	Train Loss: 283.6315211164665	Time: 0:00:00.433919: : 50it [00:21,  2.29it/s]
Sampling: [20/20]: : 20it [00:08,  2.42it/s]


coherence score: -0.30738995591767415
diversity score LDA: 0.9759608815891555


## Select Top Topic Model

In [10]:
results_df = pd.DataFrame(model_results)

In [11]:
results_df.sort_values(by='coherence', ascending=False)[
    ['coherence','diversity','num_topics', 'embedding_model']
]

Unnamed: 0,coherence,diversity,num_topics,embedding_model
14,-0.028877,0.911717,25,sentence-transformers/allenai-specter
12,-0.047257,0.912845,15,sentence-transformers/allenai-specter
13,-0.056134,0.919598,20,sentence-transformers/allenai-specter
5,-0.056377,0.940038,25,paraphrase-distilroberta-base-v2
11,-0.056733,0.915606,10,sentence-transformers/allenai-specter
22,-0.057271,0.907285,20,allenai/aspire-sentence-embedder
21,-0.058857,0.887296,15,allenai/aspire-sentence-embedder
2,-0.062271,0.969946,10,paraphrase-distilroberta-base-v2
30,-0.063466,0.900716,15,allenai/aspire-contextualsentence-multim-compsci
20,-0.074128,0.944212,10,allenai/aspire-sentence-embedder


In [12]:
results_df.sort_values(by='diversity', ascending=False)[
    ['coherence','diversity','num_topics', 'embedding_model']
]

Unnamed: 0,coherence,diversity,num_topics,embedding_model
0,-0.263191,1.0,3,paraphrase-distilroberta-base-v2
9,-0.24065,1.0,3,sentence-transformers/allenai-specter
27,-0.168465,1.0,3,allenai/aspire-contextualsentence-multim-compsci
18,-0.147396,1.0,3,allenai/aspire-sentence-embedder
19,-0.155263,0.995647,5,allenai/aspire-sentence-embedder
26,-0.311195,0.978147,250,allenai/aspire-sentence-embedder
10,-0.157256,0.976278,5,sentence-transformers/allenai-specter
17,-0.312266,0.976045,250,sentence-transformers/allenai-specter
35,-0.30739,0.975961,250,allenai/aspire-contextualsentence-multim-compsci
2,-0.062271,0.969946,10,paraphrase-distilroberta-base-v2


In [30]:
ctm_model = results_df.sort_values(by='coherence', ascending=False).iloc[0]['model']

In [31]:
ctm_model.get_topic_lists(5)

[['search', 'article', 'perform', 'literature', 'keyword'],
 ['search', 'keyword', 'article', 'database', 'field'],
 ['web_of_science', 'scopus', 'bibliometric_analyse', 'compare', 'large'],
 ['relevant', 'keyword', 'search', 'review', 'follow'],
 ['author', 'publication', 'emerge', 'country', 'fact'],
 ['english', 'language', 'publication', 'paulhu', 'researcher'],
 ['citation', 'datum', 'google_scholar', 'bibliometric_analyse', 'evaluation'],
 ['coverage', 'wos', 'scopus', 'citation', 'compare'],
 ['wos', 'journal', 'scopus', 'coverage', 'compare'],
 ['search', 'keyword', 'review', 'literature', 'abstract'],
 ['journal', 'wos', 'scopus', 'compare', 'show'],
 ['research', 'review', 'identify', 'purpose', 'impact'],
 ['exclusion', 'britton', 'sdgs', 'people', 'ınal'],
 ['web_of_science', 'well', 'database', 'scopus_database', 'choose'],
 ['purpose', 'review', 'engine', 'field', 'research'],
 ['identify', 'literature', 'define', 'sustainability', 'keyword'],
 ['coverage', 'scopus', 'wos

## Visualize topics

In [15]:
import pyLDAvis as vis

ctm = results_df.sort_values(by='coherence', ascending=False).iloc[0]['model']
tp = results_df.sort_values(by='coherence', ascending=False).iloc[0]['tp']
training_dataset = results_df.sort_values(by='coherence', ascending=False).iloc[0]['training_dataset']
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=1)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  from collections import Iterable
Sampling: [1/1]: : 1it [00:00,  2.36it/s]
  by='saliency', ascending=False).head(R).drop('saliency', 1)


## Compute Topic Index

In [19]:
import numpy as np
import json
for i, result in results_df.iterrows():
    documents = []
    topic_predictions = result['model'].get_thetas(training_dataset, n_samples=5)
    for topics, doi, cite_id in zip(topic_predictions, citations['source_doi'], citations['id']):
        topic_number = np.argmax(topics)
        keywords = [term for term in result['model'].get_topic_lists(10)[topic_number]]
        documents.append({
            "doi": doi,
            "cite_id": cite_id,
            "keywords": [word for word in set(keywords)]
        })
        with open(f'./ctm_{result["num_topics"]}_{result["embedding_model"].replace("/", "_")}_topic_index.json', 'w+') as f:
            json.dump({
                "embedding_model": result["embedding_model"],
                "topics": result["num_topics"],
                "diversity": result["diversity"],
                "coherence": result["coherence"],
                "documents": documents
            }, f)

Sampling: [5/5]: : 5it [00:03,  1.57it/s]
Sampling: [5/5]: : 5it [00:02,  2.37it/s]
Sampling: [5/5]: : 5it [00:02,  2.41it/s]
Sampling: [5/5]: : 5it [00:02,  2.45it/s]
Sampling: [5/5]: : 5it [00:02,  2.46it/s]
Sampling: [5/5]: : 5it [00:02,  2.46it/s]
Sampling: [5/5]: : 5it [00:02,  1.76it/s]
Sampling: [5/5]: : 5it [00:02,  2.38it/s]
Sampling: [5/5]: : 5it [00:02,  1.93it/s]
Sampling: [5/5]: : 5it [00:02,  2.45it/s]
Sampling: [5/5]: : 5it [00:02,  2.43it/s]
Sampling: [5/5]: : 5it [00:02,  2.45it/s]
Sampling: [5/5]: : 5it [00:02,  2.47it/s]
Sampling: [5/5]: : 5it [00:02,  2.41it/s]
Sampling: [5/5]: : 5it [00:02,  2.36it/s]
Sampling: [5/5]: : 5it [00:02,  2.37it/s]
Sampling: [5/5]: : 5it [00:02,  2.35it/s]
Sampling: [5/5]: : 5it [00:02,  2.30it/s]
Sampling: [5/5]: : 5it [00:02,  2.40it/s]
Sampling: [5/5]: : 5it [00:02,  2.42it/s]
Sampling: [5/5]: : 5it [00:02,  2.41it/s]
Sampling: [5/5]: : 5it [00:02,  2.42it/s]
Sampling: [5/5]: : 5it [00:02,  2.44it/s]
Sampling: [5/5]: : 5it [00:02,  2.