## BERTopic Models (BTM) 


Using BTMs to produce topic models
 

## Preprocessing

to ensure the citations are stripped of html and preprocessed according to a pipeline 

In [1]:
!pip install contextualized_topic_models pyLDAvis scispacy bertopic
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)


In [2]:
from bs4 import BeautifulSoup
import string
import re

def remove_html(x):
    soup = BeautifulSoup(x, 'html.parser')
    tags = soup.find_all('cite')
    text = soup.get_text()
    text = text.replace('  ', '')
    text = text.replace(' et ', '')
    text = text.replace(' al ', '')
    text = text.replace(' et. ', '')
    text = text.replace(' al. ', '')
    return text

In [72]:
import scispacy
import spacy
import string

nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [4]:
import pandas as pd

citations = pd.read_csv('./example_doc_citations.csv')
citations['text'] = citations['text'].apply(
    lambda x : remove_html(x)
)

In [110]:
from gensim.corpora.dictionary import Dictionary

def tokenizer(text):
  removal = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
  remove_text = ['al', 'et', 'al.']
  authors = [
      'mongeon',
      'hus'
  ]

  doc = nlp(text)
  proj_tok = [re.sub(r'\W+', '', token.text.strip().translate(str.maketrans('', '', string.punctuation)).replace(" ", "_").lower()) 
              for token in doc if token.ent_type_ != '' or (len(token.text) > 2 and token.text not in remove_text and token.pos_ not in removal and not token.is_stop and token.is_alpha)]
  proj_tok = [
      tok for tok in proj_tok
      if all([author not in tok for author in authors])
  ]
  
  return proj_tok
      

In [111]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(tokenizer=tokenizer)

In [112]:
import nltk
import torch
import random
import numpy as np

In [113]:
def fix_seeds():
    torch.manual_seed(10)
    torch.cuda.manual_seed(10)
    np.random.seed(10)
    random.seed(10)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True

In [120]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO

fix_seeds()

num_topics = [3, 5, 10, 15, 20, 25, 50, 100, 250]

model_results = []
embedding_models = [
    "paraphrase-distilroberta-base-v2",
    "sentence-transformers/allenai-specter",
    "allenai/aspire-sentence-embedder",
    "allenai/aspire-contextualsentence-multim-compsci", 
]

for embedding_model in embedding_models:
    btm = BERTopic(verbose=True, embedding_model=embedding_model, vectorizer_model=vectorizer_model)
    btm._preprocess_text = lambda x: x

    topics, probs = btm.fit_transform(citations['text'])
    cleaned_docs = btm._preprocess_text(citations['text'])
    vectorizer = btm.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    topic_list = [[word[0] for word in topic]
                  for topic in list(btm.get_topics().values())]
    coh = CoherenceNPMI(topic_list, [analyzer(doc) for doc in cleaned_docs])
    coh_score = coh.score()
    print("coherence score:", coh_score)
    diversity_score = InvertedRBO(topic_list).score()
    print("diversity score LDA:", diversity_score)
    model_results.append({
        "coherence": coh_score,
        "diversity": diversity_score,
        "num_topics": len(btm.get_topics().keys()),
        "model": btm,
        "embedding_model": embedding_model,
        "topcs": topics,
        "probs": probs
    })

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2022-08-17 01:01:08,061 - BERTopic - Transformed documents to Embeddings
2022-08-17 01:01:12,343 - BERTopic - Reduced dimensionality
2022-08-17 01:01:12,378 - BERTopic - Clustered reduced embeddings


coherence score: 0.004272286176292614
diversity score LDA: 0.8473236894510989


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2022-08-17 01:02:01,974 - BERTopic - Transformed documents to Embeddings
2022-08-17 01:02:06,359 - BERTopic - Reduced dimensionality
2022-08-17 01:02:06,395 - BERTopic - Clustered reduced embeddings


coherence score: -0.057098685026380165
diversity score LDA: 0.8843560141857824




Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2022-08-17 01:03:00,073 - BERTopic - Transformed documents to Embeddings
2022-08-17 01:03:04,421 - BERTopic - Reduced dimensionality
2022-08-17 01:03:04,450 - BERTopic - Clustered reduced embeddings


coherence score: 0.06827238873350273
diversity score LDA: 0.636974338925




Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2022-08-17 01:03:59,404 - BERTopic - Transformed documents to Embeddings
2022-08-17 01:04:04,177 - BERTopic - Reduced dimensionality
2022-08-17 01:04:04,213 - BERTopic - Clustered reduced embeddings


coherence score: -0.07842726827616794
diversity score LDA: 0.8956448080528361


## Select Top Topic Model

In [121]:
results_df = pd.DataFrame(model_results)

In [122]:
results_df.sort_values(by='coherence', ascending=False)[
    ['coherence','diversity','num_topics', 'embedding_model']
]

Unnamed: 0,coherence,diversity,num_topics,embedding_model
2,0.068272,0.636974,4,allenai/aspire-sentence-embedder
0,0.004272,0.847324,13,paraphrase-distilroberta-base-v2
1,-0.057099,0.884356,15,sentence-transformers/allenai-specter
3,-0.078427,0.895645,17,allenai/aspire-contextualsentence-multim-compsci


In [123]:
results_df.sort_values(by='diversity', ascending=False)[
    ['coherence','diversity','num_topics', 'embedding_model']
]

Unnamed: 0,coherence,diversity,num_topics,embedding_model
3,-0.078427,0.895645,17,allenai/aspire-contextualsentence-multim-compsci
1,-0.057099,0.884356,15,sentence-transformers/allenai-specter
0,0.004272,0.847324,13,paraphrase-distilroberta-base-v2
2,0.068272,0.636974,4,allenai/aspire-sentence-embedder


In [134]:
btm_model = results_df.iloc[1]['model']

In [135]:
for topic in list(btm_model.get_topics().values()):
  print(
      [word[0] for word in topic][:5]
      )

['scopus', 'wos', 'coverage', 'databases', 'science']
['scopus', 'database', 'scopus_database', 'largest', 'journals']
['wos', 'scopus', 'journals', 'coverage', 'social_sciences']
['research', 'keywords', 'literature', 'scopus', 'search']
['countries', 'researchers', 'publication', 'publications', 'bias']
['search', 'based', 'review', 'databases', 'articles']
['data', 'citation', 'limitations', 'coverage', 'reference']
['wos', 'publications', 'india', 'database', 'covid19']
['scopus', 'wos', 'bibliometric_analyses', 'database', 'journals']
['search', 'articles', 'scopus', 'order', 'performed']
['bibliometric_analysis', 'evaluate', 'bibliometric_analyses', 'pritchard', 'communication']
['search', 'selected', 'databases', 'comprehensive', 'scopus']
['science', 'wos', 'web_of_science', 'outputs', 'collection']
['management', 'scopus_index', 'better', 'research', 'superior']
['list', 'social_sciences', 'unethical', 'journals', 'published']


## Visualize topics

In [136]:
btm_model.visualize_topics()


## Compute Topic Index

In [127]:
import numpy as np
import json
for i, result in results_df.iterrows():
    documents = []
    for topic, doi, cite_id in zip(result['topcs'], citations['source_doi'], citations['id']):
        keywords = [term[0] for term in result['model'].get_topic(topic)]
        documents.append({
            "doi": doi,
            "cite_id": cite_id,
            "keywords": [word for word in set(keywords)]
        })
        with open(f'./btm_{result["num_topics"]}_{result["embedding_model"].replace("/", "_")}_topic_index.json', 'w+') as f:
            json.dump({
                "embedding_model": result["embedding_model"],
                "topics": result["num_topics"],
                "diversity": result["diversity"],
                "coherence": result["coherence"],
                "documents": documents
            }, f)

In [None]:
!shopt -s globstar

In [128]:
!rm -rf btm_data
!mkdir btm_data
!mv btm* ./btm_data
!zip -r ./btm_topic_indexes.zip ./btm_data 

mv: cannot move 'btm_data' to a subdirectory of itself, './btm_data/btm_data'
  adding: btm_data/ (stored 0%)
  adding: btm_data/btm_4_allenai_aspire-sentence-embedder_topic_index.json (deflated 91%)
  adding: btm_data/btm_topic_indexes.zip (stored 0%)
  adding: btm_data/btm_15_sentence-transformers_allenai-specter_topic_index.json (deflated 90%)
  adding: btm_data/btm_17_allenai_aspire-contextualsentence-multim-compsci_topic_index.json (deflated 90%)
  adding: btm_data/btm_13_paraphrase-distilroberta-base-v2_topic_index.json (deflated 90%)


In [129]:
from google.colab import files
files.download("./btm_topic_indexes.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>