Install dependencies

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing visible in this notebook reads from /content/drive
# (the dataset is fetched with gdown further down) — confirm whether this
# mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the preferred locale encoding report UTF-8 for the rest of the session.

import locale


def getpreferredencoding(do_setlocale = True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only so the signature matches the standard
    library function; its value is ignored.
    """
    return "UTF-8"


# Monkey-patch the standard-library lookup so later callers receive "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install bertopic
!pip install flair
!apt-get -qq install -y libfluidsynth1

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transfor

In [None]:
# --- Data processing ---
import pandas as pd
import numpy as np

# --- Text preprocessing ---
import nltk
# Fetch the NLTK corpora used for stopword removal and WordNet lemmatization.
for _nltk_resource in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(_nltk_resource)
wn = nltk.WordNetLemmatizer()

# --- Topic model ---
from bertopic import BERTopic

# --- Dimension reduction ---
from umap import UMAP

# --- Clustering ---
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# --- Count vectorization ---
from sklearn.feature_extraction.text import CountVectorizer

# --- Sentence transformer ---
from sentence_transformers import SentenceTransformer

# --- Hugging Face pipeline ---
from transformers.pipelines import pipeline

# --- Flair embeddings ---
from flair.embeddings import TransformerDocumentEmbeddings, WordEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


Read the dataset

In [None]:
# Source file (Google Drive share link):
# https://drive.google.com/file/d/1UamUyFMrVT50eQU3qudcTXc_GXqJz9c5/view?usp=sharing

import gdown
import pandas as pd

# Google Drive file ID of the dataset, taken from the share link above
file_id = '1UamUyFMrVT50eQU3qudcTXc_GXqJz9c5'

# URL to download the file
url = f'https://drive.google.com/uc?id={file_id}'

# Output path for the downloaded file (relative to the working directory)
output = 'scopus.csv'

# Download the file. Depending on the gdown version a failed download either
# raises or returns None; check the return value so a failure is reported here
# instead of surfacing later as a confusing read_csv error.
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    raise RuntimeError(
        f'Download of {url} failed; {output} was not written. '
        'Check the file ID and that the file is shared publicly.'
    )

# Read the downloaded CSV file into a DataFrame.
# NOTE(review): the variable is called papers_nips although the file is named
# scopus.csv; the name is kept because later cells refer to it.
papers_nips = pd.read_csv(output)

# Preview the first rows (rich DataFrame display as the cell's last expression)
papers_nips.head()


Downloading...
From: https://drive.google.com/uc?id=1UamUyFMrVT50eQU3qudcTXc_GXqJz9c5
To: /content/scopus.csv
100%|██████████| 3.75M/3.75M [00:00<00:00, 255MB/s]

                                             Authors  \
0  Chae B.; Park Y.-M.; Lee S.-J.; Lee J.-S.; Kan...   
1                                          Hughes J.   
2              Palmer R.F.; Dempsey T.T.; Afrin L.B.   
3  Jin X.; Wang Y.; Huang C.; Luo X.; Gao X.; She...   
4  Park Y.-M.; Lee S.-J.; Lee J.-S.; Na K.-S.; Ka...   

                                   Author full names  \
0  Chae, Boram (57315923900); Park, Young-Min (56...   
1                     Hughes, Jennifer (57210218596)   
2  Palmer, Raymond F. (7402726982); Dempsey, Tani...   
3  Jin, Xingyue (57221405916); Wang, Yuxin (58768...   
4  Park, Young-Min (56245805400); Lee, So-Jin (55...   

                                        Author(s) ID  \
0  57315923900; 56245805400; 55262324300; 5610788...   
1                                        57210218596   
2                7402726982; 57208421590; 6602167168   
3  57221405916; 58768327700; 57216579345; 8511702...   
4  56245805400; 55262324300; 56107882900; 1451




In [None]:
# Get the dataset information: column names, non-null counts, dtypes and
# memory usage — a quick way to spot columns with missing values
papers_nips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1649 entries, 0 to 1648
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Authors            1642 non-null   object 
 1   Author full names  1593 non-null   object 
 2   Author(s) ID       1593 non-null   object 
 3   Title              1600 non-null   object 
 4   Year               1600 non-null   float64
 5   Link               1600 non-null   object 
 6   Abstract           1600 non-null   object 
dtypes: float64(1), object(6)
memory usage: 90.3+ KB


In [None]:
# Load NLTK's English stopword list and print its size and contents.
# Nothing is removed from the dataset here; the actual stopword removal is
# done by preprocess_text in the next cell.
stopwords = nltk.corpus.stopwords.words('english')
print(f'There are {len(stopwords)} default stopwords. They are {stopwords}')

There are 179 default stopwords. They are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'no

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed below: the stopword list and the WordNet
# data used by the lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset.
# Only `output` is used: the CSV is read from the local path and is not
# downloaded again, so 'scopus.csv' must already exist in the working
# directory. `file_id` and `url` are re-declared but not used in this cell.
file_id = '1UamUyFMrVT50eQU3qudcTXc_GXqJz9c5'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'scopus.csv'
papers_nips = pd.read_csv(output)

# Define stopwords and lemmatizer.
# `stopwords` is rebound here from the imported nltk corpus object to a set of
# English stopwords; the import at the top of this cell restores the original
# binding whenever the cell is re-run.
stopwords = set(stopwords.words('english'))
wn = WordNetLemmatizer()

# Strip English stopwords from a text and lemmatize what is left
def preprocess_text(text):
    """Return `text` without stopwords and with every remaining token lemmatized.

    Values that `pd.isnull` reports as missing are turned into "".
    Tokens come from whitespace splitting. The stopword test is done on the
    lower-cased token, while the token itself is handed to the lemmatizer
    unchanged. Surviving tokens are joined back with single spaces.
    """
    if pd.isnull(text):
        return ""
    return ' '.join(
        wn.lemmatize(token)
        for token in text.split()
        if token.lower() not in stopwords
    )

# Clean the 'Title' column: preprocess_text drops English stopwords and
# lemmatizes the remaining tokens in a single pass.
# NOTE(review): the column names say "abstract" although the text comes from
# 'Title'; the names are kept because later cells read 'abstract_lemmatized'.
papers_nips['abstract_without_stopwords'] = papers_nips['Title'].apply(preprocess_text)

# The column above is already lemmatized, so reuse it rather than pushing the
# cleaned text through the same stopword+lemmatize function a second time.
papers_nips['abstract_lemmatized'] = papers_nips['abstract_without_stopwords'].copy()

# Take a look at the data (rich DataFrame display as the cell's last expression)
papers_nips.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             Authors  \
0  Chae B.; Park Y.-M.; Lee S.-J.; Lee J.-S.; Kan...   
1                                          Hughes J.   
2              Palmer R.F.; Dempsey T.T.; Afrin L.B.   
3  Jin X.; Wang Y.; Huang C.; Luo X.; Gao X.; She...   
4  Park Y.-M.; Lee S.-J.; Lee J.-S.; Na K.-S.; Ka...   

                                   Author full names  \
0  Chae, Boram (57315923900); Park, Young-Min (56...   
1                     Hughes, Jennifer (57210218596)   
2  Palmer, Raymond F. (7402726982); Dempsey, Tani...   
3  Jin, Xingyue (57221405916); Wang, Yuxin (58768...   
4  Park, Young-Min (56245805400); Lee, So-Jin (55...   

                                        Author(s) ID  \
0  57315923900; 56245805400; 55262324300; 5610788...   
1                                        57210218596   
2                7402726982; 57208421590; 6602167168   
3  57221405916; 58768327700; 57216579345; 8511702...   
4  56245805400; 55262324300; 56107882900; 1451

In [None]:
# Dimension reduction: UMAP configured with 5 output components and cosine
# distance; random_state fixes its result across runs.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# Clustering model: KMeans with 9 clusters is handed to BERTopic in place of
# HDBSCAN.
# random_state makes the cluster (topic) assignment reproducible; without it
# topic ids and contents change from one run to the next. n_init is spelled
# out so the result does not depend on the scikit-learn version's default.
# Alternative clusterer:
#   HDBSCAN(min_cluster_size=5, min_samples=5, metric='euclidean', prediction_data=True)
kmeans_model = KMeans(n_clusters=9, n_init=10, random_state=100)

# --- Alternative embedding back-ends ---
# NOTE(review): none of the objects in this section is passed to the BERTopic
# constructor below (no embedding_model argument is given), yet creating them
# triggers large model downloads. They are kept so the names stay defined;
# remove them if no other cell relies on them.
# Sentence-transformer model
sentence_model = SentenceTransformer("paraphrase-albert-small-v2")
# Hugging Face feature-extraction pipeline
hf_model = pipeline("feature-extraction", model="distilroberta-base")
# Flair transformer document embeddings
roberta_model = TransformerDocumentEmbeddings('roberta-base')
# Flair word embeddings pooled to document level.
# NOTE(review): the variables are named "glove" but the identifier requested
# is 'crawl' — confirm which embeddings were intended.
glove_embedding = WordEmbeddings('crawl')
document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
# Stack the two Flair embedding models
stacked_embeddings = StackedEmbeddings(embeddings=[roberta_model,
                                                   document_glove_embeddings])

# Count vectorizer (also not passed to BERTopic below)
vectorizer_model = CountVectorizer(min_df=10)

# Initiate BERTopic with the UMAP and KMeans models defined above.
# To try another embedding back-end, pass one of sentence_model, hf_model,
# roberta_model or stacked_embeddings as embedding_model=...
# NOTE(review): calculate_probabilities is kept as in the original call —
# confirm it has any effect when KMeans replaces HDBSCAN.
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True,
                       hdbscan_model=kmeans_model, n_gram_range=(1, 3))

# Run BERTopic model on the preprocessed text, passed as a plain list of
# strings (one document per DataFrame row, in row order).
topics, probabilities = topic_model.fit_transform(papers_nips['abstract_lemmatized'].tolist())

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

2024-03-02 18:13:38,976 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpkpcewn0k


100%|██████████| 1.12G/1.12G [01:06<00:00, 18.0MB/s]

2024-03-02 18:14:46,319 copying /tmp/tmpkpcewn0k to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2024-03-02 18:14:54,774 removing temp file /tmp/tmpkpcewn0k
2024-03-02 18:14:55,678 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmpwxyki6qc


100%|██████████| 37.5M/37.5M [00:03<00:00, 12.6MB/s]

2024-03-02 18:14:59,336 copying /tmp/tmpwxyki6qc to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M





2024-03-02 18:14:59,391 removing temp file /tmp/tmpwxyki6qc


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Get the list of topics: one row per topic with its id, document count,
# generated name, top terms and representative documents
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,285,0_sleep_study_insomnia_association,"[sleep, study, insomnia, association, risk, qu...","[Association Insomnia, Sleep Quality, Sleep Du..."
1,1,244,1_insomnia_sleep_study_disorder,"[insomnia, sleep, study, disorder, treatment, ...","[Erratum: Correction to: Benzodiazepine, Z-dru..."
2,2,209,2_covid19_pandemic_covid19 pandemic_health,"[covid19, pandemic, covid19 pandemic, health, ...",[impact COVID-19 pandemic mental health Taiwan...
3,3,206,3_insomnia_therapy_cognitive_therapy insomnia,"[insomnia, therapy, cognitive, therapy insomni...",[Digital cognitive behavioral therapy insomnia...
4,4,193,4_review_use_safety_cannabis,"[review, use, safety, cannabis, clinical, effi...","[phase 3, multicenter, double-blind, randomize..."
5,5,182,5_depression_disorder_sleep_depressive,"[depression, disorder, sleep, depressive, stud...",[Telehealth parent training sleep disturbance ...
6,6,179,6_cancer_breast_breast cancer_study,"[cancer, breast, breast cancer, study, patient...",[Long-term trajectory postoperative health-rel...
7,7,101,7_syndrome_study_patient_treatment,"[syndrome, study, patient, treatment, disease,...","[Safety efficacy amlitelimab, fully human nond..."
8,8,50,8_editorial___,"[editorial, , , , , , , , , ]","[, , Editorial]"




In [None]:
# Get top 10 terms for a topic (here topic 0) as (term, weight) tuples
topic_model.get_topic(0)

[('sleep', 0.05932432137249386),
 ('study', 0.024339336680122475),
 ('insomnia', 0.020547250269559475),
 ('association', 0.018602859136832413),
 ('risk', 0.017837116552267076),
 ('quality', 0.016612069176648613),
 ('sleep quality', 0.015572116647899195),
 ('among', 0.015540729919157349),
 ('adults', 0.014670008781626283),
 ('health', 0.014640239223537303)]

In [None]:
# Visualize top topic keywords.
# top_n_topics=12 is larger than the 9 clusters configured for KMeans, so this
# presumably covers every topic — lower the value to plot fewer.
topic_model.visualize_barchart(top_n_topics=12)

In [None]:
# Visualize term rank decrease.
# NOTE(review): presumably shows how term weight falls off with rank within
# each topic — confirm against the BERTopic documentation.
topic_model.visualize_term_rank()

In [None]:
# Visualize intertopic distance; the figure returned by the call is shown
# because it is the last expression in the cell
topic_model.visualize_topics()

In [None]:
# Visualize connections between topics using hierarchical clustering,
# with top_n_topics set to 10
topic_model.visualize_hierarchy(top_n_topics=10)

In [None]:
# Visualize similarity using heatmap.
# NOTE(review): presumably pairwise topic-to-topic similarity — confirm
# against the BERTopic documentation.
topic_model.visualize_heatmap()

In [None]:
# Get the topic predictions: `[:]` slices the whole topics_ sequence of the
# fitted model (a shallow copy if topics_ is a list)
topic_prediction = topic_model.topics_[:]
# Save the predictions in the dataframe.
# Assumes topics_ holds one entry per row of papers_nips, in the same order
# the documents were passed to fit_transform — TODO confirm.
papers_nips['topic_prediction'] = topic_prediction
# Take a look at the data
papers_nips.head()

Unnamed: 0,Authors,Author full names,Author(s) ID,Title,Year,Link,Abstract,abstract_without_stopwords,abstract_lemmatized,topic_prediction
0,Chae B.; Park Y.-M.; Lee S.-J.; Lee J.-S.; Kan...,"Chae, Boram (57315923900); Park, Young-Min (56...",57315923900; 56245805400; 55262324300; 5610788...,The Impact of Melatonin on Inflammatory Status...,2023.0,https://www.scopus.com/inward/record.uri?eid=2...,Objective: This study aimed to assess whether ...,Impact Melatonin Inflammatory Status Quality Life,Impact Melatonin Inflammatory Status Quality Life,6
1,Hughes J.,"Hughes, Jennifer (57210218596)",57210218596,Pharmacokinetics and Safety of Group A and B A...,2023.0,https://www.scopus.com/inward/record.uri?eid=2...,Recommendations for treatment of rifampicin-re...,Pharmacokinetics Safety Group B Anti-Tuberculo...,Pharmacokinetics Safety Group B Anti-Tuberculo...,7
2,Palmer R.F.; Dempsey T.T.; Afrin L.B.,"Palmer, Raymond F. (7402726982); Dempsey, Tani...",7402726982; 57208421590; 6602167168,Chemical Intolerance and Mast Cell Activation:...,2023.0,https://www.scopus.com/inward/record.uri?eid=2...,Background: Chemical Intolerance (CI) is chara...,Chemical Intolerance Mast Cell Activation: Sus...,Chemical Intolerance Mast Cell Activation: Sus...,7
3,Jin X.; Wang Y.; Huang C.; Luo X.; Gao X.; She...,"Jin, Xingyue (57221405916); Wang, Yuxin (58768...",57221405916; 58768327700; 57216579345; 8511702...,The association between childhood maltreatment...,2023.0,https://www.scopus.com/inward/record.uri?eid=2...,Background Childhood maltreatment is associate...,association childhood maltreatment internet ad...,association childhood maltreatment internet ad...,2
4,Park Y.-M.; Lee S.-J.; Lee J.-S.; Na K.-S.; Ka...,"Park, Young-Min (56245805400); Lee, So-Jin (55...",56245805400; 55262324300; 56107882900; 1451976...,Efficacy of Prolonged-Release Melatonin Admini...,2023.0,https://www.scopus.com/inward/record.uri?eid=2...,"Objective: Melatonin, both immediate and prolo...",Efficacy Prolonged-Release Melatonin Administr...,Efficacy Prolonged-Release Melatonin Administr...,6


In [None]:
# Free-text query to match against the fitted topics.
# NOTE(review): this sentence is about headphones while the fitted topics are
# about sleep/insomnia research, so only low similarities should be expected.
new_review = "I like the new headphone. Its sound quality is great."

# Number of closest topics to retrieve
num_of_topics = 3

# Ask the model for the topics most similar to the query text
similar_topics, similarity = topic_model.find_topics(
    new_review,
    top_n=num_of_topics,
)

# Report the matched topic ids and their similarities (rounded to 2 decimals)
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 3 similar topics are [0, 1, 8], and the similarities are [0.06 0.06 0.05]


In [None]:
# Print the top keywords for the top similar topics.
# Iterate over the topics that find_topics actually returned (capped at
# num_of_topics) instead of indexing with range(num_of_topics), which raises
# IndexError when fewer topics come back than were requested.
for similar_topic in similar_topics[:num_of_topics]:
    print(f'The top keywords for topic {similar_topic} are:')
    print(topic_model.get_topic(similar_topic))

The top keywords for topic 0 are:
[('insomnia', 0.058019533632480996), ('sleep', 0.02395214513137431), ('therapy', 0.017808914279106795), ('cognitive', 0.017087392566078875), ('treatment', 0.01692552721060791), ('therapy insomnia', 0.016757201802407355), ('chronic', 0.016270564427552016), ('review', 0.01570827454922442), ('trial', 0.015042758164222176), ('study', 0.014575450231215613)]
The top keywords for topic 1 are:
[('sleep', 0.06016616243560494), ('study', 0.021639205877867492), ('insomnia', 0.020957228194546276), ('quality', 0.01849149059583676), ('association', 0.017601383816639015), ('among', 0.01726480175239087), ('sleep quality', 0.01722882826949956), ('risk', 0.016698677481619944), ('older', 0.01580113138062572), ('adults', 0.01554238797779955)]
The top keywords for topic 8 are:
[('editorial', 8.693664334532016), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]


In [None]:
# Save the topic model to "papers_nips_topic_model" in the working directory.
# NOTE(review): no `serialization` argument is passed, so the library's default
# format is used — confirm that is the intended on-disk format.
topic_model.save("papers_nips_topic_model")
# Load the topic model back from the same path into a separate variable
my_model = BERTopic.load("papers_nips_topic_model")

