<a href="https://colab.research.google.com/github/chianle67/DL-based-HPE-Topic-Modeling-Bibliometric-Analysis/blob/main/HPE_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Topic Modeling using "Llama-2-13b-chat-hf" to label clusters**

##Install libraries and packages

In [None]:
!pip install bitsandbytes sentence_transformers bertopic torch

In [None]:
!pip install -U git+https://github.com/huggingface/accelerate.git

In [None]:
!pip install keybert

##Load libraries and packages for using LLM and Topic Modeling

In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from torch import cuda, bfloat16
from huggingface_hub import login
import torch
import transformers

##Data preprocessing

In [11]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

papers = pd.read_csv('/content/main_dataset_678_abstracts.csv')
db = papers['Abstract']
title_data = papers['Title']
db = pd.DataFrame(db)
db = db.dropna()
print(db.head())
print(db.shape)

db['Abstract'] = db['Abstract'].map(lambda x: x.lower())
db_to_list = db['Abstract'].values.tolist()

truncated_db_to_list = []
for docs in db_to_list:
    truncated_docs = docs.split(" ?")[0]
    truncated_db_to_list.append(truncated_docs)

remove_punc_db_to_list = []
for docs in truncated_db_to_list:
    cleaned_string = ''.join(character for character in docs if character not in string.punctuation)
    remove_punc_db_to_list.append(cleaned_string)

non_empty_punc_db_to_list = []
for docs in remove_punc_db_to_list:
    if docs:
        non_empty_punc_db_to_list.append(docs)

nltk.download('stopwords')
nltk.download('punkt')

input = non_empty_punc_db_to_list
stop_words = set(stopwords.words('english'))
print(stop_words)
stop_words.update(['propose', 'proposes', 'proposed', 'base', 'bases', 'based', 'include', 'contain', 'proceed', 'proceeding', 'proceedings', 'use', 'uses', 'using', 'datum', 'ieee', 'papers', 'stateoftheart', 'use', 'propose', 'approach', 'framework', 'frameworks', 'method', 'methods'])
print(stop_words)

output = []
for docs in input:
    word_tokenized = word_tokenize(docs)
    filtered_docs = []
    for w  in word_tokenized:
        if w not in stop_words:
            filtered_docs.append(w)
            reconstructed_docs = ' '.join(filtered_docs)
    output.append(reconstructed_docs)

docs = output
print(docs)

# Using lemmatization if needed
'''
import gensim
import gensim.corpora as corpora
import spacy

from pprint import pprint
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

def tokenizer(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

docs_tokenized = list(tokenizer(docs))

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
def lemmatizer(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

docs_lemmatized = lemmatizer(docs_tokenized,
                             allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
'''

                                            Abstract
0  Traditional human pose estimation methods typi...
1  Seniors who live alone at home are at risk of ...
2  Human Pose Estimation (HPE) to assess human mo...
3  Video-based 3D human pose estimation is an imp...
4  Human pose estimation is an important Computer...
(678, 1)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'now', "hasn't", "isn't", 'my', 'o', 'aren', 'but', 'of', 'he', 'she', 'his', 'itself', 'before', 'up', 'further', "aren't", 'isn', 's', "you're", 'or', "wouldn't", 'your', 'that', 'with', "needn't", 'hers', 'shan', 'me', 'an', 'yourself', "that'll", 'both', 'all', 'this', 'during', 'them', 'it', 'same', 'so', 'if', 'd', "don't", 'herself', 'they', 'be', 'its', 'theirs', 'again', "hadn't", 'mightn', 'too', 'can', 'because', 'himself', "it's", 'most', 'will', 'a', 'yours', 'has', 'only', "weren't", "you'd", "should've", 'once', 'into', 've', "wasn't", 'shouldn', 'other', 'ma', 'needn', 'to', 'off', "you'll", 'and', 'down', "didn't", 'being', 'such', 'whom', 'some', 'those', 'not', 'very', 'don', 'there', 'against', 'weren', 'wasn', 'do', 'wouldn', 'no', "mustn't", 'll', 'haven', 'hadn', 'between', 'her', 'doing', 'here', 'in', 'where', 'why', 'own', 'by', 'what', "she's", 'been', 'over', "shan't", "won't", 'then', 'ourselves', 'couldn', 'at', 'myself', "haven't", 'their', 'out', 'does'

'\nimport gensim\nimport gensim.corpora as corpora\nimport spacy\n\nfrom pprint import pprint\nfrom gensim.utils import simple_preprocess\nfrom gensim.models import CoherenceModel\n\ndef tokenizer(sentences):\n    for sentence in sentences:\n        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n\ndocs_tokenized = list(tokenizer(docs))\n\nnlp = spacy.load("en_core_web_sm", disable=[\'parser\', \'ner\'])\ndef lemmatizer(texts, allowed_postags=[\'NOUN\', \'ADJ\', \'VERB\', \'ADV\']):\n    """https://spacy.io/api/annotation"""\n    texts_out = []\n    for sent in texts:\n        doc = nlp(" ".join(sent))\n        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n    return texts_out\n\ndocs_lemmatized = lemmatizer(docs_tokenized,\n                             allowed_postags=[\'NOUN\', \'ADJ\', \'VERB\', \'ADV\'])\n'

##Model Generation

In [6]:
login("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")
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
cuda:0


In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

base_model = 'meta-llama/Llama-2-13b-chat-hf'
model_quantized = AutoModelForCausalLM.from_pretrained(
    base_model,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)

model_quantized.eval()

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

In [8]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
generator = pipeline(
    model=model_quantized, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=512,
    repetition_penalty=1.5
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [9]:
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

prompt = system_prompt + example_prompt + main_prompt

In [13]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from bertopic import BERTopic

embedding_model = SentenceTransformer("thenlper/gte-large")
embeddings = embedding_model.encode(docs, show_progress_bar=True)
# reduced_embeddings = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

representation_model = {
    "KeyBERT": KeyBERTInspired(top_n_words=10),
    "MMR": MaximalMarginalRelevance(diversity=0.1,top_n_words=10),
    "Llama2": TextGeneration(model=generator, prompt=prompt)
}

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [14]:
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  top_n_words=10,
  calculate_probabilities=True,
  min_topic_size=2,
  verbose=True
)

topics, probs = topic_model.fit_transform(docs, embeddings)

2024-02-24 13:05:31,844 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-24 13:05:40,227 - BERTopic - Dimensionality - Completed ✓
2024-02-24 13:05:40,229 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-24 13:05:40,300 - BERTopic - Cluster - Completed ✓
2024-02-24 13:05:40,311 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 6/6 [01:35<00:00, 15.94s/it]
2024-02-24 13:07:20,266 - BERTopic - Representation - Completed ✓


##Visualisation

In [15]:
topic_info = topic_model.get_topic_info()
topic_info.to_csv("/content/topic_info.csv")
topic_info

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Llama2,Representative_Docs
0,-1,11,-1_yoga_system_pose_events,"[yoga, system, pose, events, feedback, emotion...","[yoga, postures, poses, asanas, pose, posture,...","[yoga, system, pose, events, feedback, emotion...","[Yoga Pose Detection System, , , , , , , , , ]",[yoga change way see things transforms person ...
1,0,588,0_pose_human_estimation_3d,"[pose, human, estimation, 3d, deep, model, net...","[pose, poses, convolutional, human, 3d, datase...","[pose, human, estimation, 3d, deep, model, net...","[Human Pose Estimation using Deep Learning, , ...",[rise deep learning technology broadly promote...
2,1,36,1_learning_deep_detection_data,"[learning, deep, detection, data, analysis, to...","[convolutional, ai, segmentation, technologies...","[learning, deep, detection, data, analysis, to...","[Artificial Intelligence Applications, , , , ,...",[180 special focus conference future technolog...
3,2,18,2_signals_rf_human_radar,"[signals, rf, human, radar, pose, 3d, estimati...","[rf, rfbased, rfpose, multiperson, radar, rfid...","[signals, rf, human, radar, pose, 3d, estimati...","[""RF Human Pose Estimation Systems"", , , , , ,...",[advanced human sensing technologies radio fre...
4,3,13,3_fall_falls_elderly_detection,"[fall, falls, elderly, detection, falling, hum...","[falling, falls, elderly, human, fall, skeleto...","[fall, falls, elderly, detection, falling, hum...",[Fall Detection and Recognition in Elderly Per...,[increase age elderly often fall seriously thr...
5,4,12,4_driver_pose_estimation_driving,"[driver, pose, estimation, driving, vehicle, d...","[driving, driver, distracted, distraction, det...","[driver, pose, estimation, driving, vehicle, d...",[Autonomous Vehicle Monitoring using Pose Esti...,[number traffic accidents increased steadily r...


In [16]:
print(len(topics))
print(probs.shape)

678
(678, 5)


In [17]:
docs_topic = pd.DataFrame({"Title": title_data, "Abstract": docs, "Topic": topics, "Probability": probs.tolist()})
docs_topic.to_csv("/content/docs_topic.csv")
docs_topic

Unnamed: 0,Title,Abstract,Topic,Probability
0,Lightweight 2D Human Pose Estimation Based on ...,traditional human pose estimation typically re...,0,"[1.0, 4.23820377351879e-309, 5.72102923067147e..."
1,Human Pose Estimation Using MediaPipe Pose and...,seniors live alone home risk falling injuring ...,3,"[1.4599027479160164e-308, 3.91561320826836e-30..."
2,Design Space Exploration on Efficient and Accu...,human pose estimation hpe assess human motion ...,0,"[1.0, 4.644404132682087e-309, 5.20329303599486..."
3,JoyPose: Jointly learning evolutionary data au...,videobased 3d human pose estimation important ...,0,"[0.7980529524108124, 0.04012789285407543, 0.03..."
4,DUA: A Domain-Unified Approach for Cross-Datas...,human pose estimation important computer visio...,0,"[0.5533676013138721, 0.0630834821922568, 0.065..."
...,...,...,...,...
673,Deep Fully-Connected Part-Based Models for Hum...,2d multilevel appearance representation human ...,0,"[0.8424686549701329, 0.025045081005808857, 0.0..."
674,Deep Learning for Computer Vision: A Brief Review,last years deep learning shown outperform prev...,0,"[0.3581820455794816, 0.1159108561599556, 0.137..."
675,Revisiting Unreasonable Effectiveness of Data ...,success deep learning vision attributed models...,0,"[0.47837174262304527, 0.09456999990074651, 0.1..."
676,3D human pose estimation via deep learning fro...,deep convolutional neural network 3dhuman pose...,0,"[1.0, 3.78634087847647e-309, 3.618224716217967..."


In [18]:
llama2_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Llama2"].values()]
topic_model.set_topic_labels(llama2_labels)
topic_model.visualize_documents(title_data, embeddings=embeddings, reduced_embeddings=None,
                                hide_annotations=False, hide_document_hover=False, custom_labels=True,
                                title = "",topics=topics,width=1920,height=1080)

In [19]:
topic_model.visualize_topics(title="", custom_labels=True)

In [20]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics,
                                title = "",
                                custom_labels = True)

100%|██████████| 4/4 [00:00<00:00, 155.49it/s]


In [21]:
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──fall_falls_elderly_detection_falling ── Topic: 3
└─pose_human_estimation_3d_learning
     ├─■──driver_pose_estimation_driving_vehicle ── Topic: 4
     └─pose_human_estimation_3d_learning
          ├─pose_human_estimation_3d_learning
          │    ├─■──learning_deep_detection_data_analysis ── Topic: 1
          │    └─■──pose_human_estimation_3d_deep ── Topic: 0
          └─■──signals_rf_human_radar_pose ── Topic: 2



In [22]:
topic_model.visualize_barchart(n_words=10
                               , title = "",
                               custom_labels = False)

In [23]:
timestamps = papers['Year']
topics_over_time = topic_model.topics_over_time(title_data, timestamps)
topic_model.visualize_topics_over_time(topics_over_time,title="",custom_labels = True,topics=topics,normalize_frequency=True)

11it [00:00, 58.97it/s]


In [26]:
classes = papers['Document Type']
topics_per_class = topic_model.topics_per_class(docs, classes)
topic_model.visualize_topics_per_class(topics_per_class,custom_labels = True)

8it [00:00, 46.04it/s]


In [29]:
classes = papers['Source title']
topics_per_class = topic_model.topics_per_class(docs, classes)
topic_model.visualize_topics_per_class(topics_per_class,custom_labels = True)

323it [00:03, 104.47it/s]


In [38]:
import pandas as pd
import openpyxl
import string
from keybert import KeyBERT

papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['-1']) # Run from -1 to 4
data_minus_1 = papers['-1']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('yogapose', 0.5431), ('yoga', 0.5368), ('postures', 0.4337), ('posture', 0.432), ('poses', 0.4145)]
[('automation yoga', 0.6373), ('yoga classifier', 0.6268), ('yoga tracker', 0.6221), ('yoga poses', 0.6132), ('persons yoga', 0.612)]
[('yoga pose dataset', 0.6684), ('yoga pose recognition', 0.6618), ('tracking correcting yoga', 0.6611), ('pose dataset yoga', 0.6562), ('detect correct yoga', 0.6507)]
[('yogapose', 0.5431), ('yoga', 0.5368), ('postures', 0.4337), ('poses', 0.4145), ('svm', 0.3976)]
[('automation yoga', 0.6373), ('yoga classifier', 0.6268), ('yoga tracker', 0.6221), ('yoga poses', 0.6132), ('dataset yoga', 0.6093)]
[('yoga pose dataset', 0.6684), ('yoga pose recognition', 0.6618), ('tracking correcting yoga', 0.6611), ('detect correct yoga', 0.6507), ('specifically yoga human', 0.6362)]


In [33]:
papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['0']) # Run from -1 to 4
data_minus_1 = papers['0']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
#print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('poseconfiguration', 0.4429), ('poseresnet', 0.4177), ('postures', 0.4125), ('posetracking', 0.4002), ('humancomputer', 0.3971)]
[('pose manual', 0.5642), ('posture dataset', 0.5572), ('human poseaware', 0.5545), ('pose information', 0.5514), ('pose trackers', 0.5403)]
[('pose datasets human36', 0.6064), ('human poseaware features', 0.6018), ('pose information provides', 0.6015), ('human pose estimationï¼œtwodimensional', 0.5868), ('pose humancentered automation', 0.5838)]
[('poseconfiguration', 0.4429), ('poseresnet', 0.4177), ('postures', 0.4125), ('humancomputer', 0.3971), ('accuracyresource', 0.3849)]
[('pose manual', 0.5642), ('posture dataset', 0.5572), ('human poseaware', 0.5545), ('pose trackers', 0.5403), ('models human36m', 0.5273)]


In [34]:
papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['1']) # Run from -1 to 4
data_minus_1 = papers['1']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('features', 0.3571), ('datasets', 0.3308), ('humancomputer', 0.3282), ('3d', 0.3198), ('ai', 0.3154)]
[('multimodal datasets', 0.3812), ('vision topics', 0.3747), ('machines multimodal', 0.373), ('architecture video', 0.3716), ('development multimodality', 0.3697)]
[('classification incorporates 3d', 0.4153), ('multimodal datasets equipmentlimited', 0.4144), ('development intelligent vr', 0.3964), ('computer vision topics', 0.3946), ('skills immersive vr', 0.3915)]
[('features', 0.3571), ('datasets', 0.3308), ('humancomputer', 0.3282), ('3d', 0.3198), ('ai', 0.3154)]
[('multimodal datasets', 0.3812), ('vision topics', 0.3747), ('architecture video', 0.3716), ('classification drone', 0.3667), ('conference humancomputer', 0.3637)]
[('classification incorporates 3d', 0.4153), ('multimodal datasets equipmentlimited', 0.4144), ('development intelligent vr', 0.3964), ('vision topics challenges', 0.3889), ('motion videos research', 0.3829)]


In [35]:
papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['2']) # Run from -1 to 4
data_minus_1 = papers['2']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('postures', 0.4358), ('pose', 0.3691), ('kinect', 0.3466), ('rfmvp', 0.3371), ('rfpose', 0.3269)]
[('rf pose', 0.5199), ('performing postures', 0.5039), ('radarbased human', 0.4948), ('pose machine', 0.4794), ('radarbased pose', 0.4782)]
[('radarbased human pose', 0.5584), ('signals human poses', 0.541), ('rfbased human pose', 0.5379), ('process human pose', 0.5335), ('pose machine rfmvp', 0.5331)]
[('postures', 0.4358), ('pose', 0.3691), ('kinect', 0.3466), ('rfmvp', 0.3371), ('rfidpose', 0.3265)]
[('rf pose', 0.5199), ('performing postures', 0.5039), ('radarbased human', 0.4948), ('pose machine', 0.4794), ('radarbased pose', 0.4782)]
[('radarbased human pose', 0.5584), ('signals human poses', 0.541), ('rfbased human pose', 0.5379), ('process human pose', 0.5335), ('pose machine rfmvp', 0.5331)]


In [36]:
papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['3']) # Run from -1 to 4
data_minus_1 = papers['3']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('kinematic', 0.3732), ('footage', 0.3488), ('biomechanics', 0.3481), ('bicycle', 0.3398), ('videos', 0.3387)]
[('analysis cyclist', 0.4347), ('fall kinematic', 0.4342), ('estimation cyclist', 0.4307), ('pose knowledge', 0.427), ('physical activities', 0.4177)]
[('video analysis cyclist', 0.529), ('injury estimation cyclist', 0.4982), ('cyclist fall kinematics', 0.4722), ('cyclist fall kinematic', 0.4599), ('practicality videos exercise', 0.4489)]
[('kinematic', 0.3732), ('footage', 0.3488), ('biomechanics', 0.3481), ('cyclist', 0.3368), ('datasets', 0.3334)]
[('analysis cyclist', 0.4347), ('fall kinematic', 0.4342), ('pose knowledge', 0.427), ('precision pedestrian', 0.4119), ('practicality videos', 0.4085)]
[('video analysis cyclist', 0.529), ('injury estimation cyclist', 0.4982), ('cyclist fall kinematics', 0.4722), ('human motion realworld', 0.4387), ('guidance pose knowledge', 0.4371)]


In [37]:
papers = pd.read_excel("/content/topic_modeling_summary.xlsx",sheet_name=['4']) # Run from -1 to 4
data_minus_1 = papers['4']['Abstract']

list_abstract = []
for row in data_minus_1:
    list_abstract.append(row)
doc = " ".join(list_abstract)

kw_model = KeyBERT()
#keywords_highlighted = kw_model.extract_keywords(doc, highlight=True)
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 2)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 3)))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(1,1),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(2,2),use_mmr=True, diversity=0.1))
print(kw_model.extract_keywords(doc,keyphrase_ngram_range=(3,3),use_mmr=True, diversity=0.1))

[('pose', 0.357), ('dataset', 0.3463), ('datasets', 0.3407), ('poses', 0.3353), ('driver', 0.3326)]
[('driver pose', 0.5094), ('driving tasks', 0.4695), ('learning driver', 0.4653), ('persons dataset', 0.4638), ('vehicle pose', 0.4614)]
[('driver activity deep', 0.5785), ('driver pose information', 0.5574), ('driver distraction classification', 0.5413), ('classification driver pose', 0.5302), ('driver activity recognition', 0.5133)]
[('pose', 0.357), ('dataset', 0.3463), ('driver', 0.3326), ('recognition', 0.3238), ('driving', 0.3233)]
[('driver pose', 0.5094), ('driving tasks', 0.4695), ('learning driver', 0.4653), ('persons dataset', 0.4638), ('keypoints human', 0.4468)]
[('driver activity deep', 0.5785), ('driver pose information', 0.5574), ('driver distraction classification', 0.5413), ('classification driver pose', 0.5302), ('datasets providing poses', 0.493)]
