# Leverage KeyBERT, HDBSCAN and Zephyr-7B-Beta to Build a Knowledge Graph
https://towardsdatascience.com/leverage-keybert-hdbscan-and-zephyr-7b-beta-to-build-a-knowledge-graph-33d7534ee01b

In [None]:
# %pip install llama-index ipython langchain keybert transformers

In [1]:
import sys
sys.path.append("utils")
from utils.arxiv_parser import *
from utils.llm_utils import *
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from keybert.llm import TextGeneration
from keybert import KeyLLM, KeyBERT
from sentence_transformers import SentenceTransformer

In [2]:
# Directory that is handed to the arXiv data parser.
DATA_DIR = "data/"

# Local directory of the zephyr-7B-beta-GPTQ model; both the model and the
# tokenizer are loaded from it with `from_pretrained`.
llm_path = (
    "../../LLMs/text-generation-webui/models/TheBloke/zephyr-7B-beta-GPTQ/"
)

In [3]:
# Initialize the data parser
parser = ArXivDataProcessor(DATA_DIR)

# Unzip the downloaded file to extract a json file in data_path
parser.unzip_file()

# Select a topic and extract the articles on that topic.
# `topic` is the single source of truth: it drives both the selection and the
# name of the saved csv, so the two can never disagree.
topic = 'cs'
entries = parser.select_topic(topic)

# Build a pandas dataframe with specified selections
df = parser.select_articles(
    entries,  # extracted articles
    cols=['id', 'title', 'abstract'],  # features to keep
    min_length=100,  # min tokens an abstract should have
    max_length=120,  # max tokens an abstract should have
    keep_abs_length=False,  # do not keep the abs_length column
    build_corpus=False,  # do not build a corpus column
)

# Save the selected data to a csv file 'selected_{topic}.csv', uses data_path
parser.save_selected_data(df, topic)

There are 7149 with cs topic.
There are 983 articles selected.


In [4]:
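# Optional smoke test (kept commented out): a free-form question wrapped in the chat tags the model expects.
# question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"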
# prompt=f'''<|system|>
# </s>
# <|user|>
# {question}</s>
# <|assistant|>
# '''

In [5]:
# Load the causal language model from the local model directory.
llm = AutoModelForCausalLM.from_pretrained(
    llm_path,
    revision="main",  # change revision for a different branch
    trust_remote_code=False,
    device_map="auto",
)

# Load the tokenizer from the same directory, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(llm_path, use_fast=True)

CUDA extension not installed.
CUDA extension not installed.


In [6]:
# Text-generation pipeline built from the loaded model and tokenizer.
generator = pipeline(
    task='text-generation',
    model=llm,
    tokenizer=tokenizer,
    max_new_tokens=50,
    repetition_penalty=1.1,
    # Set the padding token explicitly. Without it every generation call logs
    # "Setting `pad_token_id` to `eos_token_id`" and floods the cell output;
    # the value used here is the same one the library falls back to.
    pad_token_id=tokenizer.eos_token_id,
)

In [7]:
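# Optional smoke test (kept commented out): send the commented-out `prompt` defined earlier to the pipeline.
# response = generator(prompt)
# print(textwrap.fill(response[0]['generated_text'],90))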

In [8]:
# Prompt template for keyword extraction, laid out with <|system|>, <|user|>
# and <|assistant|> tags, each turn closed by </s>.
# The system turn carries one worked example (a title, five candidate keywords
# and the expected comma-separated answer); the user turn repeats the same
# instructions with [DOCUMENT] and [CANDIDATES] placeholders.
# NOTE(review): the placeholders are presumably filled in per document by
# KeyBERT's TextGeneration wrapper — confirm against the KeyBERT docs.
prompt_keywords= """
<|system|>
I have the following document:
Semantics and Termination of Simply-Moded Logic Programs with Dynamic Scheduling
and five candidate keywords:
scheduling, logic, semantics, termination, moded

Based on the information above, extract the keywords or the keyphrases that best describe the topic of the text.
Follow the requirements below:
1. Make sure to extract only the keywords or keyphrases that appear in the text.
2. Provide five keywords or keyphrases! Do not number or label the keywords or the keyphrases!
3. Do not include anything else besides the keywords or the keyphrases! I repeat do not include any comments!

semantics, termination, simply-moded, logic programs, dynamic scheduling</s>

<|user|>
I have the following document:
[DOCUMENT]
and five candidate keywords:
[CANDIDATES]

Based on the information above, extract the keywords or the keyphrases that best describe the topic of the text.
Follow the requirements below:
1. Make sure to extract only the keywords or keyphrases that appear in the text.
2. Provide five keywords or keyphrases! Do not number or label the keywords or the keyphrases!
3. Do not include anything else besides the keywords or the keyphrases! I repeat do not include any comments!</s>

<|assistant|>
"""

In [9]:
# Wrap the text-generation pipeline together with the keyword prompt
llm_tg = TextGeneration(generator, prompt=prompt_keywords)

# Create the KeyBERT extractor from an embedding model name and the wrapped LLM
kw_model = KeyBERT(model="all-mpnet-base-v2", llm=llm_tg)

  return self.fget.__get__(instance, owner)()


In [11]:
# Only the article titles are analysed in this step
titles_list = df["title"].tolist()

# Run keyword extraction over all titles and keep the results
titles_keys = kw_model.extract_keywords(
    titles_list,
    threshold=0.5,
)

# Store the extracted keywords in a new column and preview the frame
df["titles_keys"] = titles_keys
df.head()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o