In [7]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy
import time
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [5]:
start = time.time()
# Load pre-trained DistilBART model and tokenizer
'''model_name = "sshleifer/distilbart-cnn-6-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
'''
# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Sample text
text = """
The COVID-19 pandemic has had a significant impact on global economies and public health. 
Various measures have been implemented to curb the spread of the virus, including lockdowns, 
social distancing, and vaccination campaigns. These measures have proven effective in reducing 
the transmission rate and mitigating the effects of the pandemic.
"""

# Tokenize and summarize
inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(inputs, max_length=50, min_length=5, length_penalty=10.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)
print(len(summary) / len(text), len(summary), len(text))
print("Time taken:", time.time() - start)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


the COVID-19 pandemic has had a significant impact on global economies and public health. a number of measures have been implemented to curb the spread of the virus.
0.47413793103448276 165 348
Time taken: 2.3114230632781982


In [6]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = """
The COVID-19 pandemic has had a significant impact on global economies and public health. 
Various measures have been implemented to curb the spread of the virus, including lockdowns, 
social distancing, and vaccination campaigns. These measures have proven effective in reducing 
the transmission rate and mitigating the effects of the pandemic.
"""

# Process text
doc = nlp(text)

# Extract keywords (named entities)
keywords = [ent.text for ent in doc.ents]
print("Keywords:", keywords)

Keywords: ['COVID-19']


In [10]:
# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Load NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample text
text = """
The COVID-19 pandemic has had a significant impact on global economies and public health. 
Various measures have been implemented to curb the spread of the virus, including lockdowns, 
social distancing, and vaccination campaigns. These measures have proven effective in reducing 
the transmission rate and mitigating the effects of the pandemic.
"""

# Perform NER
ner_results = nlp(text)

# Extract and print keywords
keywords = [result['word'] for result in ner_results]
print("Keywords:", keywords)

Keywords: ['CO', '##VI', '##D', '-', '19']


In [28]:
from keybert import KeyBERT
import spacy
kw_model = KeyBERT()

In [27]:
# Sample text
#text = 
"""
The COVID-19 pandemic has had a significant impact on global economies and public health. 
Various measures have been implemented to curb the spread of the virus, including lockdowns, 
social distancing, and vaccination campaigns. These measures have proven effective in reducing 
the transmission rate and mitigating the effects of the pandemic.
"""

text = """
Hello I am an old grandpa who is looking for a job. I have a lot of experience in the field of something that is really important.
"""

# Extract keywords
keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english')
print(text)
print("Keywords:", [kw[0] for kw in keywords])

# Additional simple keyword extraction based on POS tagging
for token in doc:
    if token.pos_ in ['NOUN', 'PROPN'] and token.is_stop is False:
        keywords.append(token.text)

# Remove duplicates and print
keywords = list(set(keywords))
print("Final Keywords:", keywords)



Hello I am an old grandpa who is looking for a job. I have a lot of experience in the field of something that is really important.

Keywords: ['old grandpa', 'grandpa looking', 'looking job', 'grandpa', 'job lot']
Final Keywords: ['economies', 'vaccination', ('grandpa', 0.4343), 'distancing', 'transmission', 'COVID-19', 'effects', 'impact', ('job lot', 0.3813), 'spread', 'rate', 'pandemic', 'virus', 'lockdowns', ('looking job', 0.4413), 'health', 'campaigns', ('old grandpa', 0.4931), 'measures', ('grandpa looking', 0.4846)]


In [29]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize KeyBERT model
kw_model = KeyBERT()

# Sample text
text = """
Hello I am an old grandpa who is looking for a job. I have a lot of experience in the field of something carpet manufacturing really important.
"""

# Process text with SpaCy
doc = nlp(text)

# Extract named entities with SpaCy
spacy_keywords = [ent.text for ent in doc.ents]

# Extract keywords with KeyBERT
keybert_keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
keybert_keywords = [kw[0] for kw in keybert_keywords]

# Combine and deduplicate keywords
all_keywords = list(set(spacy_keywords + keybert_keywords))
print("Final Keywords:", all_keywords)

Final Keywords: ['carpet', 'looking job', 'carpet manufacturing', 'old grandpa', 'field carpet']


In [48]:
# Initialize KeyBERT model
kw_model = KeyBERT()

# Sample text
text = """
I am relly invested in the stock market, I would like to go to Tübigen today and find something that intests me. Maybe some ice cream.
"""

text = "I would like to go somewhere maybe Tübingen. Lets say can you drive a boat in Tübingen? Maybe on the Neckar river."

text = "SUP tübingen neckar"

text = "where is townhall"

# Process text with SpaCy
doc = nlp(text)

# Extract named entities and noun chunks with SpaCy
spacy_keywords = [ent.text for ent in doc.ents]

# Additionally extract noun chunks to capture relevant phrases
noun_chunks = [chunk.text for chunk in doc.noun_chunks]

# Extract keywords with KeyBERT
keybert_keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)
keybert_keywords = [kw[0] for kw in keybert_keywords]

# Combine, deduplicate, and filter keywords
all_keywords = set(spacy_keywords + noun_chunks + keybert_keywords)

# Further filter keywords to focus on nouns and proper nouns
filtered_keywords = [word for word in all_keywords if any(token.pos_ in ['NOUN', 'PROPN'] for token in nlp(word))]

# Print the final keywords
print("Final Keywords:", filtered_keywords)

Final Keywords: ['townhall']


Final Keywords: []
