<a href="https://colab.research.google.com/github/elliemci/chatbots/blob/main/ner_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chatbot that extracts entities from user message with BERT transformer

In [1]:
!pip install transformers pyTelegramBotAPI

Collecting pyTelegramBotAPI
  Downloading pyTelegramBotAPI-4.14.0.tar.gz (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.1/243.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyTelegramBotAPI
  Building wheel for pyTelegramBotAPI (setup.py) ... [?25l[?25hdone
  Created wheel for pyTelegramBotAPI: filename=pyTelegramBotAPI-4.14.0-py3-none-any.whl size=215252 sha256=042a51f47031950fae4ba081ea55b29fd9d97646341b6638c8882715c13e8556
  Stored in directory: /root/.cache/pip/wheels/25/51/2d/24b40a366c85c37928d5aa36ddf257e5a79fad25e1ecd11b2c
Successfully built pyTelegramBotAPI
Installing collected packages: pyTelegramBotAPI
Successfully installed pyTelegramBotAPI-4.14.0


In [2]:
import telebot
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

In [8]:
import os
from getpass import getpass

os.environ['TELEGRAM_BOT_TOKEN'] = getpass('Enter your bot token: ')
TOKEN = os.getenv('TELEGRAM_BOT_TOKEN')

# create the chatbot using pyTelegramBotAPI passing the token
bot = telebot.TeleBot(TOKEN)

Enter your bot token: ··········


In [4]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="SIMPLE")

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# test printing out recognized entities by group
text = "There was a boy named Elliot who loved to run, and boy he could run! \
        He ran the tracks of Issaquah High School, the nearby Tiger Mountain \
        and Mount Raineir tails and the parks of Bellevue, Snoqualmmie, Redmond."

entities = nlp(text)
print(nlp(text))
print()
for entity in entities:
    print(f"Found a {entity['entity_group']} entity, called {entity['word']}")

[{'entity_group': 'PER', 'score': 0.9989543, 'word': 'Elliot', 'start': 22, 'end': 28}, {'entity_group': 'ORG', 'score': 0.5249421, 'word': 'Is', 'start': 98, 'end': 100}, {'entity_group': 'LOC', 'score': 0.91287446, 'word': '##sa', 'start': 100, 'end': 102}, {'entity_group': 'ORG', 'score': 0.7692467, 'word': '##quah High School', 'start': 102, 'end': 118}, {'entity_group': 'LOC', 'score': 0.6807327, 'word': 'Tiger Mountain', 'start': 131, 'end': 145}, {'entity_group': 'LOC', 'score': 0.9535476, 'word': 'Mount Raineir', 'start': 158, 'end': 171}, {'entity_group': 'LOC', 'score': 0.93935955, 'word': 'Bellevue', 'start': 195, 'end': 203}, {'entity_group': 'LOC', 'score': 0.82404053, 'word': 'Snoqualmmie', 'start': 205, 'end': 216}, {'entity_group': 'LOC', 'score': 0.87960684, 'word': 'Redmond', 'start': 218, 'end': 225}]

Found a PER entity, called Elliot
Found a ORG entity, called Is
Found a LOC entity, called ##sa
Found a ORG entity, called ##quah High School
Found a LOC entity, calle

In [6]:
for entity_group in set([entity["entity_group"] for entity in entities]):
    print(f"Found the following {entity_group} entities:")
    for entity in entities:
        if entity["entity_group"] == entity_group:
            print(f" - {entity['word']}")

Found the following LOC entities:
 - ##sa
 - Tiger Mountain
 - Mount Raineir
 - Bellevue
 - Snoqualmmie
 - Redmond
Found the following ORG entities:
 - Is
 - ##quah High School
Found the following PER entities:
 - Elliot


In [9]:
# convert text into lowercase, break into words and check is a string of words containe "yes or "no"
def lowercase(text):
    return text.lower()

def is_yes(text):
    words = lowercase(text).split()
    return any(word in words for word in ["yes", "please", "yep", "yeah", "y", "ya", "sure", "ok", "yup", "yep", "maybe", "right"])

def is_no(text):
    words = lowercase(text).split()
    return any(word in words for word in ["no", "nope", "n", "nah", "not"])


# initalize bot state
state = 0

@bot.message_handler(func=lambda message: True)
def message(message):

    global state
    chat_id = message.chat.id

    if state == 0:
      bot.send_message(chat_id, "Hi there! Do you have any entities to be recognised and extracted?")
      state = 1

    elif state == 1:
        if is_yes(message.text):
            bot.send_message(chat_id, "Please send me the text you want to analyse.")
            state = 2
        elif is_no(message.text):
            bot.send_message(chat_id, "Ok, bye!")
            state = 0
        else:
            bot.send_message(chat_id, "I didn't get that. Please answer with yes or no.")
            state = 1

    elif state == 2:
      entities = nlp(message.text)
      for entity_group in set([entity["entity_group"] for entity in entities]):
        bot.send_message(chat_id, f"Found the following {entity_group} entities:")
        for entity in entities:
          if entity["entity_group"] == entity_group:
            bot.send_message(chat_id, f" - {entity['word']}")
      state = 3

    elif state == 3:
      bot.send_message(chat_id, "Do you have more messages for me to recognize and extract name entites from?")
      state = 1

bot.polling()