<a href="https://colab.research.google.com/github/codeBySejal/ml-colab-notebooks/blob/main/Medical_Entity_Reognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

# Fetch the "tner/bc5cdr" dataset from the Hugging Face Hub. The result is a
# DatasetDict whose splits are inspected in the next cell.
medical_dataset = load_dataset("tner/bc5cdr")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the available splits, their columns and their row counts.
medical_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})

In [None]:
# Peek at the first training example: a list of word tokens and a parallel
# list of integer tag ids (one tag per word).
medical_dataset["train"][0]

{'tokens': ['Naloxone',
  'reverses',
  'the',
  'antihypertensive',
  'effect',
  'of',
  'clonidine',
  '.'],
 'tags': [1, 0, 0, 0, 0, 0, 1, 0]}

In [None]:
# Inspect the column schema. The tags are plain int32 values, so the dataset
# object itself carries no tag names; the id <-> name mapping is therefore
# written out by hand in the next cell.
ner_feature = medical_dataset["train"].features
ner_feature

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}

In [None]:
# Tag names listed in id order: the position of a name in this list is the
# integer id used in the dataset's `tags` column.
label2id_dict = {
    tag_name: tag_id
    for tag_id, tag_name in enumerate(
        ["O", "B-Chemical", "B-Disease", "I-Disease", "I-Chemical"]
    )
}

In [None]:
# Reverse lookup (tag id -> tag name), built by pairing every id with its name.
id2label_dict = dict(zip(label2id_dict.values(), label2id_dict.keys()))
id2label_dict

{0: 'O', 1: 'B-Chemical', 2: 'B-Disease', 3: 'I-Disease', 4: 'I-Chemical'}

In [None]:
# Print the first training sentence with each tag name directly underneath its
# word. Every column is padded to the wider of (word, tag name) plus one space.
words = medical_dataset["train"][0]["tokens"]
labels = medical_dataset["train"][0]["tags"]
word_cells = []
tag_cells = []
for word, label in zip(words, labels):
    full_label = id2label_dict[label]
    column_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(column_width))
    tag_cells.append(full_label.ljust(column_width))
line1 = "".join(word_cells)
line2 = "".join(tag_cells)
print(line1)
print(line2)

Naloxone   reverses the antihypertensive effect of clonidine  . 
B-Chemical O        O   O                O      O  B-Chemical O 


In [None]:
from transformers import AutoTokenizer

# Base checkpoint to fine-tune. The same name is passed to the model loader
# further down, so the tokenizer and the model come from the same checkpoint.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# The label alignment below calls `word_ids()` on the tokenizer output, so
# confirm that a fast tokenizer was loaded.
tokenizer.is_fast

True

In [None]:
# Tokenize the already word-split first example and show the resulting
# sub-word tokens (including the [CLS] / [SEP] special tokens).
inputs = tokenizer(medical_dataset["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Na',
 '##lo',
 '##xon',
 '##e',
 'reverse',
 '##s',
 'the',
 'anti',
 '##hy',
 '##pert',
 '##ens',
 '##ive',
 'effect',
 'of',
 'c',
 '##lon',
 '##id',
 '##ine',
 '.',
 '[SEP]']

In [None]:
# For every token, the index of the word it came from (None for special tokens).
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 7, None]

In [None]:
def align_labels_with_tokens(labels, word_ids, begin_to_inside=None):
  """Expand word-level tag ids to one tag id per tokenizer token.

  Args:
    labels: Tag ids, one per word of the sentence.
    word_ids: For each token, the index of the word it belongs to, or None
      for special tokens (as returned by the tokenizer's `word_ids()`).
    begin_to_inside: Optional mapping from a B-XXX tag id to the matching
      I-XXX tag id. Defaults to the ids used in this notebook's
      `label2id_dict` (B-Chemical=1 -> I-Chemical=4, B-Disease=2 ->
      I-Disease=3).

  Returns:
    A list with one entry per token: -100 for special tokens, the word's own
    tag for the first token of a word, and for the remaining tokens of that
    word the same tag with B-XXX replaced by I-XXX.
  """
  if begin_to_inside is None:
    # The tag ids here are NOT laid out as (odd = B-XXX, B-XXX + 1 = I-XXX),
    # so the B -> I conversion has to be an explicit lookup. An arithmetic
    # "odd id + 1" rule would turn B-Chemical (1) into B-Disease (2) and
    # I-Disease (3) into I-Chemical (4).
    begin_to_inside = {1: 4, 2: 3}

  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id is None:
      # Special token: -100 is the label value used for positions to ignore.
      new_labels.append(-100)
    elif word_id != current_word:
      # First token of a new word keeps the word's tag unchanged.
      new_labels.append(labels[word_id])
    else:
      # Continuation token of the same word: B-XXX becomes I-XXX, every
      # other tag (O, I-XXX) is kept as is.
      label = labels[word_id]
      new_labels.append(begin_to_inside.get(label, label))
    current_word = word_id
  return new_labels

In [None]:
# Show, one per line: the word-level tags, the token -> word mapping, and the
# resulting token-level tags for the first training example.
labels = medical_dataset["train"][0]["tags"]
word_ids = inputs.word_ids()
print(labels, word_ids, align_labels_with_tokens(labels, word_ids), sep="\n")

[1, 0, 0, 0, 0, 0, 1, 0]
[None, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 7, None]
[-100, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of word-split sentences and attach token-level labels.

  `examples` is a batch with a "tokens" entry (list of word lists) and a
  "tags" entry (list of tag-id lists). The tokenizer output is returned with
  an extra "labels" entry holding, for every sentence, the tags aligned to
  its tokens by `align_labels_with_tokens`.
  """
  encoded = tokenizer(
      examples["tokens"], is_split_into_words=True, truncation=True
  )
  # word_ids(i) gives the token -> word mapping of the i-th sentence in the batch.
  encoded["labels"] = [
      align_labels_with_tokens(word_tags, encoded.word_ids(batch_idx))
      for batch_idx, word_tags in enumerate(examples["tags"])
  ]
  return encoded

In [None]:
# Tokenize every split in batches and attach the aligned labels. The original
# columns of the training split ("tokens" and "tags") are removed, leaving only
# what `tokenize_and_align_labels` returns.
tokenized_datasets = medical_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=medical_dataset["train"].column_names,
)

Map:   0%|          | 0/5865 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that batches examples of different lengths and returns TensorFlow
# tensors; the sample batch in the next cell shows labels padded with -100.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors= "tf")

In [None]:
# Collate the first two training examples to check the padding of the labels.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

<tf.Tensor: shape=(2, 56), dtype=int64, numpy=
array([[-100,    1,    2,    2,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    1,    2,    2,    2,    0, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100],
       [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    2,
           2,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    1,    2,    2,    2,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    1,    2,    2,    0,    0,    0,
        -100]])>

In [None]:
# Print the un-padded labels of the same two examples for comparison with the
# collated batch above.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, -100]


In [None]:
def _as_tf_dataset(split_name, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset.

    Both splits use the same columns, batch size (16) and collator; only the
    split name and whether to shuffle differ.
    """
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )


# Training data is shuffled; validation data keeps its original order.
tf_train_dataset = _as_tf_dataset("train", shuffle=True)
tf_eval_dataset = _as_tf_dataset("validation", shuffle=False)

In [None]:
from transformers import TFAutoModelForTokenClassification

# Load the base checkpoint with a token-classification head. The two label
# mappings are handed to the model so that its configuration knows the tag
# names as well as the tag ids.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label_dict,
    label2id=label2id_dict,
)


All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity check: should equal the number of entries in `label2id_dict` (5).
model.config.num_labels

5

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook (prompts for an access
# token). Presumably needed so the training callback below can push the model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): this is a second, interactive login on top of `notebook_login()`
# in the previous cell; one of the two is probably enough — confirm and drop one.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16.
# Comment this line out if you're using a GPU that will not benefit from this.
# NOTE(review): the model was already instantiated in an earlier cell; Keras
# layers normally pick up the dtype policy when they are constructed, so
# confirm this call still has an effect at this point.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Total number of optimizer steps = batches per epoch * number of epochs.
# `tf_train_dataset` is the batched tf.data.Dataset (not the original Hugging
# Face Dataset), so its len() already counts batches rather than samples.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

# Optimizer plus learning-rate schedule: initial rate 2e-5, no warm-up steps,
# scheduled over `num_train_steps`, with a weight-decay rate of 0.01.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed explicitly here.
model.compile(optimizer=optimizer)

In [None]:
# Set the git identity used for commits made from this session.
# NOTE(review): a personal e-mail address and user name are hard-coded here and
# get published with the notebook; consider reading them from the environment.
!git config --global user.email "sejusejal024@gmail.com"
!git config --global user.name "sejalsaka"

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback that uploads the model and tokenizer to the Hugging Face Hub, using
# the local directory "medical-bert-finetuned-ner" as its working copy.
callback = PushToHubCallback(output_dir="medical-bert-finetuned-ner", tokenizer=tokenizer)

# Fine-tune for `num_epochs` epochs, evaluating on the validation split.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/medical-bert-finetuned-ner is already a clone of https://huggingface.co/sejalsaka/medical-bert-finetuned-ner. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7deffe3a75b0>

In [None]:
from transformers import pipeline

# Hub repository that holds the fine-tuned model — replace this with your own
# checkpoint. It gets its own variable so that `model_checkpoint` keeps
# pointing at the base model: rebinding that name here would make the earlier
# tokenizer/model cells load the fine-tuned weights when they are re-run.
finetuned_checkpoint = "sejalsaka/medical-bert-finetuned-ner"
# aggregation_strategy="first" makes the pipeline return whole entities
# (entity_group / word / start / end) instead of one prediction per token.
token_classifier = pipeline(
    "token-classification", model=finetuned_checkpoint, aggregation_strategy="first"
)
token_classifier("Aspirin is used to treat cardiovascular diseases.")

Some layers from the model checkpoint at sejalsaka/medical-bert-finetuned-ner were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at sejalsaka/medical-bert-finetuned-ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
Device set to use 0


[{'entity_group': 'Chemical',
  'score': 0.995,
  'word': 'Aspirin',
  'start': 0,
  'end': 7},
 {'entity_group': 'Disease',
  'score': 0.9824,
  'word': 'cardiovascular diseases',
  'start': 25,
  'end': 48}]

In [None]:
# Longer sample paragraph mixing drug names, diseases and unrelated entities
# (a person, a hospital, an organisation). The string starts with a newline,
# which is why the character offsets in the predictions below begin at 1.
text= '''
Paracetamol is commonly used to treat fever and mild to moderate pain. It is often prescribed alongside Ibuprofen for better results. Recently, Dr. Smith, a renowned physician at Johns Hopkins Hospital, conducted research on the effects of Paracetamol on cardiovascular diseases. The study, funded by the World Health Organization (WHO), highlighted the importance of dosage control in preventing liver damage.
'''

In [None]:
# Run the fine-tuned pipeline on the sample paragraph.
token_classifier(text)

[{'entity_group': 'Chemical',
  'score': 0.995,
  'word': 'Paracetamol',
  'start': 1,
  'end': 12},
 {'entity_group': 'Disease',
  'score': 0.945,
  'word': 'fever',
  'start': 39,
  'end': 44},
 {'entity_group': 'Disease',
  'score': 0.8345,
  'word': 'pain',
  'start': 66,
  'end': 70},
 {'entity_group': 'Chemical',
  'score': 0.995,
  'word': 'Ibuprofen',
  'start': 105,
  'end': 114},
 {'entity_group': 'Chemical',
  'score': 0.997,
  'word': 'Paracetamol',
  'start': 241,
  'end': 252},
 {'entity_group': 'Disease',
  'score': 0.9766,
  'word': 'cardiovascular diseases',
  'start': 256,
  'end': 279},
 {'entity_group': 'Disease',
  'score': 0.979,
  'word': 'liver damage',
  'start': 398,
  'end': 410}]

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
import gradio as gr

def predict_ner(text):
    """Handler for the Gradio interface: tag the entities found in `text`.

    The text is passed to the `token_classifier` pipeline and the pipeline's
    predictions are returned as they are, without any reformatting.
    """
    return token_classifier(text)

# Input widget: a three-line text box with a placeholder hint.
text_input = gr.Textbox(lines=3, placeholder="Enter your text here...")
# Output widget: the predictions rendered as JSON for detailed results.
json_output = gr.JSON()

# Wire the prediction function to the two widgets.
interface = gr.Interface(
    fn=predict_ner,
    inputs=text_input,
    outputs=json_output,
    title="Medical NER Model",
    description="Enter a sentence to identify medical entities(chemical/diseases) using the Hugging Face model.",
)

# Start serving the demo.
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aff300c6de02b4f865.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


