<a href="https://colab.research.google.com/github/dileep9968/bert-ner-on-mit-restaurants-dataset-/blob/main/bert_ner_on_mit_restaurants_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd
import json
import requests
import torch

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
train = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio', sep='\t', header = None)
train.head()

Unnamed: 0,0,1
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside


In [5]:
def create_ner_dataset_from_text(url):
  response = requests.get(url)
  response = response.text
  response = response.splitlines()
  temp_tokens = []
  temp_tags = []
  train_token = []
  train_tags = []
  for line in response:
    if line != '':
      tag, token = line.strip().split('\t')
      temp_tags.append(tag)
      temp_tokens.append(token)
    else:
      train_token.append(temp_tokens)
      train_tags.append(temp_tags)
      temp_tokens, temp_tags = [],[]

  return train_token, train_tags

In [6]:
train_url = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio"
test_url = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/test.bio'
train_token, train_tags = create_ner_dataset_from_text(train_url)
test_token, test_tags = create_ner_dataset_from_text(test_url)
print(f"Length of train tokens: {len(train_token)}")
print(f"Length of train tags: {len(train_tags)}")
print(f"Length of test tokens: {len(test_token)}")
print(f"Length of test tags: {len(test_tags)}")

Length of train tokens: 7659
Length of train tags: 7659
Length of test tokens: 1520
Length of test tags: 1520


## HuggingFace Dataset

In [7]:
from datasets import Dataset, DatasetDict

train_df = pd.DataFrame({'tokens': train_token, 'ner_tags_str': train_tags})
test_df = pd.DataFrame({'tokens': test_token, 'ner_tags_str': test_tags})
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)
dataset = DatasetDict({'train':dataset_train,
                       'test':dataset_test,
                       'valid':dataset_test})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [8]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [9]:
unique_tag = set()
for tag in dataset['train']['ner_tags_str']:
  for t in tag:
    unique_tag.add(t)

unique_tags=list(set([x[2:] for x in list(unique_tag) if x !='O']))

tag2index = {'O':0}
for i , tag in enumerate(unique_tags):
  tag2index[f'B-{tag}'] = len(tag2index)
  tag2index[f'I-{tag}'] = len(tag2index)

index2tag = {i:tag for tag, i in tag2index.items()}

tag2index, index2tag

({'O': 0,
  'B-Hours': 1,
  'I-Hours': 2,
  'B-Price': 3,
  'I-Price': 4,
  'B-Rating': 5,
  'I-Rating': 6,
  'B-Restaurant_Name': 7,
  'I-Restaurant_Name': 8,
  'B-Location': 9,
  'I-Location': 10,
  'B-Cuisine': 11,
  'I-Cuisine': 12,
  'B-Amenity': 13,
  'I-Amenity': 14,
  'B-Dish': 15,
  'I-Dish': 16},
 {0: 'O',
  1: 'B-Hours',
  2: 'I-Hours',
  3: 'B-Price',
  4: 'I-Price',
  5: 'B-Rating',
  6: 'I-Rating',
  7: 'B-Restaurant_Name',
  8: 'I-Restaurant_Name',
  9: 'B-Location',
  10: 'I-Location',
  11: 'B-Cuisine',
  12: 'I-Cuisine',
  13: 'B-Amenity',
  14: 'I-Amenity',
  15: 'B-Dish',
  16: 'I-Dish'})

In [10]:
dataset =dataset.map(lambda x: {'ner_tags': [tag2index[tag] for tag in x['ner_tags_str']]})

dataset

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
})

In [11]:
dataset['train'][:1]

{'tokens': [['2', 'start', 'restaurants', 'with', 'inside', 'dining']],
 'ner_tags_str': [['B-Rating',
   'I-Rating',
   'O',
   'O',
   'B-Amenity',
   'I-Amenity']],
 'ner_tags': [[5, 6, 0, 0, 13, 14]]}

## Model Building

In [12]:
from transformers import AutoTokenizer

In [13]:
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
def tokenize_and_align_labels(examples):
  tokenized_input = tokenizer(examples['tokens'], truncation=True,is_split_into_words=True)
  labels =[]
  for i, label in enumerate(examples['ner_tags']):
    word_ids = tokenized_input.word_ids(batch_index=i)

    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None:
        label_ids.append(-100)
      elif word_idx != previous_word_idx:
        label_ids.append(label[word_idx])
      else:
        label_ids.append(-100)

      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_input['labels'] = labels
  return tokenized_input

In [15]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [5, 6, 0, 9, 10, 10],
 'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 5, 6, 0, -100, -100, 9, 10, 10, -100]}

## Data Collation and Metrics

In [17]:
!pip install seqeval
!pip install evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=69aded5f30b1d48b207d1e3e31dd3c82cdb74338eb6a75fee43b21b1c23eb2a7
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [19]:
import evaluate
import numpy as np
metric = evaluate.load('seqeval')
label_names = list(tag2index)
label_names

def compute_metrics(eval_preds):
  logits, labels = eval_preds

  prediction = np.argmax(logits, axis=-1)
  true_labels =[[label_names[l] for l in label if l != -100] for label in labels]
  true_predictions =[[label_names[p] for (p,l) in zip(prediction, label) if l != -100]
                     for prediction, label in zip(prediction, labels)]
  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {
      'precision': all_metrics['overall_precision'],
      'recall': all_metrics['overall_recall'],
      'f1': all_metrics['overall_f1'],
      'accuracy': all_metrics['overall_accuracy']
  }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Model Traning

In [20]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_ckpt,
                                                        num_labels=len(tag2index),
                                                        id2label=index2tag,
                                                        label2id=tag2index).to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

In [22]:
training_args = TrainingArguments(
    'finetuned-ner',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)



In [23]:
trainer = Trainer(
    model = model,
    args  = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [24]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 18


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6345,0.303764,0.736827,0.790159,0.762561,0.908408
2,0.2484,0.280107,0.77382,0.806984,0.790054,0.916971
3,0.2026,0.282441,0.775076,0.809524,0.791925,0.917602


TrainOutput(global_step=2874, training_loss=0.31388713223451364, metrics={'train_runtime': 203.0485, 'train_samples_per_second': 113.16, 'train_steps_per_second': 14.154, 'total_flos': 105239751014754.0, 'train_loss': 0.31388713223451364, 'epoch': 3.0})

## Save model

In [38]:
trainer.save_model('ner_model')  # Saves model weights and config
tokenizer.save_pretrained('ner_model')  # Saves tokenizer files


('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

In [34]:
!rm -rf ner_model

## Load and Prediction

In [40]:
from transformers import pipeline

In [42]:
ner_rec = pipeline('token-classification', model='ner_model',
                   aggregation_strategy='simple', device = device)

In [44]:
ner_rec('Which resturant the best sushi in new delhi')

[{'entity_group': 'Rating',
  'score': 0.9594138,
  'word': 'best',
  'start': 20,
  'end': 24},
 {'entity_group': 'Dish',
  'score': 0.77622414,
  'word': 'sushi',
  'start': 25,
  'end': 30},
 {'entity_group': 'Location',
  'score': 0.9594803,
  'word': 'new delhi',
  'start': 34,
  'end': 43}]