# NER Restaurant Search with TinyBERT

## Import dataset

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd

The dataset is from these links. The first one will be used on this project.
- https://groups.csail.mit.edu/sls/downloads/restaurant/
- https://huggingface.co/datasets/tner/mit_restaurant

In [4]:
df = pd.read_csv("datasets/ner_train.bio", sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside


## Data Preprocessing

In [5]:
with open("datasets/ner_train.bio", "r") as f:
    texts = f.readlines()
print(texts[:2])

# Remove new line char
texts = [text.replace("\n", "") for text in texts]
print(texts[:2])

['B-Rating\t2\n', 'I-Rating\tstart\n']
['B-Rating\t2', 'I-Rating\tstart']


We want to make the format become something like this:
```
{
    'tags': [0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 4, 0],
    'tokens': ['can', 'you', 'find', 'the', 'phone', 'number', 'for', 'the', 'closest', 'family', 'style', 'restaurant']
}
```

In [6]:
texts[9:20]

['B-Rating\t5',
 'I-Rating\tstar',
 'O\tresturants',
 'B-Location\tin',
 'I-Location\tmy',
 'I-Location\ttown',
 '',
 'O\t98',
 'B-Restaurant_Name\thong',
 'I-Restaurant_Name\tkong',
 'O\trestaurant']

Notice that if the line is empty, then the next line is a new sentence.

In [7]:
train_tags = []
train_tokens = []

temp_tags = []
temp_tokens = []
for line in texts:
    if line != "":
        tag, token = line.split('\t')
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        train_tags.append(temp_tags)
        train_tokens.append(temp_tokens)
        temp_tags, temp_tokens = [], []

In [8]:
with open("datasets/ner_test.bio", "r") as f:
    texts = f.readlines()
print(texts[:2])

# Remove new line char
texts = [text.replace("\n", "") for text in texts]
print(texts[:2])

test_tags = []
test_tokens = []

temp_tags = []
temp_tokens = []
for line in texts:
    if line != "":
        tag, token = line.split('\t')
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        test_tags.append(temp_tags)
        test_tokens.append(temp_tokens)
        temp_tags, temp_tokens = [], []

['O\ta\n', 'B-Rating\tfour\n']
['O\ta', 'B-Rating\tfour']


### Split train, val, test

In [9]:
import math
total_val = math.floor(len(test_tokens) / 3)
total_val

506

In [10]:
validation_tags = test_tags[:total_val]
validation_tokens = test_tokens[:total_val]
test_tags = test_tags[total_val:]
test_tokens = test_tokens[total_val:]

In [11]:
len(train_tags), len(train_tokens), len(validation_tags), len(validation_tokens), len(test_tags), len(test_tokens)

(7659, 7659, 506, 506, 1014, 1014)

### Prepare datasetdict

In [12]:
df_train = pd.DataFrame({'tokens': train_tokens, 'tags_str': train_tags})
df_validation = pd.DataFrame({'tokens': validation_tokens, 'tags_str': validation_tags})
df_test = pd.DataFrame({'tokens': test_tokens, 'tags_str': test_tags})

In [13]:
from datasets import Dataset, DatasetDict

In [14]:
dataset = DatasetDict(
    {
        'train': Dataset.from_pandas(df_train, preserve_index=False),
        'validation': Dataset.from_pandas(df_validation, preserve_index=False),
        'test': Dataset.from_pandas(df_test, preserve_index=False)
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_str'],
        num_rows: 7659
    })
    validation: Dataset({
        features: ['tokens', 'tags_str'],
        num_rows: 506
    })
    test: Dataset({
        features: ['tokens', 'tags_str'],
        num_rows: 1014
    })
})

In [15]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

### Prepare tag2index and index2tag

In [16]:
unique_tags = set()
for tags in dataset['train']['tags_str']:
    unique_tags.update(tags)

In [17]:
unique_tags = list(set(tag.split('-')[1] for tag in unique_tags if tag != 'O'))
unique_tags

['Hours',
 'Rating',
 'Price',
 'Restaurant_Name',
 'Location',
 'Dish',
 'Amenity',
 'Cuisine']

In [18]:
tag2index = {"O": 0}
for i, tag in enumerate(unique_tags):
    tag2index[f"B-{tag}"] = len(tag2index)
    tag2index[f"I-{tag}"] = len(tag2index)

index2tag = {v: k for k, v in tag2index.items()}
print(tag2index)

{'O': 0, 'B-Hours': 1, 'I-Hours': 2, 'B-Rating': 3, 'I-Rating': 4, 'B-Price': 5, 'I-Price': 6, 'B-Restaurant_Name': 7, 'I-Restaurant_Name': 8, 'B-Location': 9, 'I-Location': 10, 'B-Dish': 11, 'I-Dish': 12, 'B-Amenity': 13, 'I-Amenity': 14, 'B-Cuisine': 15, 'I-Cuisine': 16}


### Map dataset with tag2index

In [19]:
dataset = dataset.map(lambda example: {"tags": [tag2index[tag] for tag in example['tags_str']]})
dataset

Map: 100%|██████████| 7659/7659 [00:00<00:00, 15755.84 examples/s]
Map: 100%|██████████| 506/506 [00:00<00:00, 13430.86 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 16336.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_str', 'tags'],
        num_rows: 7659
    })
    validation: Dataset({
        features: ['tokens', 'tags_str', 'tags'],
        num_rows: 506
    })
    test: Dataset({
        features: ['tokens', 'tags_str', 'tags'],
        num_rows: 1014
    })
})

In [20]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity'],
 'tags': [3, 4, 0, 0, 13, 14]}

## Model

### Tokenization and label alignment

In [21]:
from transformers import AutoTokenizer

In [22]:
# model_ckpt = "distilbert/distilbert-base-uncased"
model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [23]:
input = dataset['train'][2]['tokens']
input

['5', 'star', 'resturants', 'in', 'my', 'town']

In [24]:
output = tokenizer(input, is_split_into_words=True)
output

{'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
output.word_ids()

[None, 0, 1, 2, 2, 2, 3, 4, 5, None]

In [26]:
tokenizer.convert_ids_to_tokens(output.input_ids)

['[CLS]', '5', 'star', 'rest', '##ura', '##nts', 'in', 'my', 'town', '[SEP]']

This where the problem emerges. The words that already been tagged are splitted into multiple tokens. That means, all the tokens related to the same word must also have the same tag.

In [27]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, max_length=512)
    
    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
    
        previous_word_id = None
        label_ids = []
        
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100) # -100 means it won't be included in loss calculation
            elif word_id != previous_word_id:
                label_ids.append(label[word_id])
            else:
                label_ids.append(-100)
            previous_word_id = word_id
        
        labels.append(label_ids)
        
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [28]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 7659/7659 [00:00<00:00, 10285.81 examples/s]
Map: 100%|██████████| 506/506 [00:00<00:00, 12412.81 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 13816.37 examples/s]


### Data collation

In [29]:
from transformers import DataCollatorForTokenClassification

In [30]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [31]:
import evaluate
import numpy as np

In [32]:
metric = evaluate.load('seqeval')
label_names = list(tag2index)
label_names

['O',
 'B-Hours',
 'I-Hours',
 'B-Rating',
 'I-Rating',
 'B-Price',
 'I-Price',
 'B-Restaurant_Name',
 'I-Restaurant_Name',
 'B-Location',
 'I-Location',
 'B-Dish',
 'I-Dish',
 'B-Amenity',
 'I-Amenity',
 'B-Cuisine',
 'I-Cuisine']

### Define compute metrics function

In [33]:
def compute_metrics(evaluation_preds):
    logits, labels = evaluation_preds

    # Exclude the -100 labels
    true_labels = [[label_names[l] for l in label if l != -100] 
                   for label in labels]
    
    predictions = np.argmax(logits, axis=-1)
    
    # Exclude the -100
    true_predictions = [[label_names[p] for p, l in zip(prediction, label) if l != -100] 
                        for prediction, label in zip(predictions, labels)]

    metrics = metric.compute(predictions=true_predictions, 
                             references=true_labels)

    return {
        "precision": metrics['overall_precision'],
        "recall": metrics['overall_recall'],
        "f1": metrics['overall_f1'],
        "accuracy": metrics['overall_accuracy'],
    }

### Training the model

In [34]:
import torch
from transformers import AutoModelForTokenClassification, AutoConfig

In [35]:
model_ckpt

'huawei-noah/TinyBERT_General_4L_312D'

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = AutoConfig.from_pretrained(model_ckpt, label2id=tag2index, id2label=index2tag)
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, config=config).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import TrainingArguments

In [42]:
training_args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy='epoch',
    disable_tqdm=False,
    seed=42
)

In [43]:
from transformers import Trainer

In [44]:
trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics, 
                  train_dataset=tokenized_dataset['train'], 
                  eval_dataset=tokenized_dataset['validation'], 
                  data_collator=data_collator,
                  tokenizer=tokenizer)

In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.992,0.587155,0.585406,0.664783,0.622575,0.859553
2,0.5367,0.447729,0.650847,0.723164,0.685103,0.885913
3,0.4273,0.404118,0.681661,0.741996,0.71055,0.895825
4,0.3725,0.387807,0.679422,0.752354,0.71403,0.897933
5,0.3581,0.377438,0.698785,0.758004,0.727191,0.900886


TrainOutput(global_step=4790, training_loss=0.5004753797686423, metrics={'train_runtime': 163.4738, 'train_samples_per_second': 234.258, 'train_steps_per_second': 29.301, 'total_flos': 18846679290180.0, 'train_loss': 0.5004753797686423, 'epoch': 5.0})

In [46]:
trainer.save_model('./models/ner_tinybert')

### Load model and predict

In [47]:
from transformers import pipeline

classifier = pipeline('token-classification', model='models/ner_tinybert')

Device set to use cuda:0


In [48]:
classifier("which restaurant serves the best shushi in new york?")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-Rating',
  'score': np.float32(0.8767938),
  'index': 5,
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity': 'B-Dish',
  'score': np.float32(0.7074773),
  'index': 6,
  'word': 'shu',
  'start': 33,
  'end': 36},
 {'entity': 'B-Dish',
  'score': np.float32(0.40014237),
  'index': 7,
  'word': '##shi',
  'start': 36,
  'end': 39},
 {'entity': 'B-Location',
  'score': np.float32(0.7823262),
  'index': 9,
  'word': 'new',
  'start': 43,
  'end': 46},
 {'entity': 'I-Location',
  'score': np.float32(0.91048986),
  'index': 10,
  'word': 'york',
  'start': 47,
  'end': 51}]