<a href="https://colab.research.google.com/github/charishma27/IPMP/blob/main/ner_model_with_taskmaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install transformers

In [None]:
import json
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

In [None]:
!wget https://raw.githubusercontent.com/google-research-datasets/Taskmaster/master/TM-3-2020/data/data_00.json

--2022-09-28 17:13:20--  https://raw.githubusercontent.com/google-research-datasets/Taskmaster/master/TM-3-2020/data/data_00.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15832328 (15M) [text/plain]
Saving to: ‘data_00.json’


2022-09-28 17:13:20 (137 MB/s) - ‘data_00.json’ saved [15832328/15832328]



In [None]:
f = open('data_00.json', 'r')
dataset_raw = json.load(f)

In [None]:
ontology = ["name.movie", "name.theater", "date.showing", "duration.movie", "location", "name.person", 
            "num.tickets", "price.ticket", "price.total", "time.preference", "time.showing"] 

In [None]:
dataset = []
for conversation in dataset_raw:
  # all_utterances = ""
  # all_labels = ""
  for utterance in conversation['utterances']:
    cur_text = utterance['text']
    start_indices = [0]
    start_indices += [i+1 for i, char in enumerate(cur_text) if char == ' ']
    words = cur_text.split()
    indices_to_words = dict(zip(start_indices, words))
    ner_tags = ['O'] * len(words)
    try:
      for segment in utterance['segments']:
        for i in range(len(start_indices)):
          if start_indices[i] == segment['start_index']:
            BIO_tag = 'B-'
          elif start_indices[i] > segment['start_index'] and start_indices[i] < segment['end_index']:
            BIO_tag = 'I-'
          else:
            continue
 
          if ner_tags[i] != 'O':
            # todo : add outer break in case outer loop has remainder section
            break
#          elif segment['annotations'][0]['name'] in ontology:
          else:
            ner_tags[i] = BIO_tag + segment['annotations'][0]['name'].split('.')[0]
    except:
      pass
    dataset.append([cur_text, ' '.join(ner_tags)])  
  #   all_utterances += cur_text + ' '
  #   all_labels += ' '.join(ner_tags) + ' '
  # dataset.append([all_utterances, all_labels])

df = pd.DataFrame(dataset, columns=['text', 'labels'])
df.head(20)

Unnamed: 0,text,labels
0,hi....am buying a ticket tonight so we go and ...,O O O O B-date O O O O O O O O B-name I-name I...
1,No problem. Is there a particular type of movi...,O O O O O O O O O O O O
2,hhhmmmmm not at all. i dont have any in mind f...,O O O O O O O O O O O O
3,Sure. I can help with that. Let me listings at...,O O O O O O O O O O O I-name I-name
4,sure you can but i want to see the movie at AM...,O O O O O O O O O O O B-name I-name I-name
5,"Oh, sorry about that. So you’re interested in ...",O O O O O O O O B-name O O B-name I-name I-name O
6,yeah,O
7,OK. I show one action movie playing at AMC Mou...,O O O O B-name O O O B-name I-name I-name B-na...
8,yeah but 9.10pm will be perfect for me,O O B-time O O O O O
9,Great. And how many tickets?,O O O O O


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
label_all_tokens = False

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [None]:
#df = df[0:1000]

labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()

for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])


In [None]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [11]:
def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  0.430 | Accuracy:  nan | Val_Loss:  0.278 | Accuracy:  0.916


100%|██████████| 9624/9624 [31:42<00:00,  5.06it/s]


Epochs: 2 | Loss:  0.256 | Accuracy:  nan | Val_Loss:  0.257 | Accuracy:  0.924


100%|██████████| 9624/9624 [31:43<00:00,  5.05it/s]


Epochs: 3 | Loss:  0.204 | Accuracy:  nan | Val_Loss:  0.216 | Accuracy:  0.935


100%|██████████| 9624/9624 [31:42<00:00,  5.06it/s]


Epochs: 4 | Loss:  0.163 | Accuracy:  nan | Val_Loss:  0.214 | Accuracy:  0.939


100%|██████████| 9624/9624 [31:43<00:00,  5.06it/s]


Epochs: 5 | Loss:  0.141 | Accuracy:  nan | Val_Loss:  0.223 | Accuracy:  0.945


In [12]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    label_prediction_dict = {}

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)
            # print(loss, logits)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_test += acc
              
              predicted_labels = [ids_to_labels[i] for i in predictions.tolist()] 
              true_labels = [ids_to_labels[i] for i in label_clean.tolist()]
              
              for i in range(len(true_labels)):
                if true_labels[i] not in label_prediction_dict.keys():
                  label_prediction_dict.update({true_labels[i] : [0,0]})
                label_prediction_dict[true_labels[i]][1] += 1
                if predicted_labels[i] == true_labels[i]:
                  label_prediction_dict[true_labels[i]][0] += 1


    val_accuracy = total_acc_test / len(df_test)

    label_accurate_count = []
    label_total_count = []
    label_accuracy = []
    for key in label_prediction_dict.keys():
      label_accuracy.append(label_prediction_dict[key][0] / label_prediction_dict[key][1]) 
      label_accurate_count.append(label_prediction_dict[key][0])
      label_total_count.append(label_prediction_dict[key][1])
    tokenwise_evaluation = pd.DataFrame(list(zip(label_prediction_dict.keys(), label_accuracy, label_accurate_count, label_total_count)), columns = ['labels', 'accuracy', 'accurate labels', 'total labels'])
    print(tokenwise_evaluation)
    print(f'\nTest Accuracy: {total_acc_test / len(df_test): .3f}')
    print('Accuracy average (non-weighted)', tokenwise_evaluation['accuracy'].mean())

evaluate(model, df_test)


           labels  accuracy  accurate labels  total labels
0               O  0.964149            18906         19609
1          B-name  0.788825              960          1217
2          B-time  0.844920              474           561
3          I-time  0.720779              111           154
4          I-name  0.839442             1443          1719
5       B-seating  0.157895                3            19
6       I-seating  0.736842               42            57
7           B-num  0.916667              220           240
8          B-date  0.828729              150           181
9         B-price  0.583333                7            12
10     B-location  0.802469              130           162
11     I-location  0.855932              202           236
12         B-type  0.700000               70           100
13  B-description  0.137615               15           109
14  I-description  0.454822              599          1317
15          I-num  0.638889               69           1

In [13]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
            
evaluate_one_text(model, 'Bill Gates is the founder of Microsoft')

Bill Gates is the founder of Microsoft
['B-name', 'I-name', 'O', 'O', 'O', 'O', 'O']


In [14]:
evaluate_one_text(model,'I met Kiran at 5 pm')

I met Kiran at 5 pm
['O', 'O', 'B-name', 'O', 'B-time', 'I-time']


In [15]:
evaluate_one_text(model,'We are going to KFC on 5th March')

We are going to KFC on 5th March
['O', 'O', 'O', 'O', 'B-name', 'O', 'B-date', 'I-date']


In [16]:
evaluate_one_text(model,'I met Kiran at 5')

I met Kiran at 5
['O', 'O', 'B-name', 'O', 'B-time']


In [17]:
evaluate_one_text(model,'I met Kiran at 5 pm on Saturday')

I met Kiran at 5 pm on Saturday
['O', 'O', 'B-name', 'O', 'B-time', 'I-time', 'O', 'B-date']


In [18]:
evaluate_one_text(model,'Why is John in Andhra Pradesh ?')

Why is John in Andhra Pradesh ?
['O', 'O', 'B-name', 'O', 'B-location', 'I-location', 'O']


In [19]:
evaluate_one_text(model,'Why is Charishma in Andhra Pradesh ?')

Why is Charishma in Andhra Pradesh ?
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'O']


In [20]:
evaluate_one_text(model, "The director of Stella’s House is Clyde Stewart.	hi....am buying a ticket tonight so we go and see a movie at AMC mountain 16 No problem. Is there a particular type of movie you’re looking for? hhhmmmmm not at all. i dont have any in mind for now Sure. I can help with that. Let me listings at AMC Mercado 24. sure you can but i want to see the movie at AMC mountain 16 Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? yeah OK. I show one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does any of those work? yeah but 9.10pm will be perfect for me Great. And how many tickets? myself and two other persons are going to see a movie All right. Let me confirm that you’d like three tickets for No Time To Die at AMC Mountain 16 tonight at 9:10pm. Is that all correct? yeah Is it OK to go ahead and purchase these tickets? yes you can OK. Your tickets are purchased and details for how to proceed have been sent to your email address. Can I help with anything else? ok thanks but i dont need anything again OK. Enjoy your movie! ")

The director of Stella’s House is Clyde Stewart.	hi....am buying a ticket tonight so we go and see a movie at AMC mountain 16 No problem. Is there a particular type of movie you’re looking for? hhhmmmmm not at all. i dont have any in mind for now Sure. I can help with that. Let me listings at AMC Mercado 24. sure you can but i want to see the movie at AMC mountain 16 Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? yeah OK. I show one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does any of those work? yeah but 9.10pm will be perfect for me Great. And how many tickets? myself and two other persons are going to see a movie All right. Let me confirm that you’d like three tickets for No Time To Die at AMC Mountain 16 tonight at 9:10pm. Is that all correct? yeah Is it OK to go ahead and purchase these tickets? yes you can OK. Your tickets are purchased and details for how to proceed have bee