# T5_base

In [1]:
# !pip install transformers
# !pip install sentencepiece

In [2]:
import os
import torch
from torch import cuda, nn, optim
from transformers import TrainingArguments, Trainer, logging
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Load model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

## Data Preprocessing

In [4]:
woz_directory = 'data/'

with open(woz_directory + 'WOZ_train_utt.txt') as q:
    questions = q.readlines()
    for i in range(len(questions)):
        questions[i] = questions[i].strip('\n')

In [5]:
def preprocess_data(utterances, answers):
    with open(woz_directory + utterances) as q:
        input_texts = q.readlines()
        for i in range(len(input_texts)):
            input_texts[i] = input_texts[i].strip('\n')
            input_texts[i] = 'Please generate the slots and intent from this dialogue: ' + input_texts[i]
    
    with open(woz_directory + answers) as a:
        output_texts = a.readlines()
        for i in range(len(output_texts)):
            output_texts[i] = output_texts[i].strip('\n')
    print(input_texts[0], output_texts[0])
    
    inputs = tokenizer(input_texts, return_token_type_ids=False)
    outputs = tokenizer(output_texts, return_token_type_ids=False)
    return inputs, outputs

In [6]:
train_inputs, train_outputs = preprocess_data(
    'WOZ_train_utt.txt', 'WOZ_train_ans.txt'
)
dev_inputs, dev_outputs = preprocess_data(
    'WOZ_dev_utt.txt', 'WOZ_dev_ans.txt'
)
test_inputs, test_outputs = preprocess_data(
    'WOZ_test_utt.txt', 'WOZ_test_ans.txt'
)

Please generate the slots and intent from this dialogue: Guten Tag, I am staying overnight in Cambridge and need a place to sleep. I need free parking and internet. find_hotel|hotel-area=centre|hotel-internet=yes|hotel-parking=yes
Please generate the slots and intent from this dialogue: I'm looking for a local place to dine in the centre that serves chinese food. find_restaurant|restaurant-area=centre|restaurant-food=chinese
Please generate the slots and intent from this dialogue: Hello, I am looking for a restaurant in Cambridge. I believe it is called Golden Wok. find_restaurant|restaurant-name=golden wok


In [7]:
print(len(train_inputs['input_ids']))
print(train_inputs['input_ids'][0])
print(train_inputs[0].ids)
print(train_inputs[0].type_ids)
print(train_inputs[0].tokens)
print(train_inputs[0].offsets)
print(train_inputs[0].attention_mask)
print(train_inputs[0].special_tokens_mask)
print(train_inputs[0].overflowing)

3760
[863, 3806, 8, 9653, 11, 9508, 45, 48, 7478, 10, 7756, 35, 3284, 6, 27, 183, 6597, 8521, 16, 10096, 11, 174, 3, 9, 286, 12, 2085, 5, 27, 174, 339, 3078, 11, 1396, 5, 1]
[863, 3806, 8, 9653, 11, 9508, 45, 48, 7478, 10, 7756, 35, 3284, 6, 27, 183, 6597, 8521, 16, 10096, 11, 174, 3, 9, 286, 12, 2085, 5, 27, 174, 339, 3078, 11, 1396, 5, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['▁Please', '▁generate', '▁the', '▁slots', '▁and', '▁intent', '▁from', '▁this', '▁dialogue', ':', '▁Gut', 'en', '▁Tag', ',', '▁I', '▁am', '▁staying', '▁overnight', '▁in', '▁Cambridge', '▁and', '▁need', '▁', 'a', '▁place', '▁to', '▁sleep', '.', '▁I', '▁need', '▁free', '▁parking', '▁and', '▁internet', '.', '</s>']
[(0, 6), (6, 15), (15, 19), (19, 25), (25, 29), (29, 36), (36, 41), (41, 46), (46, 55), (55, 56), (56, 60), (60, 62), (62, 66), (66, 67), (67, 69), (69, 72), (72, 80), (80, 90), (90, 93), (93, 103), (103, 107), (107, 112), (112, 113),

In [8]:
class DiologueDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        # target_ids = self.inputs['input_ids'][idx]

        target_ids = self.outputs['input_ids'][idx]
        target_attention_mask = self.outputs['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "output_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_output = [torch.LongTensor(example['output_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_output, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}

def to_device(data, device):
    new_data = {}
    for k in data:
        new_data[k] = data[k].to(device)
    return new_data

In [9]:
train_dataset = DiologueDataset(train_inputs, train_outputs)
train_loader = DataLoader(train_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

dev_dataset = DiologueDataset(dev_inputs, dev_outputs)
dev_loader = DataLoader(dev_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)


## Train model

In [10]:
@torch.no_grad()
def evaluate(model:nn.Module, eval_loader:DataLoader):
    eval_loss = 0.0
    correct = 0
    total = 0
    model.eval()
    print("eval_loader len:", len(eval_loader))
    for batch in eval_loader:
        batch = to_device(batch, device)
        output = model(**batch)
        loss = output.loss
        eval_loss += loss.item()
        pred = output.logits.argmax(-1)
        label = batch["labels"]
        correct += torch.where(label!=0, pred==label, 0).sum().item()
        total += torch.sum(label!=0).item()

    eval_acc = correct / total
    eval_loss = eval_loss / len(eval_loader) 
    print(total, correct)
    return eval_acc, eval_loss

In [11]:
epoches = 1
optimizer = optim.Adam(model.parameters(), lr=3e-5)     # previous version 5e-5 looks better
model.train()

for epoch in range(epoches):
    epoch_loss = 0.0
    log_loss = 0.0
    for idx, batch in enumerate(train_loader):
        model.zero_grad()
        batch = to_device(batch, device)
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        log_loss += loss.item()

        # wandb.log({'batch':idx, 'train_loss': loss.item()})
        # wandb.log({'batch':idx, 'accumulated_train_loss_in_this_1k_batches': log_loss})

        if idx % 150 == 0:
            print(f"Train Step: {idx} Loss: {log_loss / 150}")
            log_loss = 0.0
    print(f"Epoch: {epoch+1} Loss is: {epoch_loss}")
    eval_acc, eval_loss = evaluate(model, dev_loader)
    print(f"Epoch {epoch+1} Eval Acc: {eval_acc}; Eval Loss: {eval_loss}")

Train Step: 0 Loss: 0.03238400777180989
Train Step: 150 Loss: 2.1733781623840334
Train Step: 300 Loss: 0.8113558496907354
Train Step: 450 Loss: 0.5311556944002708
Train Step: 600 Loss: 0.48901610673715673
Train Step: 750 Loss: 0.4515633435640484
Train Step: 900 Loss: 0.36205762611702086
Train Step: 1050 Loss: 0.3129330038403471
Train Step: 1200 Loss: 0.3333235457198073
Train Step: 1350 Loss: 0.27230547653511167
Train Step: 1500 Loss: 0.2871263163156497
Train Step: 1650 Loss: 0.2551037438927839
Train Step: 1800 Loss: 0.231920765908435
Train Step: 1950 Loss: 0.1989355744017909
Train Step: 2100 Loss: 0.1815970986895263
Train Step: 2250 Loss: 0.21971997374047836
Train Step: 2400 Loss: 0.1676079621181513
Train Step: 2550 Loss: 0.1953194213612005
Train Step: 2700 Loss: 0.20159890882360437
Train Step: 2850 Loss: 0.15450358804761588
Train Step: 3000 Loss: 0.15862636485602707
Train Step: 3150 Loss: 0.16593955893302337
Train Step: 3300 Loss: 0.14067676994076464
Train Step: 3450 Loss: 0.134239411

## Evaluation

In [12]:
@torch.no_grad()
def generate_slots(model, loader):

    all_preds = []
    all_labels = []
    model.eval()
    for batch in loader:
        batch = to_device(batch, device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, return_dict_in_generate=True, pad_token_id=0, max_length=512, top_k=15)
        truncated_outputs = []

        decode_texts = tokenizer.batch_decode([l[l != 0] for l in outputs["sequences"]])
        gold_texts = tokenizer.batch_decode([l[l != 0] for l in labels])

        for gold, decode in zip(gold_texts, decode_texts):
            l = gold.replace('</s>', '')
            p = decode.replace('</s>', '')
            all_labels.append(l)
            all_preds.append(p)
        # print(decode_texts)
        # print(gold_texts)
        # break
    
    return all_preds, all_labels

def accuracy(sys, gold):
    correct = 0
    total = 0
    for s, g in zip(sys, gold):
        total += 1
        if s == g:
            correct += 1
    
    accuracy = correct/total
    return accuracy

In [13]:
# validation set
sys, gold = generate_slots(model, dev_loader)

In [14]:
acc = accuracy(sys, gold)
print(f"Accuracy for Validation set is {acc}")

Accuracy for Validation set is 0.8426150121065376


### Test Set

In [15]:
test_dataset = DiologueDataset(test_inputs, test_outputs)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)
pred, gold = generate_slots(model, test_loader)

In [16]:
print(len(pred), pred[:2])
accuracy(pred, gold)

400 ['find_restaurant|restaurant-name=golden wok', 'find_hotel|hotel-internet=yes|hotel-type=guesthouse']


0.8825

In [17]:
with open (woz_directory+'WOZ_test_ans.txt') as t:
    gold_provided = t.readlines()
    for i in range(len(gold_provided)):
        gold_provided[i] = gold_provided[i].strip('\n ')

In [18]:
accuracy(gold_provided, gold)

1.0

In [19]:
accuracy(pred, gold_provided)

0.8825

In [20]:
for p, g in zip(pred, gold_provided):
    if p != g:
        print('p: ', p, '\n', 'g: ', g)

p:  find_hotel|hotel-internet=yes|hotel-type=guesthouse 
 g:  find_hotel|hotel-internet=yes
p:  find_hotel|hotel-stars=2|hotel-type=guesthouse 
 g:  find_hotel|hotel-stars=2
p:  find_hotel|hotel-name=sala thong 
 g:  find_restaurant|restaurant-name=sala thong
p:  find_restaurant|restaurant-area=west|restaurant-food=african 
 g:  find_restaurant|restaurant-area=west|restaurant-food=afghan
p:  find_restaurant|restaurant-name=slug and lettuce restaurant 
 g:  find_restaurant|restaurant-name=the slug and lettuce
p:  find_hotel|hotel-pricerange=cheap|hotel-type=guesthouse 
 g:  find_hotel|hotel-pricerange=cheap|hotel-stars=4
p:  find_restaurant|restaurant-food=African|restaurant-pricerange=expensive 
 g:  find_restaurant|restaurant-food=african|restaurant-pricerange=expensive
p:  find_restaurant|restaurant-area=centre|restaurant-food=north b and b 
 g:  find_hotel|hotel-name=city centre north b and b
p:  find_restaurant|restaurant-area=centre|restaurant-food=brazliian 
 g:  find_restaurant|

In [21]:
with open ('allard_a_pred_t5.csv', 'w') as fout:
    fout.write('ID,Expected\n')
    for i, p in enumerate(pred):
        fout.write(str(i)+','+p+'\n')