In [1]:
import datasets
import pandas as pd
import transformers
import torch
import os
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm as tqdm
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
device = torch.device("cuda")

In [2]:
data = datasets.load_dataset(path='multi_woz_v22', cache_dir='/data/.cache/huggingface/datasets')

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/data/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2', cache_dir='/data/.cache/huggingface/transformers', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForTokenClassification.from_pretrained('gpt2', cache_dir='/data/.cache/huggingface/transformers')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.6.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.0.attn.masked_bias', 'h.11.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'classifier.weight', 'h.10.attn.masked_bias', 'h.1.attn.masked_bias', 'classifier.bias', 'h.7.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class EarlyStopping:
	def __init__(self, patience=7, verbose=True, delta=0, path='./checkpoint.pt'):
		self.patience = patience
		self.verbose = verbose
		self.counter = 0
		self.best_score = None
		self.early_stop = False
		self.val_loss_min = float("inf")
		self.delta = delta
		self.path = path

	def __call__(self, val_loss, model):
		if self.patience > 0:
			score = -val_loss

			if self.best_score is None:
				self.best_score = score
				self.save_checkpoint(val_loss, model)
			elif score < self.best_score + self.delta:
				self.counter += 1
				print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
				if self.counter >= self.patience:
					self.early_stop = True
			else:
				self.best_score = score
				self.save_checkpoint(val_loss, model)
				self.counter = 0

	def save_checkpoint(self, val_loss, model):
		if self.verbose:
			print("")
			print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
		torch.save(model.state_dict(), self.path)
		self.val_loss_min = val_loss

In [5]:
class DialogDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer = tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = self.tokenizer.model_max_length if max_length is None else max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        output = {}
        each_utt = self.data[index]['turns']['utterance']
        label = tokenizer(each_utt)['attention_mask']
        for i in range(len(label)):
            label[i][-1] = 0
        token_label = torch.tensor(sum(label, []), dtype=torch.long)[:self.max_length]
        token_label = (~token_label.bool()).float()
        if len(token_label) < self.max_length: # assign padding token label
            token_label = torch.cat([token_label[:self.max_length], torch.zeros(self.max_length - len(token_label))])
        token_dict = tokenizer(' '.join(each_utt), truncation=True, max_length=self.max_length, padding="max_length", return_tensors='pt')
        output['input_ids'], output['attention_mask'] = token_dict['input_ids'], token_dict['attention_mask']
        output['labels'] = token_label.type(torch.LongTensor)
        return output

In [6]:
train_dataset = DialogDataset(data['train'], tokenizer)
valid_dataset = DialogDataset(data['validation'], tokenizer)
test_dataset = DialogDataset(data['test'], tokenizer)

In [7]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [8]:
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
early_stopping = EarlyStopping(patience=3, verbose=True, path=f'saved/best_model.pt')

In [9]:
epochs = 20

In [11]:
train_loss = 0.0
for each_epoch in range(1, epochs):
    model.train()
    for each_batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = each_batch['input_ids'].to(device)
        attention_mask = each_batch['attention_mask'].to(device)
        labels = each_batch['labels'].to(device)
        out = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    with torch.no_grad():
        model.eval()
        valid_loss = 0.0
        for each_batch in tqdm(valid_loader):
            input_ids = each_batch['input_ids'].to(device)
            attention_mask = each_batch['attention_mask'].to(device)
            labels = each_batch['labels'].to(device)
            out = model(input_ids, attention_mask=attention_mask, labels=labels)
            valid_loss += out.loss.item()
        print(f'Epoch {each_epoch}: Train Loss: {train_loss / len(train_loader):.4f}, Valid Loss: {valid_loss / len(valid_loader):.4f}')

        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print(f"Early stopping at {epoch-early_stopping_patience} !")
            break

  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 1: Train Loss: 0.1031, Valid Loss: 0.0872

Validation loss decreased (inf --> 5.490523).  Saving model ...


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 2: Train Loss: 0.1875, Valid Loss: 0.0853

Validation loss decreased (5.490523 --> 5.371929).  Saving model ...


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 3: Train Loss: 0.2630, Valid Loss: 0.0810

Validation loss decreased (5.371929 --> 5.101747).  Saving model ...


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 4: Train Loss: 0.3303, Valid Loss: 0.0846
EarlyStopping counter: 1 out of 3


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 5: Train Loss: 0.3888, Valid Loss: 0.0864
EarlyStopping counter: 2 out of 3


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 6: Train Loss: 0.4392, Valid Loss: 0.0949
EarlyStopping counter: 3 out of 3


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 7: Train Loss: 0.4831, Valid Loss: 0.1028
EarlyStopping counter: 4 out of 3


  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 8: Train Loss: 0.5220, Valid Loss: 0.0988
EarlyStopping counter: 5 out of 3


  0%|          | 0/528 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
def test_model(model, test_loader):
	model.eval()
	pred = []
	with torch.no_grad():
		for each_batch in tqdm(test_loader):
			input_ids = each_batch['input_ids'].to(device)
			attention_mask = each_batch['attention_mask'].to(device)
			out = model(input_ids, attention_mask=attention_mask)
			pred.append(out.logits.cpu().numpy())
	return pred

In [26]:
def get_pred(pred):
	prediction = []
	for each_pred in pred:
		prediction.append(each_pred[0])
	return prediction

In [27]:
test_pred = test_model(model, test_loader)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [50]:
len(test_pred)

1000

In [30]:
test_pred[0].shape

(1, 1, 1024, 2)

In [46]:
prob = torch.nn.functional.sigmoid(torch.tensor(test_pred[0].squeeze())[:,1])

In [66]:
v, ind = prob.sort(descending=True)
prob.sort(descending=True)

torch.return_types.sort(
values=tensor([0.8800, 0.8568, 0.8547,  ..., 0.0011, 0.0010, 0.0010]),
indices=tensor([ 30, 159,  64,  ..., 160, 659,  20]))

In [68]:
v[:10], ind[:10]

(tensor([0.8800, 0.8568, 0.8547, 0.8234, 0.7969, 0.7779, 0.7765, 0.6645, 0.5243,
         0.3890]),
 tensor([ 30, 159,  64,  84,  44, 139, 118, 172, 144, 138]))

In [60]:
(test_dataset[0]['labels']==1).int().nonzero()

tensor([[  9],
        [ 30],
        [ 44],
        [ 64],
        [ 84],
        [118],
        [139],
        [159],
        [172],
        [179]])

In [70]:
acc = 0
for each_label in (test_dataset[0]['labels']==1).int().nonzero().squeeze():
    if each_label in ind[:10]:
       acc+=1
    else:
        print(each_label)
print(f'The first sample data has {acc/10} accuracy') 
    

tensor(9)
tensor(179)
The first sample data has 0.8 accuracy


In [76]:
test_dataset[0]['input_ids'].squeeze()[(test_dataset[0]['labels']==1).int().nonzero().squeeze()]

tensor([9458,   30,   13,   30,   13,   30,   30,   30,    0,   13])

In [77]:
tokenizer.decode(test_dataset[0]['input_ids'].squeeze()[(test_dataset[0]['labels']==1).int().nonzero().squeeze()]) # real turn.

'bridge?.?.???!.'

In [78]:
tokenizer.decode(test_dataset[0]['input_ids'].squeeze()[ind[:10]]) # predict turn.

'???..??! cinema cinema'

기호 전처리 필요,,,,,

In [39]:
torch.tensor(test_pred[0])[:,:,:,1]

tensor([[[-5.5570, -5.4620, -4.2655,  ..., -5.7574, -5.7514, -4.8050]]])

In [15]:
predictions = get_pred(test_pred)

In [21]:
test_dataset[0]

{'input_ids': tensor([[   40,   761,  4512,  ..., 50256, 50256, 50256]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0, 0,  ..., 0, 0, 0])}

In [23]:
predictions[0].sum(), 

9

https://huggingface.co/transformers/v3.0.2/_modules/transformers/modeling_bert.html#BertForTokenClassification

AutoModelForTokenClassification: 
1. sequence_output = AutoModel 의 output(각 토큰의 vector)
2. sequence_output = dropout(config.hidden_dropout_prob)
3. logits = Linear(token_dim -> num_labels)
4. loss = CrossEntropy 인데, attention_mask가 1인 토큰만 계산. logits, labels 모두