In [1]:
# imports and PAD_TOKEN
PAD_TOKEN = 0
import os
import json

import torch
print(torch.cuda.is_available())
device = 'cuda:0' # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Used to report errors on CUDA side
import torch.utils.data as data

from torch.utils.data import DataLoader

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# import bert
from transformers import BertTokenizer, BertModel, BertForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


True


In [2]:
# datasets
def word2id(raw_dataset):
# returns a dictionary of words and their ids
    words = []
    for entry in raw_dataset:
       words.extend(entry['utterance'].split())
    words = list(set(words))
    words_dict = {'pad': PAD_TOKEN}
    words_dict.update({w:i+1 for i, w in enumerate(words)})
    words_dict['unk'] = len(words_dict)
    return words_dict

def slot2id(raw_dataset):
# returns a dictionary of slots and their ids
    slots = ['pad']
    for entry in raw_dataset:
       slots.extend(entry['slots'].split())
    slots = list(set(slots))
    slots_dict = {s:i for i, s in enumerate(slots)}
    return slots_dict

def intent2id(raw_dataset):
# returns a dictionary of intents and their ids
    intents = [entry['intent'] for entry in raw_dataset]
    intents = list(set(intents))
    intents_dict = {inte:i for i, inte in enumerate(intents)}
    return intents_dict

class Lang():
    def __init__(self, train_raw, dev_raw, test_raw):
        self.word2id = word2id(train_raw)
        self.slot2id = slot2id(train_raw + dev_raw + test_raw)
        self.intent2id = intent2id(train_raw + dev_raw + test_raw)
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.id2slot = {v:k for k, v in self.slot2id.items()}
        self.id2intent = {v:k for k, v in self.intent2id.items()}
        # self.intent_list = list(set(list(self.intent2id.keys())))
        # self.slot_list = list(set(list(self.slot2id.keys())))
def load_data(path):
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

In [3]:
class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='unk'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk
        
        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample
    
    # Auxiliary methods
    
    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]
    
    def mapping_seq(self, data, mapper): # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape 
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True) 
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]
    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])
    
    src_utt = src_utt.to(device) # We load the Tensor on our seleceted device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)
    
    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item

In [4]:
def prepare_data(dataset):
    train_raw = load_data(os.path.join('data', dataset, 'train.json'))
    test_raw = load_data(os.path.join('data', dataset, 'test.json'))
    dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))
    
    lang = Lang(train_raw, dev_raw, test_raw)

    ##############################
    train_dataset = IntentsAndSlots(train_raw, lang)
    dev_dataset = IntentsAndSlots(dev_raw, lang)
    test_dataset = IntentsAndSlots(test_raw, lang)
    
    ##############################
    train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn,  shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)
    
    return train_loader, dev_loader, test_loader, lang

In [5]:
dataset = "ATIS"
train_loader, dev_loader, test_loader, lang = prepare_data(dataset)
hid_size = 200
emb_size = 300

lr = 0.0001 # learning rate
clip = 5 # Clip the gradient

out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
vocab_len = len(lang.word2id)

train_raw = load_data(os.path.join('data', dataset, 'train.json'))
test_raw = load_data(os.path.join('data', dataset, 'test.json'))
dev_raw = load_data(os.path.join('data', dataset, 'valid.json'))

In [6]:
class BERTClassification(nn.Module):
    def __init__ (self):
        super(BERTClassification, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert_drop = nn.Dropout(0.4)
        self.out = nn.Linear(768, 1)
        
    def forward(self, ids, mask, token_type_ids):
        _, pooledOut = self.bert(torch.tensor(ids), attention_mask = torch.tensor(mask),
                                token_type_ids=torch.tensor(token_type_ids), return_dict=False)
        bertOut = self.bert_drop(pooledOut)
        output = self.out(bertOut)
        
        return output

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_utterances = []
train_slots = []
train_intents = []
for element in train_raw:
    train_utterances.append(element['utterance'])
    train_slots.append(element['slots'])
    train_intents.append(element['intent'])
max_len_utt = max([len(x) for x in train_utterances])

valid_utterances = []
valid_slots = []
valid_intents = []
for element in dev_raw:
    valid_utterances.append(element['utterance'])
    valid_slots.append(element['slots'])
    valid_intents.append(element['intent'])
max_len_valid = max([len(x) for x in valid_utterances])

test_utterances = []
test_slots = []
test_intents = []
for element in test_raw:
    test_utterances.append(element['utterance'])
    test_slots.append(element['slots'])
    test_intents.append(element['intent'])
max_len_test = max([len(x) for x in test_utterances])

model = BERTClassification()
tokenized_valid = tokenizer(valid_utterances, padding=True)
res = model(tokenized_valid['input_ids'], tokenized_valid['attention_mask'], tokenized_valid['token_type_ids'])
print(tokenizer.decode(res))


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'