**!!! Edit -- Notebook settings -- GPU !!!**

# ATIS dataset import

In [1]:
# download ATIS testing data
!wget 'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/test.json'
# download ATIS training data
!wget 'https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/train.json'

--2020-11-01 20:09:00--  https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/test.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 788180 (770K) [text/plain]
Saving to: ‘test.json.2’


2020-11-01 20:09:00 (20.2 MB/s) - ‘test.json.2’ saved [788180/788180]

--2020-11-01 20:09:00--  https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4586495 (4.4M) [text/plain]
Saving to: ‘train.json.2’


2020-11-01

In [2]:
# reading downloaded json files and extracting the relevant data into pandas DataFrame

import os
import pandas as pd

def load_json(data_file):
  if os.path.isfile(data_file):
    with open(data_file, 'r') as read_file:
      lines = read_file.readlines()
      return [l.strip('\n').lstrip() for l in lines]
        
        
def get_dataframe(data):
  texts = []
  intents = []
    
  for line in data:
    if 'text' in line:
      texts.append(line[len("text\":  \""):-2])
    elif 'intent' in line:
      i = line[len("intent\":  \""):-2]

      # it there are more intents, only the first one is selected
      if '+' in i: i = i.split('+')[0]
            
      intents.append(i)

  return pd.DataFrame({'text':texts, 'intent':intents})

In [3]:
# getting training dataframe
df_train = get_dataframe(load_json('train.json'))
# getting testing dataframe
df_test = get_dataframe(load_json('test.json'))

In [4]:
# merging the train and test files together + shuffling
df = pd.concat([df_train, df_test]).reset_index(drop=True)

In [5]:
# making the 'intent' coulmn categorical
df['intent'] = pd.Categorical(df['intent'])
# adding a new column with category codes instead of strings
df['codes'] = df.intent.cat.codes

In [6]:
df.head()

Unnamed: 0,text,intent,codes
0,i want to fly from boston at 838 am and arrive...,flight,10
1,what flights are available from pittsburgh to ...,flight,10
2,what is the arrival time in san francisco for ...,flight_time,12
3,cheapest airfare from tacoma to orlando,airfare,2
4,round trip fares from pittsburgh to philadelph...,airfare,2


In [7]:
# transformes (BERT models) installation
!pip install transformers



In [8]:
import numpy as np
import time
from sklearn import metrics

from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import tensorflow as tf
import torch
from torch.utils.data import Dataset, DataLoader

# Tokenizer building

Selection based on the Hugging Face models: https://huggingface.co/models

In [9]:
# smallest version of BERT
MODEL_TYPE='google/bert_uncased_L-2_H-128_A-2'

# standard version of BERT
# MODEL_TYPE='bert-base-uncased'

In [10]:
# maximal length (in tokens) of a sentence
MAX_LEN = 128
# training batch size
TRAIN_BATCH_SIZE = 32
# valiadtion batch size
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE * 2
# number of training epochs
EPOCHS = 50
# learning rate
LEARNING_RATE = 2e-5

# tokenizer initialization
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [11]:
# the class handles data extraction
class CustomDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
    # tokenizer to tokenize the sentences
    self.tokenizer = tokenizer
    # dataset as dataframe
    self.data = dataframe
    # sentences
    self.text = dataframe.text
    # intent targets
    self.targets = self.data.codes
    # max length for tokenizer
    self.max_len = max_len

  # returns the length of the dataset
  def __len__(self):
    return len(self.text)

  # returns information about a sentence indexed by the index parameter
  def __getitem__(self, index):
    text = str(self.text[index])
    text = " ".join(text.split())

    # tokenization of the sentence
    inputs = self.tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.targets[index], dtype=torch.float)
    }

In [12]:
# ratio of the training set and validation set
train_size = 0.8
train_dataset = df.sample(frac=train_size)

# make testing dataset by dropping the "training rows"
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
# make training dataset
train_dataset = train_dataset.reset_index(drop=True)

# prints the sizes of the datasets
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# make classes that handle the data
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (5871, 3)
TRAIN Dataset: (4697, 3)
TEST Dataset: (1174, 3)


In [13]:
# training parameters
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

# validation parameters
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

# torch loaders for data handling
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [14]:
# set device to GPU
device = 'cuda'

In [15]:
# load the pretrained BERT classifier for sequences -> attentions need to be outputted for later visualizations
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, output_attentions=True, num_labels=len(np.unique(np.array(df.intent))))
# making use of the GPU
model.to(device)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, element

In [16]:
# loss function defition
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

In [17]:
# optimizer for training
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [18]:
# training epoch definition
def train(epoch):
  model.train()

  # for counting the average accuracy on epoch data
  accuracy = 0
  # for counting the average loss on epoch data
  loss_cnt = 0

  for _, data in enumerate(training_loader, 0):
    # extracting batch data
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.float)

    # extracting predicted logits
    outputs = model(ids, mask, token_type_ids)[0]

    # selecting the predicted sentiment from the logits
    _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
    # formatting the batch targets = true sentiment
    _targets = np.array(targets.cpu().detach().numpy().tolist())
    
    # adding batch accuracy
    accuracy += metrics.accuracy_score(_targets, _outputs)

    optimizer.zero_grad()
    # calculating the batch loss
    loss = loss_fn(outputs, targets)
    # adding batch loss
    loss_cnt += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # print epoch results
  print(f'Epoch: {epoch + 1}, Loss: {loss_cnt/len(training_loader)}, Accuracy: {accuracy/len(training_loader)}')

In [19]:
# training loop
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 1, Loss: 2.3364123106002808, Accuracy: 0.6891496598639456
Epoch: 2, Loss: 1.7811467444815603, Accuracy: 0.7409268707482993
Epoch: 3, Loss: 1.446283881356116, Accuracy: 0.7408673469387755
Epoch: 4, Loss: 1.243121866060763, Accuracy: 0.7409268707482993
Epoch: 5, Loss: 1.0569073462567362, Accuracy: 0.770017006802721
Epoch: 6, Loss: 0.9231394411755257, Accuracy: 0.8026955782312926
Epoch: 7, Loss: 0.8220816080262061, Accuracy: 0.8325850340136054
Epoch: 8, Loss: 0.7420371898583004, Accuracy: 0.8595748299319728
Epoch: 9, Loss: 0.675845709787745, Accuracy: 0.8775595238095238
Epoch: 10, Loss: 0.6116389989041958, Accuracy: 0.8972619047619048
Epoch: 11, Loss: 0.5600735523871013, Accuracy: 0.911658163265306
Epoch: 12, Loss: 0.5077246048418033, Accuracy: 0.9224999999999999
Epoch: 13, Loss: 0.45826152919912017, Accuracy: 0.932763605442177
Epoch: 14, Loss: 0.42126902795973276, Accuracy: 0.9377465986394558
Epoch: 15, Loss: 0.38330657861265194, Accuracy: 0.9423894557823129
Epoch: 16, Loss: 0.353

In [20]:
def validation(epoch):
  model.eval()
  
  # for batch accuracy calculation
  accuracy = 0

  with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
      # extracting batch data
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)

      # extracting predicted logits
      outputs = model(ids, mask, token_type_ids)[0]
      
      # selecting the predicted sentiment from the logits
      _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
      # formatting the batch targets = true sentiment
      _targets = np.array(targets.cpu().detach().numpy().tolist())
      
      # adding batch accuracy
      accuracy += metrics.accuracy_score(_targets, _outputs)

  return accuracy/len(testing_loader)

In [21]:
# for counting the overall average validation accuracy
accuracy = 0

# validation loop
for epoch in range(EPOCHS):
    accuracy += validation(epoch)

# print validation accuracy
print(f'Validation accuracy: {accuracy/EPOCHS}')

Validation accuracy: 0.9745125598086116


# Attention visualization

In [22]:
# download the bertviz module
import sys

!rm -r bertviz_repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path: sys.path += ['bertviz_repo']

Cloning into 'bertviz_repo'...
remote: Enumerating objects: 1074, done.[K
remote: Total 1074 (delta 0), reused 0 (delta 0), pack-reused 1074[K
Receiving objects: 100% (1074/1074), 99.41 MiB | 26.79 MiB/s, done.
Resolving deltas: 100% (687/687), done.


In [23]:
# function for attention visualization in the cell
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [24]:
# import bertviz
from bertviz import head_view

In [25]:
# tokenizes selected sentence
def process_sentence(sentence):
  # tokenize sentence
  inputs = tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            return_token_type_ids=True
        )

  # extract data from tokenization
  ids = torch.tensor([inputs['input_ids']], dtype=torch.long)
  mask = torch.tensor([inputs['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([inputs['token_type_ids']], dtype=torch.long)

  # calculate BERT output
  output = model(ids, mask, token_type_ids)

  return output, ids

In [26]:
# the visualization might not appear during the first exection, but it will with the following cell execution
# this handles the error message
first_run = True

In [27]:
# ####################################################################################
#   THE FOLLOWING CODE SELECTS RANDOM SENTIMENT AND RANDOM SENTENCE FROM THE DATASET 
# ####################################################################################


# model needs to run on CPU to visualize
model = model.to('cpu')

# array of all available intents in the dataset
intent_categ = np.unique(np.array(df.intent))
# selecting random intent for visualization
intent_index = np.random.randint(0, len(intent_categ))

# extracting the senetences with the selected intent
text_extract = df[df.intent == intent_categ[intent_index]].text
# selecting random sentence index from the extracted set
text_index = np.random.randint(0, len(text_extract))

# selecting the random sentence
sentence = text_extract.iloc[text_index]

# processing the sentence (tokenization mainly)
output, ids = process_sentence(sentence)

# selecting the result of the model (predicted intent)
result = np.argmax(np.array(output[0].detach().numpy())[0])

# print results (intent frequency - occurencies in the dataset)
print('Selected intent:', intent_categ[intent_index], ', intent frequency:', len(text_extract),'/',len(df))
print('Predicted intent:', np.array(intent_categ)[result])
print()
print('Selected sentence:', sentence)
print()

# ------------------------------------------------------------
# VISUALIZATION SECTION

# saving the attention output
attention = output[-1]

input_id_list = ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

# visualization module
head_view(attention, tokens)

if first_run:
  print('Reload the cell if the visualization didn\'t appear.')
  first_run = False

Selected intent: flight_no , intent frequency: 21 / 5871
Predicted intent: flight_no

Selected sentence: i need the flight numbers of flights leaving from cleveland and arriving at dallas



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Reload the cell if the visualization didn't appear.
