**!!! Edit -- Notebook settings -- GPU !!!**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# MultiWOZ dataset import

In [2]:
!wget 'https://www.repository.cam.ac.uk/bitstream/handle/1810/294507/MULTIWOZ2.1.zip'

--2021-01-11 10:24:27--  https://www.repository.cam.ac.uk/bitstream/handle/1810/294507/MULTIWOZ2.1.zip
Resolving www.repository.cam.ac.uk (www.repository.cam.ac.uk)... 131.111.98.67
Connecting to www.repository.cam.ac.uk (www.repository.cam.ac.uk)|131.111.98.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13794372 (13M) [application/zip]
Saving to: ‘MULTIWOZ2.1.zip’


2021-01-11 10:24:29 (11.7 MB/s) - ‘MULTIWOZ2.1.zip’ saved [13794372/13794372]



In [3]:
!unzip 'MULTIWOZ2.1.zip'

Archive:  MULTIWOZ2.1.zip
   creating: MULTIWOZ2.1/
  inflating: MULTIWOZ2.1/train_db.json  
   creating: __MACOSX/
   creating: __MACOSX/MULTIWOZ2.1/
  inflating: __MACOSX/MULTIWOZ2.1/._train_db.json  
  inflating: MULTIWOZ2.1/testListFile.json  
  inflating: __MACOSX/MULTIWOZ2.1/._testListFile.json  
  inflating: MULTIWOZ2.1/.DS_Store   
  inflating: __MACOSX/MULTIWOZ2.1/._.DS_Store  
  inflating: MULTIWOZ2.1/police_db.json  
  inflating: __MACOSX/MULTIWOZ2.1/._police_db.json  
  inflating: MULTIWOZ2.1/ontology.json  
  inflating: __MACOSX/MULTIWOZ2.1/._ontology.json  
  inflating: MULTIWOZ2.1/dialogue_acts.json  
  inflating: __MACOSX/MULTIWOZ2.1/._dialogue_acts.json  
  inflating: MULTIWOZ2.1/data.json   
  inflating: __MACOSX/MULTIWOZ2.1/._data.json  
  inflating: MULTIWOZ2.1/taxi_db.json  
  inflating: __MACOSX/MULTIWOZ2.1/._taxi_db.json  
  inflating: MULTIWOZ2.1/README.json  
  inflating: __MACOSX/MULTIWOZ2.1/._README.json  
  inflating: MULTIWOZ2.1/restaurant_db.json  
  infla

In [4]:
# reading downloaded json files and extracting the relevant data into pandas DataFrame

import os
import json
import re
import pandas as pd
import random

def preload_data():
    # from MultiWOZ-Parser; reading the names of the files for training, testing, validation datasets
    # https://github.com/jojonki/MultiWOZ-Parser/blob/master/parser.py

    def load_json(data_file):
        if os.path.isfile(data_file):
            with open(data_file, 'r') as read_file:
                data = json.load(read_file)
                return data

    def load_list_file(list_file):
        with open(list_file, 'r') as read_file:
            dialog_id_list = read_file.readlines()
            dialog_id_list = [l.strip('\n') for l in dialog_id_list]
            return dialog_id_list
        return
    
    # extracts the utterances from the MultiWOZ dataset
    def get_utterances(data):
        utterances = []

        for block in data:
            data = block['log']

            for ut in data:
                # replacing whitespace characters with spaces
                text = re.sub("\\s", " ", ut['text'])
                text = re.sub("[^a-zA-Z0-9 ]+", "", ut['text'])

                utterances.append(text)

        return utterances
    
    def split_data(data):
        X = []
        Y = []
        for i in range(len(data)):
            tokens = data[i].split()

            if (i <= len(data)/2) and (len(tokens) > 4):
                # picking random point for splitting the conversation turn
                l = random.randrange(1, len(tokens) - 3)
                # splitting data
                X.append(' '.join(tokens[:l]))
                # adding 0 to the target list -> 0 -- interrupted turn 
                Y.append('interrupted')

            # second section of the dataset is made out of full utterances
            else:
                # adding the full uninterrupted conversation turn
                X.append(data[i])
                # adding 1 to the target list -> 1 -- uninterrupted turn 
                Y.append('finished')

        # shuffling the dataset
        c = list(zip(X, Y, data))
        random.shuffle(c)
        X, Y, data = zip(*c)

        return X,Y,data

    # extracting data
    dialog_data_file = './MULTIWOZ2.1/data.json'
    dialog_data = load_json(dialog_data_file)
    dialog_id_list = list(set(dialog_data.keys()))

    valid_list_file = './MULTIWOZ2.1/valListFile.json'
    test_list_file = './MULTIWOZ2.1/testListFile.json'

    valid_id_list = list(set(load_list_file(valid_list_file)))
    test_id_list = load_list_file(test_list_file)
    train_id_list = [did for did in dialog_id_list if did not in (valid_id_list + test_id_list)]

    train_data = [v for k, v in dialog_data.items() if k in train_id_list]
    valid_data = [v for k, v in dialog_data.items() if k in valid_id_list]
    test_data = [v for k, v in dialog_data.items() if k in test_id_list]
    
    # merging all datasets together
    data = train_data + valid_data + test_data
    utterances = get_utterances(data)
    
    X, Y, data_clean = split_data(utterances)
    
    return pd.DataFrame(data={'text': X, 'intent': Y}).sample(20000)

In [5]:
# getting the dataset as dataframe
df = preload_data()

In [6]:
# making the 'intent' coulmn categorical
df['intent'] = pd.Categorical(df['intent'])
# adding a new column with category codes instead of strings
df['codes'] = df.intent.cat.codes

In [7]:
df.head()

Unnamed: 0,text,intent,codes
466,Great when would you like the reservation and ...,finished,0
13895,Yes can you help me find a train departing Cam...,finished,0
92865,Can I get the postcode of that place,finished,0
32334,Okay I can help you with that Where will you b...,finished,0
133062,Where would you like to depart from,finished,0


In [8]:
# transformes (BERT models) installation
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 14.5MB/s eta 0:00:01[K     |▍                               | 20kB 16.5MB/s eta 0:00:01[K     |▋                               | 30kB 10.0MB/s eta 0:00:01[K     |▉                               | 40kB 8.8MB/s eta 0:00:01[K     |█                               | 51kB 5.4MB/s eta 0:00:01[K     |█▎                              | 61kB 6.0MB/s eta 0:00:01[K     |█▌                              | 71kB 6.2MB/s eta 0:00:01[K     |█▊                              | 81kB 6.7MB/s eta 0:00:01[K     |██                              | 92kB 6.2MB/s eta 0:00:01[K     |██▏                             | 102kB 5.3MB/s eta 0:00:01[K     |██▍                             | 112kB 5.3MB/s eta 0:00:01[K     |██▋                             | 122kB 5.

In [9]:
import numpy as np
import time
from sklearn import metrics

from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import tensorflow as tf
import torch
from torch.utils.data import Dataset, DataLoader

# Tokenizer building

Selection based on the Hugging Face models: https://huggingface.co/models

In [10]:
# smallest version of BERT
# MODEL_TYPE='google/bert_uncased_L-2_H-128_A-2'

# standard version of BERT
MODEL_TYPE='bert-base-uncased'

In [11]:
# maximal length (in tokens) of a sentence
MAX_LEN = 128
# training batch size
TRAIN_BATCH_SIZE = 32
# valiadtion batch size
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE * 2
# number of training epochs
EPOCHS = 10
# learning rate
LEARNING_RATE = 2e-5

# tokenizer initialization
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [12]:
# the class handles data extraction
class CustomDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
    # tokenizer to tokenize the sentences
    self.tokenizer = tokenizer
    # dataset as dataframe
    self.data = dataframe
    # sentences
    self.text = dataframe.text
    # intent targets
    self.targets = self.data.codes
    # max length for tokenizer
    self.max_len = max_len

  # returns the length of the dataset
  def __len__(self):
    return len(self.text)

  # returns information about a sentence indexed by the index parameter
  def __getitem__(self, index):
    text = str(self.text[index])
    text = " ".join(text.split())

    # tokenization of the sentence
    inputs = self.tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.targets[index], dtype=torch.float)
    }

In [13]:
# ratio of the training set and validation set
train_size = 0.8
train_dataset = df.sample(frac=train_size)

# make testing dataset by dropping the "training rows"
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
# make training dataset
train_dataset = train_dataset.reset_index(drop=True)

# prints the sizes of the datasets
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# make classes that handle the data
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (20000, 3)
TRAIN Dataset: (16000, 3)
TEST Dataset: (4000, 3)


In [14]:
# training parameters
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

# validation parameters
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

# torch loaders for data handling
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
# set device to GPU
device = 'cuda'

In [16]:
# loss function defition
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

In [17]:
# training epoch definition
def train(epoch):
  model.train()

  # for counting the average accuracy on epoch data
  accuracy = 0
  # for counting the average loss on epoch data
  loss_cnt = 0

  for _, data in enumerate(training_loader, 0):
    # extracting batch data
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.float)

    # extracting predicted logits
    outputs = model(ids, mask, token_type_ids)[0]

    # selecting the predicted sentiment from the logits
    _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
    # formatting the batch targets = true sentiment
    _targets = np.array(targets.cpu().detach().numpy().tolist())

    # adding batch accuracy
    accuracy += metrics.accuracy_score(_targets, _outputs)

    optimizer.zero_grad()
    # calculating the batch loss
    loss = loss_fn(outputs, targets)
    # adding batch loss
    loss_cnt += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # print epoch results
  print(f'Epoch: {epoch + 1}, Loss: {loss_cnt/len(training_loader)}, Accuracy: {accuracy/len(training_loader)}')

In [18]:
!mkdir "ued_model"

In [19]:
LOAD_SAVED=True

MODEL_DIR="ued_model"
SAVED_DIR="drive/My Drive/Colab Notebooks/AlquistAI/"

if LOAD_SAVED:
  !cp "drive/My Drive/Colab Notebooks/AlquistAI/ued_model.zip" "./"
  !unzip "ued_model.zip"

  model = BertForSequenceClassification.from_pretrained("ued_model", local_files_only=True)

  # making use of the GPU
  # model.to(device)

else:
  # load the pretrained BERT classifier for sequences -> attentions need to be outputted for later visualizations
  model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, output_attentions=True, num_labels=len(np.unique(np.array(df.intent))))
  # making use of the GPU
  model.to(device)

  # optimizer for training
  optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

  # training loop
  for epoch in range(EPOCHS):
      train(epoch)

  # save model
  !mkdir "ued_model"
  model.save_pretrained(MODEL_DIR)
  !zip -r "ued_model.zip" "ued_model"
  !cp "ued_model.zip" "drive/My Drive/Colab Notebooks/AlquistAI/"

Archive:  ued_model.zip
  inflating: ued_model/config.json   
  inflating: ued_model/pytorch_model.bin  


In [20]:
def validation(epoch):
  model.eval()
  
  # for batch accuracy calculation
  accuracy = 0

  with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
      # extracting batch data
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)

      # extracting predicted logits
      outputs = model(ids, mask, token_type_ids)[0]
      
      # selecting the predicted sentiment from the logits
      _outputs = [np.argmax(i) for i in outputs.cpu().detach().numpy()]
      # formatting the batch targets = true sentiment
      _targets = np.array(targets.cpu().detach().numpy().tolist())
      
      # adding batch accuracy
      accuracy += metrics.accuracy_score(_targets, _outputs)

  return accuracy/len(testing_loader)

In [21]:
if not LOAD_SAVED:  
  # for counting the overall average validation accuracy
  accuracy = 0

  # validation loop
  for epoch in range(EPOCHS):
      accuracy += validation(epoch)

  # print validation accuracy
  print(f'Validation accuracy: {accuracy/EPOCHS}')

# Attention visualization

In [22]:
# download the bertviz module
import sys

!rm -r bertviz_repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path: sys.path += ['bertviz_repo']

rm: cannot remove 'bertviz_repo': No such file or directory
Cloning into 'bertviz_repo'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 1151 (delta 3), reused 8 (delta 3), pack-reused 1140[K
Receiving objects: 100% (1151/1151), 130.05 MiB | 24.34 MiB/s, done.
Resolving deltas: 100% (728/728), done.


In [23]:
# function for attention visualization in the cell
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [24]:
# import bertviz
from bertviz import head_view

In [25]:
# tokenizes selected sentence
def process_sentence(sentence):
  # tokenize sentence
  inputs = tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            return_token_type_ids=True
        )

  # extract data from tokenization
  ids = torch.tensor([inputs['input_ids']], dtype=torch.long)
  mask = torch.tensor([inputs['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([inputs['token_type_ids']], dtype=torch.long)

  # calculate BERT output
  output = model(ids, mask, token_type_ids)

  return output, ids

In [26]:
# the visualization might not appear during the first exection, but it will with the following cell execution
# this handles the error message
first_run = True

In [50]:
# ####################################################################################
#   THE FOLLOWING CODE SELECTS RANDOM SENTIMENT AND RANDOM SENTENCE FROM THE DATASET 
# ####################################################################################


# model needs to run on CPU to visualize
model = model.to('cpu')

# array of all available intents in the dataset
intent_categ = np.unique(np.array(df.intent))
# selecting random intent for visualization
intent_index = np.random.randint(0, len(intent_categ))

# extracting the senetences with the selected intent
text_extract = df[df.intent == intent_categ[intent_index]].text
# selecting random sentence index from the extracted set
text_index = np.random.randint(0, len(text_extract))

# selecting the random sentence
sentence = text_extract.iloc[text_index]

# processing the sentence (tokenization mainly)
output, ids = process_sentence(sentence)

# selecting the result of the model (predicted intent)
result = np.argmax(np.array(output[0].detach().numpy())[0])

# print results (intent frequency - occurencies in the dataset)
print('Selected intent:', intent_categ[intent_index], ', intent frequency:', len(text_extract),'/',len(df))
print('Predicted intent:', np.array(intent_categ)[result])
print()
print('Selected sentence:', sentence)
print()

# ------------------------------------------------------------
# VISUALIZATION SECTION

# saving the attention output
attention = output[-1]

input_id_list = ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

# visualization module
head_view(attention, tokens)

if first_run:
  print('Reload the cell if the visualization didn\'t appear.')
  first_run = False

Selected intent: interrupted , intent frequency: 9548 / 20000
Predicted intent: interrupted

Selected sentence: Yes I am looking for



<IPython.core.display.Javascript object>

# Analysis

In [28]:
import matplotlib.pyplot as plt
from collections import Counter

In [29]:
def get_top_index(attention):
  n_layers = np.array(attention).shape[0]
  n_heads = attention[0].detach().numpy()[0].shape[0]
  n_tokens = attention[0].detach().numpy()[0].shape[1]

  sum_cls = np.zeros(n_tokens)

  for layer in range(n_layers):
    layer_output = attention[layer].detach().numpy()[0]

    for head in range(n_heads):
      sum_cls += layer_output[head][0]

  # average attention
  avg_cls = sum_cls/(n_layers * n_heads)

  # index of the token with the most attention
  top_index = np.argmax(avg_cls)

  # index of the token with the second most attention
  second_top_index = np.argsort(-avg_cls)[1]

  return top_index, second_top_index

In [30]:
analysis_data = test_dataset

In [31]:
# stores the n-th number from the end ("I would like to" -- "to" -> 1, "like" -> 2, "would" -> 3, ...)
interrupted_position = []
# second most important position
interrupted_position_2 = []
# the token with the most attention
interrupted_token = []
# the bigram at the end of an utterance -> without [SEP] and [CLS]
interrupted_bigram = []
# number of tokens in a sentence
interrupted_token_n = []

finished_position = []
finished_position_2 = []
finished_token = []
finished_token_n = []
finished_bigram = []

for i in range(len(analysis_data)):
  sentence = analysis_data.iloc[i].text

  output, ids = process_sentence(sentence)

  result = np.argmax(np.array(output[0].detach().numpy())[0])

  attention = output[-1]

  input_id_list = ids[0].tolist()
  tokens = tokenizer.convert_ids_to_tokens(input_id_list)

  top_index, second_top_index = get_top_index(attention)

  # classified as interrupted
  if result == 1:
    interrupted_position.append(abs(top_index - len(tokens)))
    interrupted_position_2.append(abs(second_top_index - len(tokens)))
    interrupted_token.append(tokens[top_index])
    if tokens[-3] != '[CLS]':
      interrupted_bigram.append(tokens[-3] + ' ' + tokens[-2])
    interrupted_token_n.append(len(tokens))

  # classified as finished
  else:
    finished_position.append(abs(top_index - len(tokens)))
    finished_position_2.append(abs(second_top_index - len(tokens)))
    finished_token.append(tokens[top_index])
    if tokens[-3] != '[CLS]':
      finished_bigram.append(tokens[-3] + ' ' + tokens[-2])
    finished_token_n.append(len(tokens))

# UED based on words only

Word selection based on training dataset

### Last word

In [32]:
n_interrupted_sentences = len(train_dataset[train_dataset.intent=='interrupted'])

last_words = []

for sentence_index in range(n_interrupted_sentences):
  word = train_dataset[train_dataset.intent=='interrupted'].text.iloc[sentence_index].split()[-1].lower()
  last_words.append(word)

In [33]:
best_param = 0
best_acc = 0

classification_true = list(test_dataset.codes)

n_test_sentences = len(test_dataset)

for TOP_PARAMETER in np.arange(1,250,1):
  top_last_words = [x[0] for x in Counter(last_words).most_common(TOP_PARAMETER)]

  # ------------------------------------------------------------------------------

  classification = []

  for sentence_index in range(n_test_sentences):
    last_word = test_dataset.text.iloc[sentence_index].split()[-1].lower()

    if last_word in top_last_words:
      classification.append(1)
    else:
      classification.append(0)

  # ------------------------------------------------------------------------------

  acc = metrics.accuracy_score(classification_true, classification)

  # ------------------------------------------------------------------------------

  print(TOP_PARAMETER, acc)

  if acc > best_acc:
    best_acc = acc
    best_param = TOP_PARAMETER

print(best_acc, best_param)

1 0.54175
2 0.53175
3 0.551
4 0.56125
5 0.57275
6 0.5785
7 0.59175
8 0.60125
9 0.61
10 0.6155
11 0.622
12 0.6315
13 0.63225
14 0.63425
15 0.6425
16 0.6505
17 0.65625
18 0.663
19 0.66975
20 0.6765
21 0.6835
22 0.685
23 0.6885
24 0.691
25 0.69
26 0.692
27 0.69575
28 0.70025
29 0.70225
30 0.7045
31 0.70825
32 0.6865
33 0.6885
34 0.69075
35 0.693
36 0.69175
37 0.6915
38 0.69325
39 0.696
40 0.69875
41 0.7015
42 0.70325
43 0.70525
44 0.70675
45 0.69925
46 0.699
47 0.69025
48 0.68275
49 0.68325
50 0.68375
51 0.67475
52 0.6765
53 0.674
54 0.66775
55 0.66975
56 0.66825
57 0.66975
58 0.665
59 0.66575
60 0.666
61 0.66875
62 0.67
63 0.672
64 0.67325
65 0.672
66 0.67325
67 0.6735
68 0.66725
69 0.6705
70 0.67
71 0.67175
72 0.67275
73 0.6625
74 0.66325
75 0.666
76 0.66725
77 0.66475
78 0.6655
79 0.66525
80 0.64625
81 0.648
82 0.638
83 0.638
84 0.64025
85 0.63925
86 0.637
87 0.63125
88 0.63075
89 0.632
90 0.6335
91 0.634
92 0.63375
93 0.63425
94 0.6355
95 0.63675
96 0.63675
97 0.63775
98 0.63975
99 0.

In [34]:
print('train:',len(train_dataset[train_dataset.codes==0]), len(train_dataset[train_dataset.codes==1]))
print('test:',len(test_dataset[test_dataset.codes==0]), len(test_dataset[test_dataset.codes==1]))

train: 8389 7611
test: 2063 1937


### Last 2 words

In [35]:
last_2_words = []

for sentence_index in range(n_interrupted_sentences):
  sentence = train_dataset[train_dataset.intent=='interrupted'].text.iloc[sentence_index]

  if len(sentence.split()) >= 2:
    word_1 = sentence.split()[-1].lower()
    word_2 = sentence.split()[-2].lower()
      
    bigram = word_1 + " " + word_2

    last_2_words.append(bigram)

In [36]:
best_param = 0
best_acc = 0

for TOP_PARAMETER in np.arange(1,250,1):
  top_last_2_words = [x[0] for x in Counter(last_2_words).most_common(TOP_PARAMETER)]

  # ------------------------------------------------------------------------------

  classification = []
  classification_true = []

  for sentence_index in range(n_test_sentences):
    sentence = test_dataset.text.iloc[sentence_index]

    if len(sentence.split()) >= 2:
      word_1 = sentence.split()[-1].lower()
      word_2 = sentence.split()[-2].lower()      
      bigram = word_1 + " " + word_2

      if bigram in top_last_2_words:
        classification.append(1)
      else:
        classification.append(0)

      classification_true.append(test_dataset.codes.iloc[sentence_index])

  # ------------------------------------------------------------------------------

  acc = metrics.accuracy_score(classification_true, classification)

  # ------------------------------------------------------------------------------

  if acc > best_acc:
    best_acc = acc
    best_param = TOP_PARAMETER

print(best_acc, best_param)

0.6341599331290053 156


### Logistic regression

In [37]:
N_LAST_WORDS=1

logreg_tokens = []
logreg_classes = []

for index in range(len(train_dataset)):
  row = train_dataset.iloc[index]

  row_class = int(row.codes)
  row_sentence = ' '.join(row.text.lower().split()[-N_LAST_WORDS:])

  if len(row.text.lower().split()) >= N_LAST_WORDS:
    _, ids = process_sentence(row_sentence)
    tokens = ids[0].tolist()[:-1][1:]

    logreg_tokens.append(tokens)
    logreg_classes.append(row_class)

In [38]:
max_len = 50

# finding the longest sequence (tokenizing of some words produces more tokens)
# for seq in logreg_tokens: max_len = max(len(seq), max_len)

# padding tokens
logreg_tokens = [np.hstack([tokens, np.zeros(max_len - len(tokens))]) for tokens in logreg_tokens]

In [39]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeCV

token_model = LogisticRegression(random_state=12122020, max_iter=5000).fit(logreg_tokens, logreg_classes)

In [40]:
print(
  list(token_model.predict(logreg_tokens)).count(0),
  list(token_model.predict(logreg_tokens)).count(1)
)

5433 10567


In [41]:
train_acc = metrics.accuracy_score(token_model.predict(logreg_tokens), logreg_classes)

print('training accuracy:', train_acc)

training accuracy: 0.618625


In [42]:
logreg_test_tokens = []
logreg_test_classes = []

for index in range(len(test_dataset)):
  row = test_dataset.iloc[index]

  row_class = int(row.codes)
  row_sentence = ' '.join(row.text.lower().split()[-N_LAST_WORDS:])

  if len(row.text.lower().split()) >= N_LAST_WORDS:
    _, ids = process_sentence(row_sentence)
    tokens = ids[0].tolist()[:-1][1:]

    logreg_test_tokens.append(tokens)
    logreg_test_classes.append(row_class)

# padding tokens
logreg_test_tokens = [np.hstack([tokens, np.zeros(max_len - len(tokens))]) for tokens in logreg_test_tokens]

In [43]:
test_acc = metrics.accuracy_score(token_model.predict(logreg_test_tokens), logreg_test_classes)

print('testing accuracy:', test_acc)

testing accuracy: 0.614
