In [1]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#change current working directory
#%cd "/content/drive/MyDrive/1_dialog_act"

In [3]:
!pip install transformers
!pip install accelerate
#!pip install datasets
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig
from transformers import AdamW
from transformers import EarlyStoppingCallback, IntervalStrategy
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.preprocessing import normalize
from itertools import product
import json
import shutil

import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

#import nltk

#nltk.download('punkt')

import spacy

# Load the spaCy "en_core_web_lg" pipeline once at import time; parse() further down
# calls it to lemmatize sentences before prediction.
nlp = spacy.load("en_core_web_lg")

#sys.path.append(os.path.abspath('../../'))
#from util import generate_metrics_latex_table



## Parameters

In [5]:
# Directory for locally saved checkpoints (currently only referenced by the
# commented-out best_model_path below).
SAVE_MODEL_TO_PATH = "./saved_models/"
# Alternative relative locations of the preprocessed splits
# (presumably for running outside Kaggle — confirm before re-enabling):
#TRAIN_DATA_SAVE_PATH = "utterances_act_types/train.json"
#TEST_DATA_SAVE_PATH = "utterances_act_types/test.json"
#VALID_DATA_SAVE_PATH = "utterances_act_types/valid.json"
# JSON files holding the preprocessed train / test / validation splits.
TRAIN_DATA_SAVE_PATH = "/kaggle/input/preprocessed-data-question-tags-no-none/train.json"
TEST_DATA_SAVE_PATH = "/kaggle/input/preprocessed-data-question-tags-no-none/test.json"
VALID_DATA_SAVE_PATH = "/kaggle/input/preprocessed-data-question-tags-no-none/valid.json"

#best_model_path = SAVE_MODEL_TO_PATH + './best_model_dialog_act.pt'

# Checkpoint file that is loaded if present and written to when training improves.
# NOTE(review): this points into /kaggle/input, which is presumably read-only —
# confirm that saving a new checkpoint here can succeed.
best_model_path = '/kaggle/input/saved-models/2_2_model_question_tags_no_none.pt'

# Hugging Face checkpoint name to download for each supported model family.
PRETRAINED_MODELS = {
    'bert': 'bert-large-uncased',
    'roberta': 'roberta-base',
    'xlnet': 'xlnet-large-cased',
    'xlm': 'xlm-mlm-en-2048',
    'distilbert': 'distilbert-base-uncased',
    'albert':'albert-base-v2'
}

# (model class, tokenizer class, config class) triple for each model family.
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    'albert':(AlbertForSequenceClassification,AlbertTokenizer, AlbertConfig)
}

# Model family used for this run; must be a key of both dictionaries above.
MODEL_TYPE = 'roberta'
PRETRAINED_MODEL_NAME = PRETRAINED_MODELS[MODEL_TYPE]

# Classes selected for MODEL_TYPE; the rest of the notebook refers to these names.
model_class, tokenizer_class, config_class = MODEL_CLASSES[MODEL_TYPE]

# Optimisation hyper-parameters.
LEARNING_RATE = 1e-5   # passed to the optimizer as lr
BATCH_SIZE = 32        # batch size of all three DataLoaders
EPOCHS = 50            # value handed to train_model() as num_epochs
WEIGHT_DECAY = 0.01    # passed to the optimizer as weight_decay

## Loading preprocessed data

In [6]:
def save_variable_to_json(variable, file_path):
    """Serialize ``variable`` as JSON into ``file_path``, replacing any existing file.

    Args:
        variable: Any JSON-serializable Python object.
        file_path: Destination path; opened in text write mode.
    """
    with open(file_path, mode='w') as out_handle:
        json.dump(variable, fp=out_handle)

def load_variable_from_json(file_path):
    """Read the JSON document stored at ``file_path`` and return the decoded object.

    Args:
        file_path: Path of the JSON file; opened in text read mode.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(file_path, mode='r') as in_handle:
        return json.load(in_handle)

In [7]:
# Load the cached splits. Each JSON file is unpacked into a pair, so it is expected
# to hold exactly two top-level items: the utterances and their label lists.
try:
    X_train, y_train = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    X_test, y_test = load_variable_from_json(TEST_DATA_SAVE_PATH)
    X_valid, y_valid = load_variable_from_json(VALID_DATA_SAVE_PATH)
except (OSError, ValueError, TypeError) as load_error:
    # OSError: file missing or unreadable.
    # ValueError: malformed JSON (json.JSONDecodeError) or wrong number of items to unpack.
    # TypeError: decoded JSON is not something that can be unpacked into a pair.
    # A bare `except:` would also swallow KeyboardInterrupt and hide the actual cause.
    print('No saved data found.')
    print('Reason:', repr(load_error))
else:
    print('Data loaded from file.')

Data loaded from file.


In [8]:
# Utterance with the most whitespace-separated words across all three splits.
longest_train_data = max(
    X_train + X_test + X_valid,
    key=lambda utterance: len(utterance.split()),
)
print('Longest utterance length:', len(longest_train_data.split()))

# Label vocabulary: every distinct act occurring in the training labels, sorted so
# the column order of the encoded label matrix is stable between runs.
all_labels = sorted({act for act_list in y_train for act in act_list})
num_labels = len(all_labels)
print('Labels:', all_labels)


Longest utterance length: 121
Labels: ['hotel-address', 'hotel-area', 'hotel-internet', 'hotel-name', 'hotel-parking', 'hotel-phone', 'hotel-postcode', 'hotel-pricerange', 'hotel-ref', 'hotel-stars', 'hotel-type', 'restaurant-address', 'restaurant-area', 'restaurant-food', 'restaurant-name', 'restaurant-phone', 'restaurant-postcode', 'restaurant-pricerange', 'restaurant-ref']


## Tokenizing and creating dataloader

In [9]:
# Encoder that turns each list of label strings into a fixed-width binary vector
# (one column per entry of all_labels, several columns may be 1 for one utterance).
# Passing `classes` explicitly pins the column order to all_labels.
mlb = MultiLabelBinarizer(classes=list(all_labels))
mlb.fit(y_train)

In [10]:
# Load the pre-trained tokenizer that belongs to the selected MODEL_TYPE
# (tokenizer_class / PRETRAINED_MODEL_NAME come from the parameters cell).
# Only the tokenizer is loaded here; the model itself is built later in BERTClass.
tokenizer = tokenizer_class.from_pretrained(PRETRAINED_MODEL_NAME)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
# Size the padded sequence length from the largest *token* count over all splits.
# The utterance with the most whitespace words (longest_train_data) is not
# necessarily the one with the most sub-word tokens, and the special tokens added
# at encoding time count towards the length too, so measure the encoded length of
# every utterance directly.
longest_token_count = max(
    len(tokenizer.encode(utterance, add_special_tokens=True))
    for utterance in X_train + X_test + X_valid
)
# Round up to the next power of two, capped at 512.
max_length = min(2 ** (longest_token_count - 1).bit_length(), 512)
print('Max chosen length:', max_length)


Max chosen length: 256


In [12]:
class CustomDataset(Dataset):
    """Torch dataset pairing raw utterances with their binary label vectors.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus`` (Hugging Face style).
        X: Sequence of utterance strings.
        y: Sequence of label lists, aligned with ``X``.
        max_length: Every encoded utterance is padded / truncated to this many tokens.

    Each item is a dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
    (1-D tensors) and ``labels`` (float tensor built from the encoded label row).
    """

    def __init__(self, tokenizer, X, y, max_length):
        self.X = X
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Encode with the module-level binarizer, which has already been fitted on the
        # training labels. Only transform here: refitting it for every split
        # (fit_transform) mutates shared state that is later stored in the checkpoint.
        self.labels = mlb.transform(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Tokenize lazily, one utterance per access.
        encoding = self.tokenizer.encode_plus(
            self.X[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            # Must truncate: padding alone never shortens a sequence, and an item
            # longer than max_length cannot be stacked with the others in a batch.
            truncation=True,
            return_tensors='pt'
        )
        label = self.labels[idx]  # binary row produced by the label binarizer
        # return_tensors='pt' adds a leading batch dimension; flatten() drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float),
            'token_type_ids': encoding['token_type_ids'].flatten()
        }

In [13]:
# Fraction of every split to keep: 1 uses everything, a smaller value gives a
# reduced dataset for quick experiments.
p = 1
n_train_samples, n_test_samples, n_valid_samples = (
    int(len(split) * p) for split in (X_train, X_test, X_valid)
)

print("Train samples: ", n_train_samples)
print("Test samples: ", n_test_samples)
print("Valid samples: ", n_valid_samples)

Train samples:  4800
Test samples:  571
Valid samples:  591


In [14]:
# Wrap the first n_*_samples items of each split in a CustomDataset.
train_dataset = CustomDataset(
    tokenizer,
    X_train[:n_train_samples],
    y_train[:n_train_samples],
    max_length,
)
test_dataset = CustomDataset(
    tokenizer,
    X_test[:n_test_samples],
    y_test[:n_test_samples],
    max_length,
)
valid_dataset = CustomDataset(
    tokenizer,
    X_valid[:n_valid_samples],
    y_valid[:n_valid_samples],
    max_length,
)

In [15]:
# Only the training loader is shuffled. Validation and test data are only scored,
# so a fixed order keeps the returned targets/outputs aligned with the dataset
# order and makes evaluation runs reproducible.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU. `device` is a
# module-level name read by load_ckp(), valid() and train() and by the model setup.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Training

In [17]:
def load_ckp(checkpoint_fpath, model, mlb):
    """Restore model weights and label-binarizer parameters from a checkpoint.

    Args:
        checkpoint_fpath: Path of a file written by ``save_ckp``; expected to hold
            a dict with the keys ``'state_dict'`` and ``'mlb'``.
        model: Module whose ``load_state_dict`` receives the stored weights.
        mlb: Estimator whose ``set_params`` receives the stored parameter dict.

    Returns:
        The same ``(model, mlb)`` objects, updated in place.
    """
    # map_location moves stored tensors onto the module-level `device`.
    restored = torch.load(checkpoint_fpath, map_location=device)
    model.load_state_dict(restored['state_dict'])
    binarizer_params = restored['mlb']
    mlb.set_params(**binarizer_params)
    return model, mlb

def save_ckp(state, best_model_path):
    """Write ``state`` to ``best_model_path`` with ``torch.save``.

    Args:
        state: Checkpoint object to serialize (the training loop passes a dict with
            ``'state_dict'`` and ``'mlb'`` entries).
        best_model_path: Destination path. Anything that is not a str / PathLike
            (for example an open file object) is handed to ``torch.save`` untouched.
    """
    if isinstance(best_model_path, (str, os.PathLike)):
        # Create missing parent directories so the first save of a run does not
        # fail just because the target folder has not been created yet.
        parent_dir = os.path.dirname(os.fspath(best_model_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    torch.save(state, best_model_path)


In [18]:
class BERTClass(torch.nn.Module):
    """Sequence classifier: pretrained backbone followed by a small two-layer head.

    The backbone (``self.l1``) is the ``*ForSequenceClassification`` class selected
    by the module-level ``model_class`` and is created with ``num_labels`` outputs.
    The first element of its output (width ``num_labels``, since that is what
    ``pre_classifier`` consumes) goes through Linear(num_labels -> 768), ReLU,
    Dropout(0.3) and Linear(768 -> num_labels).

    Attribute names (``l1``, ``pre_classifier``, ``classifier``) are part of the
    saved ``state_dict`` keys and must not be renamed, or existing checkpoints
    stop loading.

    Args:
        pretrained_model_name: Checkpoint name/path given to ``from_pretrained``.
        num_labels: Number of output labels.
    """

    def __init__(self, pretrained_model_name, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.l1 = model_class.from_pretrained(pretrained_model_name, num_labels=self.num_labels)
        self.pre_classifier = torch.nn.Linear(self.num_labels, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw (pre-sigmoid) scores reshaped to ``(-1, num_labels)``.

        ``token_type_ids`` is only forwarded when the backbone's ``forward``
        declares that parameter; backbones without it would otherwise raise a
        TypeError on the unexpected keyword.
        """
        import inspect  # function-scope import keeps this block self-contained

        backbone_kwargs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if 'token_type_ids' in inspect.signature(self.l1.forward).parameters:
            backbone_kwargs['token_type_ids'] = token_type_ids

        backbone_output = self.l1(**backbone_kwargs)[0]
        hidden = torch.relu(self.pre_classifier(backbone_output))
        hidden = self.dropout(hidden)
        output = self.classifier(hidden)
        return output.view(-1, self.num_labels)  # Reshape the output

# Build the classifier and move its parameters to the selected device in one step
# (Module.to returns the module itself).
model = BERTClass(PRETRAINED_MODEL_NAME, num_labels).to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and float 0/1 targets.

    Args:
        outputs: Unnormalised scores (the sigmoid is applied inside the loss).
        targets: Float tensor with the same shape as ``outputs``.

    Returns:
        Scalar loss tensor.
    """
    criterion = torch.nn.functional.binary_cross_entropy_with_logits
    return criterion(outputs, targets)

# Module-level optimizer over all model parameters; train() reads this global directly.
# AdamW here is the class imported from `transformers`, which is where the
# `correct_bias` flag comes from.
# NOTE(review): newer transformers releases deprecate/remove this class in favour of
# torch.optim.AdamW, which has no `correct_bias` argument — confirm the installed version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, correct_bias=False)



In [20]:
def valid(model, valid_dataloader):
    """Score ``model`` on every batch of ``valid_dataloader`` without gradients.

    Args:
        model: Callable as ``model(input_ids, attention_mask, token_type_ids)``.
        valid_dataloader: Yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.

    Returns:
        Tuple ``(mean_loss, targets, probabilities)``: the per-batch loss averaged
        over the number of batches, and two lists with one row per sample holding
        the label vectors and the sigmoid outputs.
    """
    collected_targets, collected_probs = [], []
    running_loss = 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            ids, mask, type_ids = (
                batch[key].to(device, dtype=torch.long)
                for key in ('input_ids', 'attention_mask', 'token_type_ids')
            )
            targets = batch['labels'].to(device, dtype=torch.float)

            logits = model(ids, mask, type_ids)
            running_loss += loss_fn(logits, targets).item()

            collected_targets.extend(targets.cpu().detach().numpy().tolist())
            collected_probs.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())

    mean_loss = running_loss / len(valid_dataloader)
    return mean_loss, collected_targets, collected_probs

In [21]:
def train(model, train_dataloader):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        train_dataloader: Yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.

    Returns:
        Per-batch loss averaged over the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(train_dataloader):
        ids, mask, type_ids = (
            batch[key].to(device, dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )
        targets = batch['labels'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before the new backward pass.
        model.zero_grad()
        logits = model(ids, mask, type_ids)

        batch_loss = loss_fn(logits, targets)
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(train_dataloader)

In [22]:
def train_model(num_epochs, train_dataloader, valid_dataloader, model, optimizer, best_model_path, patience = 1):
    """Train with early stopping, checkpointing whenever validation loss improves.

    Args:
        num_epochs: Maximum number of epochs to run (epochs are numbered 1..num_epochs).
        train_dataloader: Batches handed to ``train``.
        valid_dataloader: Batches handed to ``valid``.
        model: Model to optimise; updated in place.
        optimizer: Accepted for interface compatibility. ``train`` currently steps
            the module-level ``optimizer``, so this argument is not used here.
        best_model_path: Where the best checkpoint is written via ``save_ckp``.
        patience: Stop after this many consecutive epochs without improvement.

    Returns:
        ``model`` with the weights of the last epoch that ran. The best-scoring
        weights are the ones stored at ``best_model_path``.
    """
    # np.inf is the portable spelling; the np.Inf alias was removed in NumPy 2.0.
    valid_loss_min = np.inf

    num_not_improved = 0
    # Inclusive upper bound so that exactly `num_epochs` epochs can run
    # (range(1, num_epochs) stopped one epoch short).
    for epoch in range(1, num_epochs + 1):
        print()
        print("#################### Epoch {}: Training Start    ####################".format(epoch))

        train_loss = train(model, train_dataloader)
        print('#################### Epoch {}: Training End      ####################'.format(epoch))

        print()
        print("#################### Epoch {}: Validation Start ####################".format(epoch))

        valid_loss, val_targets, val_outputs = valid(model, valid_dataloader)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(epoch, train_loss, valid_loss))

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))

            # Store the binarizer parameters next to the weights so load_ckp can
            # restore both together.
            checkpoint = {
                        'state_dict': model.state_dict(),
                        'mlb' : mlb.get_params()
                    }

            save_ckp(checkpoint, best_model_path)
            valid_loss_min = valid_loss
            num_not_improved = 0
        else:
            num_not_improved += 1
            if num_not_improved >= patience:
                print('Not improvement for more than:', num_not_improved)
                break
        print("#################### Epoch {}: Validation End   ####################".format(epoch))
        print()

    print("#################### Training finished     ####################")
    return model


In [23]:
# Reuse a saved checkpoint when it can be loaded; otherwise train from scratch.
try:
    trained_model, mlb = load_ckp(best_model_path, model, mlb)
except Exception as load_error:
    # `Exception` rather than a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) is not mistaken for a missing checkpoint and does not
    # kick off a full training run. The reason is printed to make a corrupt or
    # incompatible checkpoint distinguishable from an absent one.
    print('No saved model found. Need to be train from scratch.')
    print('Reason:', repr(load_error))
    trained_model = train_model(EPOCHS, train_dataloader, valid_dataloader, model, optimizer, best_model_path)


## Evaluation

In [24]:
# Score the test split: valid() returns the mean batch loss, the label vectors and
# the per-label sigmoid probabilities (one row per test sample).
test_loss, test_labels , test_predictions_probs = valid(trained_model, test_dataloader)


100%|██████████| 18/18 [00:06<00:00,  2.60it/s]


In [25]:
# A label counts as predicted when its probability is strictly above this cut-off.
threshold = 0.5
test_predictions = [
    [probability > threshold for probability in sample_probs]
    for sample_probs in test_predictions_probs
]

In [26]:
# Headline numbers: accuracy over full label vectors plus support-weighted
# precision, recall and F1.
print('Accuracy:', accuracy_score(test_labels, test_predictions))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(test_labels, test_predictions, average='weighted'))

# Per-label breakdown, rows named after the binarizer's classes.
report = classification_report(test_labels, test_predictions, target_names=mlb.classes_)
print(report)

Accuracy: 0.8353765323992994
Precision: 0.9373894479220644
Recall: 0.8848560700876095
F1: 0.9076853720656518
                       precision    recall  f1-score   support

        hotel-address       0.94      0.81      0.87        54
           hotel-area       0.89      0.63      0.74        27
       hotel-internet       1.00      0.78      0.88        32
           hotel-name       0.92      0.81      0.86        27
        hotel-parking       0.93      0.84      0.88        31
          hotel-phone       0.96      0.99      0.97        73
       hotel-postcode       0.91      0.91      0.91        55
     hotel-pricerange       1.00      0.85      0.92        34
            hotel-ref       0.98      1.00      0.99        44
          hotel-stars       0.00      0.00      0.00         7
           hotel-type       0.62      0.38      0.48        13
   restaurant-address       0.96      0.93      0.94       101
      restaurant-area       0.83      0.71      0.77        14
      re

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Generating report for LaTeX

In [27]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, coverage_error, label_ranking_average_precision_score, label_ranking_loss
import numpy as np
import pandas as pd

def generate_metrics_latex_table(model_name, task_number, true_labels, binary_predictions, prediction_probs, target_names, output_path='metrics.tex'):
    """Write a LaTeX table of multi-label classification metrics to a file.

    The table holds the per-class rows of ``classification_report`` followed by
    accuracy, micro-averaged precision/recall/F1, label ranking average precision,
    coverage error and ranking loss.

    Args:
        model_name: Used in the table caption and label.
        task_number: Used in the table caption and label.
        true_labels: Binary indicator matrix (samples x labels); array-like.
        binary_predictions: Thresholded predictions with the same shape.
        prediction_probs: Per-label scores with the same shape, for the ranking metrics.
        target_names: Display name of every label column.
        output_path: File the LaTeX source is written to (default ``'metrics.tex'``).
    """
    # Accept plain nested lists as well as arrays; .sum(axis=1) and .shape below
    # need an ndarray.
    true_labels = np.asarray(true_labels)

    report = classification_report(true_labels, binary_predictions, target_names=target_names, digits=3, output_dict=True)
    df = pd.DataFrame(report).transpose()
    df['support'] = df['support'].astype(int)
    df = df.rename({'precision': r'\textbf{Precision}', 'recall': r'\textbf{Recall}', 'f1-score': r'\textbf{F1-Score}', 'support': r'\textbf{Support}'}, axis=1)

    # Generating additional metrics
    accuracy = accuracy_score(true_labels, binary_predictions)
    precision, recall, f_score, _ = precision_recall_fscore_support(true_labels, binary_predictions, average='micro')

    # Calculating multilabel-specific metrics
    coverage_err = coverage_error(true_labels, prediction_probs)
    lrap = label_ranking_average_precision_score(true_labels, prediction_probs)
    ranking_loss = label_ranking_loss(true_labels, prediction_probs)

    # Reference points printed next to the coverage error:
    # best is the average number of true labels per instance,
    best_coverage = true_labels.sum(axis=1).mean()
    # worst is the total number of labels.
    worst_coverage = true_labels.shape[1]

    # Ranking loss is normalised per sample by (#true labels * #false labels), so it
    # lies in [0, 1]: 0 is a perfect ranking and 1 means every pair is mis-ordered.
    # The un-normalised pair count is therefore not a valid "worst" value.
    best_rl = 0.0
    worst_rl = 1.0

    # Converting to LaTeX table
    latex_table = df.to_latex(float_format="%.3f", column_format='|l|c|c|c|c|')
    # Removing some stuff from df.to_latex() output
    latex_table = latex_table.replace('\\toprule\n ', r'\hline' + '\n' + r'\textbf{Class}') \
                             .replace('\\midrule\n', '') \
                             .replace('\\bottomrule', r'\multicolumn{5}{c}{}\\') \
                             .replace('\\end{tabular}\n', '') \
                             .replace(r'\\', r'\\ \hline') \
                             .replace('\nmicro avg','\\hline\nmicro avg')

    # Adding overall metrics
    overall_metrics = f"""
{latex_table}
\\textbf{{Accuracy}}                    & \\multicolumn{{4}}{{c|}}{{{accuracy:.3f}}}                                 \\\\ \\hline
\\textbf{{Overall Precision}}           & \\multicolumn{{4}}{{c|}}{{{precision:.3f}}}                                \\\\ \\hline
\\textbf{{Overall Recall}}              & \\multicolumn{{4}}{{c|}}{{{recall:.3f}}}                                   \\\\ \\hline
\\textbf{{Overall F1-Score}}            & \\multicolumn{{4}}{{c|}}{{{f_score:.3f}}}                                  \\\\ \\hline
\\textbf{{Label Ranking Avg Precision}} & \\multicolumn{{4}}{{c|}}{{{lrap:.3f}}}                                    \\\\ \\hline
\\textbf{{Coverage Error}}              & \\multicolumn{{4}}{{c|}}{{{coverage_err:.3f} (worst: {worst_coverage:.3f}, best: {best_coverage:.3f})}}                             \\\\ \\hline
\\textbf{{Ranking Loss}}                & \\multicolumn{{4}}{{c|}}{{{ranking_loss:.3f} (worst: {worst_rl:.3f}, best: {best_rl:.3f})}}                             \\\\ \\hline
\\end{{tabular}}
"""

    # Final LaTeX output with caption and label
    final_latex_output = f"""
\\begin{{table}}[h]
\\centering
{overall_metrics}
\\caption{{Metrics Overview of {model_name} Model for Task {task_number}}}
\\label{{table:{model_name}_metrics_task_{task_number}}}
\\end{{table}}
    """

    # Write the finished table to the requested file
    with open(output_path, 'w') as f:
        f.write(final_latex_output)


In [28]:
# Write the metrics table for the test-set results.
generate_metrics_latex_table(
    model_name='roberta',
    task_number='01',
    true_labels=np.array(test_labels),
    binary_predictions=test_predictions,
    prediction_probs=test_predictions_probs,
    target_names=mlb.classes_,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Prediction function

In [29]:
def parse(sentence):
    """Lemmatize ``sentence`` with the module-level spaCy pipeline.

    Every token is kept and replaced by its lemma; nothing is filtered out.

    Args:
        sentence: Raw input text.

    Returns:
        The token lemmas joined by single spaces.
    """
    doc = nlp(sentence)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

In [30]:
def predict(model, tokenizer, sentence):
    """Predict the label set for a single raw sentence.

    The sentence is lemmatized with ``parse``, encoded to the module-level
    ``max_length``, scored by ``model``, thresholded at 0.5 and mapped back to
    label names with the module-level ``mlb``.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask, token_type_ids)``;
            expected to already live on the device selected below.
        tokenizer: Tokenizer exposing ``encode_plus``.
        sentence: Raw input text.

    Returns:
        Tuple ``(lemmatized_sentence, labels)`` where ``labels`` is the output of
        ``mlb.inverse_transform`` (one tuple of label names per input row).
    """
    model.eval()
    device = 'cuda' if cuda.is_available() else 'cpu'
    sentence = parse(sentence)
    inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length = max_length,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            # Keep the input within max_length, the same length used for the datasets.
            truncation=True,
            return_tensors='pt'
        )

    input_ids = inputs['input_ids'].to(device, dtype=torch.long)
    attention_mask = inputs['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = inputs['token_type_ids'].to(device, dtype=torch.long)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids, attention_mask, token_type_ids)

    probabilities = torch.sigmoid(logits).cpu().numpy()
    threshold = 0.5
    # 0/1 indicator matrix, the form mlb.inverse_transform works on.
    predicted_mask = (probabilities > threshold).astype(int)

    outputs = mlb.inverse_transform(predicted_mask)
    return sentence, outputs