In [None]:
import xml.etree.ElementTree as ET
import enum
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import string
import torch
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm
from torch.utils.data import DataLoader

# Code for ABSA

This will be the code for the aspect-based sentiment detection task.

## Parsing the train/test data

Our first step is parsing the train/test data, which comes as XML files.
We parse each file into custom data classes: a training example holds the
sentence text and its aspects, and each aspect carries one of four polarities.
We will later put the aspects into sets to calculate precision/recall scores,
so the aspect class needs to provide a hash and an equality operator.

In [None]:
# We have four possible polarities

class Polarity(enum.IntEnum):
  """Sentiment polarity attached to an aspect term.

  Members are plain integers 0..3. The original definitions carried stray
  trailing commas (``NEUTRAL = 0,``), which made each right-hand side a
  one-element tuple that only worked because Enum unpacks tuples into the
  ``int`` constructor. The values themselves are unchanged.
  """
  NEUTRAL = 0
  POSITIVE = 1
  NEGATIVE = 2
  CONFLICT = 3

# Polarity strings (as used as keys here) mapped to their enum members
polarities = dict(
    positive=Polarity.POSITIVE,
    negative=Polarity.NEGATIVE,
    conflict=Polarity.CONFLICT,
    neutral=Polarity.NEUTRAL,
)

# Reverse lookup: enum member back to its polarity string
strings_polarities = {member: label for label, member in polarities.items()}

class ABSATrainExample:
  """A single sentence paired with the aspects annotated on it."""

  def __init__(self, text, aspects):
    # text: the sentence as handed over by the parser
    # aspects: the aspect objects belonging to this sentence
    self.text, self.aspects = text, aspects
    
class ABSAAspect:
  """An aspect span together with its polarity.

  Instances compare equal when polarity, start and end all match, and they
  are hashable so they can be stored in sets.

  Args:
    polarity: polarity of the aspect.
    from_: start position of the span.
    to: end position of the span.
  """

  def __init__(self, polarity, from_, to):
    self.polarity = polarity
    self.end = to
    self.start = from_

  def _key(self):
    # Single source of truth for equality and hashing, so the two always agree
    return (self.polarity, self.start, self.end)

  def __eq__(self, other):
    if isinstance(other, ABSAAspect):
      return self._key() == other._key()
    return False

  def __ne__(self, other):
    return not self.__eq__(other)

  def __hash__(self):
    # Hash every field that takes part in equality. The previous
    # hash(start + end) ignored polarity and collided for all spans whose
    # positions had the same sum.
    return hash(self._key())

  def __repr__(self):
    return "ABSAAspect(polarity={!r}, start={!r}, end={!r})".format(
        self.polarity, self.start, self.end)

## Parsing the file

Each training example comes with a sentence and a list of aspects. We parse
both into our data structures. We save token ids, rather than the text, for later processing.

When parsing the files, we have to take tokenization into account. We use the tokenization provided by the Huggingface library; because we later need to transform the examples into label sequences, we must be able to associate specific terms with specific tokens. To do that, we split the data into words first and pass the `is_split_into_words` argument to the tokenizer.

We then locate the aspects in the token sequence and mark the positions. This
allows for the creation of label sequences later on.

In [None]:
def locate_subsequence(query, base):
    """Find the first occurrence of ``query`` as a contiguous run in ``base``.

    Args:
        query: sequence to look for (compared with ``==`` against slices of
            ``base``, so it should be the same sequence type as ``base``).
        base: sequence to search in.

    Returns:
        ``(start, end)`` such that ``base[start:end] == query``.

    Raises:
        RuntimeError: if ``query`` is empty or does not occur in ``base``.
    """
    length = len(query)
    # An empty query would trivially "match" at (0, 0); callers would then
    # treat position 0 as a located span, so report it as not found instead.
    if length:
        # Start positions past len(base) - length cannot hold the full query
        for start in range(len(base) - length + 1):
            if base[start:start + length] == query:
                return start, start + length
    print("Not found")
    raise RuntimeError("Token sequence not found")

def parse_file(filename, tokenizer):
  """Parse an ABSA XML file into tokenized training examples.

  Every sentence text is lower-cased, has its punctuation replaced by spaces,
  is split on single spaces and is then tokenized in one batch. Each aspect
  term is cleaned and tokenized the same way and located inside the token ids
  of its sentence.

  Args:
    filename: path of the XML file to read.
    tokenizer: callable accepting pre-split words via
      ``is_split_into_words=True`` and returning a mapping with an
      ``'input_ids'`` entry.

  Returns:
    ``(train_examples, num_fail)``: a list with one ``ABSATrainExample`` per
    sentence (its text being the token ids) and the number of aspects that
    could not be processed and were skipped.
  """
  # Build the punctuation -> space table once instead of once per string
  punct_to_space = str.maketrans(string.punctuation,
                                 ' ' * len(string.punctuation))

  def clean_and_split(raw):
    # Same normalization for sentences and aspect terms so the tokens line up
    return raw.translate(punct_to_space).lower().split(' ')

  sentences = ET.parse(filename).getroot()
  # An empty <text/> element has text None; treat it as an empty sentence
  sentence_texts = [clean_and_split(sentence.find("text").text or "")
                    for sentence in sentences]

  # Tokenize texts, adding padding and truncating to max length
  tok_results = tokenizer(sentence_texts, is_split_into_words=True,
                          truncation=True, padding=True,
                          return_attention_mask=False)

  train_examples = []
  num_fail = 0
  for text_tokenized, sentence in zip(tok_results['input_ids'], sentences):
    aspects = sentence.find("aspectTerms")
    aspect_list = []
    if aspects is not None:
      for aspect in aspects:
        # Reset per aspect so the failure report below never hits an unbound
        # name or shows values left over from a previous aspect
        term = None
        term_ids = None
        try:
          # Clean aspect term, just like the text
          term = aspect.get("term")
          if not term:
            term = aspect.get("aspectTerm")
          term = clean_and_split(term)

          # Tokenize and drop the first and last id before searching
          term_tokenized = tokenizer(term, is_split_into_words=True)['input_ids']
          term_ids = term_tokenized[1:len(term_tokenized) - 1]
          start, end = locate_subsequence(term_ids, text_tokenized)
          polarity = polarities[aspect.get("polarity")]

          aspect_list.append(ABSAAspect(polarity, start, end))
        except Exception:
          # Best effort: report the aspect that failed, count it and carry on
          print(term)
          print(text_tokenized)
          print(term_ids)
          num_fail += 1
    train_examples.append(ABSATrainExample(text_tokenized, aspect_list))
  return train_examples, num_fail

## Converting training examples to labels

Here, we take the training examples and convert them to label sequences.

In [None]:
# Take training examples and convert them into sequence labels
# We use a BIO scheme
# First, define classes

class SequenceLabel(enum.IntEnum):
  """Per-token labels: OUTSIDE plus a BEGIN_* and INSIDE_* label per polarity.

  Members are plain integers 0..8. The original definitions carried stray
  trailing commas, which made each right-hand side a one-element tuple that
  only worked because Enum unpacks tuples into the ``int`` constructor. The
  values themselves are unchanged.
  """
  OUTSIDE = 0
  BEGIN_POS = 1
  BEGIN_NEG = 2
  BEGIN_CON = 3
  BEGIN_NEU = 4
  INSIDE_POS = 5
  INSIDE_NEG = 6
  INSIDE_CON = 7
  INSIDE_NEU = 8

# One class per SequenceLabel member
NUM_LABELS = len(SequenceLabel)

# For every polarity: [label of the first token, label of the following tokens]
polarities_to_labels = {
    polarity: [begin_label, inside_label]
    for polarity, begin_label, inside_label in (
        (Polarity.POSITIVE, SequenceLabel.BEGIN_POS, SequenceLabel.INSIDE_POS),
        (Polarity.NEGATIVE, SequenceLabel.BEGIN_NEG, SequenceLabel.INSIDE_NEG),
        (Polarity.NEUTRAL, SequenceLabel.BEGIN_NEU, SequenceLabel.INSIDE_NEU),
        (Polarity.CONFLICT, SequenceLabel.BEGIN_CON, SequenceLabel.INSIDE_CON),
    )
}

def example_to_labels(example):
  """Turn one example into a list of integer labels, one per token.

  All positions start at 0. For each aspect, the position ``aspect.start``
  receives the begin label of the aspect's polarity and the positions after
  it, up to but excluding ``aspect.end``, receive the inside label.
  """
  labels = [0] * len(example.text)
  for span in example.aspects:
    first_label, rest_label = polarities_to_labels[span.polarity]
    first_pos = span.start
    labels[first_pos] = int(first_label)
    pos = first_pos + 1
    while pos < span.end:
      labels[pos] = int(rest_label)
      pos += 1
  return labels

## Creation of dataset for PyTorch

We make use of the PyTorch Dataset API to later train our models. We create a PyTorch dataset from our datapoints.

In [None]:
class ABSADataset(torch.utils.data.Dataset):
    """Dataset pairing token sequences with their label sequences.

    Indexing returns a dict with the tensors ``'tokens'`` and ``'labels'``
    built from the entries stored at that index.
    """

    def __init__(self, tokens, labels):
        self.tokens, self.labels = tokens, labels

    def __len__(self):
        # The dataset size is defined by the number of label sequences
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'tokens': torch.tensor(self.tokens[idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

def create_datasets_tokenizer(tokenizer):
  """Parse the four XML files and wrap each of them in an ``ABSADataset``.

  Returns:
    ``(laptops_train_ds, laptops_trial_ds, restaurants_train_ds,
    restaurants_trial_ds)``. The parse-failure counts reported by
    ``parse_file`` are not returned.
  """
  # Parse in this fixed order: test files first, then the training files
  filenames = ("Laptop_Test.xml", "Restaurants_Test.xml",
               "Laptop_Train.xml", "Restaurants_Train.xml")
  parsed = [parse_file(name, tokenizer)[0] for name in filenames]
  laptops_trial, restaurants_trial, laptops_train, restaurants_train = parsed

  def as_dataset(examples):
    # Token ids come straight from the examples; labels are derived from them
    label_seqs = [example_to_labels(example) for example in examples]
    token_seqs = [example.text for example in examples]
    return ABSADataset(token_seqs, label_seqs)

  return (as_dataset(laptops_train), as_dataset(laptops_trial),
          as_dataset(restaurants_train), as_dataset(restaurants_trial))

## Evaluation metrics

Here, we provide the evaluation metrics. We have the F1 score, precision and recall. We compute these grouped by polarity. For the precision and recall, we retrieve true positives, false positives and false negatives by building a set of true and a set of predicted examples. For each predicted example, we check if it is in the set of true examples; if it is, it is a true positive. If it is not, it is a false positive. We remove true positives from the set of true examples; once we have gone through all predicted examples, any true examples left in the set are false negatives.

To do this, we also need to write functions to convert label sequences to our aspect data structure. We added support for hashing above.

We allow for the calculation of both the macro and micro F1 score, precision and recall.

In [None]:

# Only BEGIN_* labels open an aspect; each one maps to the polarity it stands for.
# INSIDE_* labels are kept separately because they can only continue an aspect.
beg_labels_polarity = {
    int(label): polarity
    for label, polarity in (
        (SequenceLabel.BEGIN_POS, Polarity.POSITIVE),
        (SequenceLabel.BEGIN_NEU, Polarity.NEUTRAL),
        (SequenceLabel.BEGIN_NEG, Polarity.NEGATIVE),
        (SequenceLabel.BEGIN_CON, Polarity.CONFLICT),
    )
}

in_labels = {
    int(label)
    for label in (
        SequenceLabel.INSIDE_POS,
        SequenceLabel.INSIDE_NEU,
        SequenceLabel.INSIDE_NEG,
        SequenceLabel.INSIDE_CON,
    )
}

def labels_to_aspects(labels_seqs):
  """Decode label sequences into a flat list of ``ABSAAspect`` objects.

  An aspect opens at every begin label and extends over the inside labels
  that directly follow it (of any polarity). Inside labels that do not follow
  an open aspect are skipped.

  Args:
    labels_seqs: iterable of label sequences (each indexable, with ``len``).

  Returns:
    One list with the aspects of all sequences. ``start``/``end`` are
    positions within the aspect's own sequence; the sequence it came from is
    not recorded.
  """
  # NOTE(review): because the sequence index is not part of an aspect, equal
  # spans from different sequences compare equal once put into a set.
  aspects = []
  for label_seq in labels_seqs:
    seq_len = len(label_seq)
    i = 0
    while i < seq_len:
      cur_label = label_seq[i]
      if cur_label not in beg_labels_polarity:
        i += 1
        continue
      start = i
      i += 1
      # Bounds check first: an aspect may run up to the last position, and
      # indexing past the end used to raise IndexError here
      while i < seq_len and label_seq[i] in in_labels:
        i += 1
      aspects.append(ABSAAspect(beg_labels_polarity[cur_label], start, i))
  return aspects

def group_by_polarities(aspect_list):
  """Bucket aspects by their polarity.

  Returns a dict with one list per polarity; polarities without any aspect
  map to an empty list.
  """
  buckets = {
      polarity: []
      for polarity in (Polarity.POSITIVE, Polarity.NEUTRAL,
                       Polarity.NEGATIVE, Polarity.CONFLICT)
  }
  for item in aspect_list:
    buckets[item.polarity].append(item)
  return buckets

def calculate_f1(true_aspects, pred_aspects, micro=False):
  """Compute F1, precision and recall of predicted against true aspects.

  Aspects are grouped by polarity and compared as sets, so duplicates within
  a group count once. A predicted aspect that also occurs among the true ones
  is a true positive, any other predicted aspect is a false positive, and
  every true aspect that was not predicted is a false negative.

  Args:
    true_aspects: list of gold aspects.
    pred_aspects: list of predicted aspects.
    micro: if True, pool the counts of all polarities before computing the
      scores; otherwise average the per-polarity scores.

  Returns:
    ``(f1, precision, recall)``. A score whose denominator is zero is
    reported as 0.
  """
  def _scores(tp, fp, fn):
    # Guard each ratio separately so empty predictions or an empty gold set
    # give 0 instead of raising ZeroDivisionError
    prec = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    if prec + recall > 0:
      f1 = 2 * ((prec * recall) / (prec + recall))
    else:
      f1 = 0
    return f1, prec, recall

  true_aspects_grouped = group_by_polarities(true_aspects)
  pred_aspects_grouped = group_by_polarities(pred_aspects)

  f1_scores = []
  precision = []
  recalls = []
  tp_total = fp_total = fn_total = 0

  for key, value in true_aspects_grouped.items():
    true_aspect_set = set(value)
    pred_aspect_set = set(pred_aspects_grouped[key])

    tp = len(pred_aspect_set & true_aspect_set)
    fp = len(pred_aspect_set - true_aspect_set)
    fn = len(true_aspect_set - pred_aspect_set)

    tp_total += tp
    fp_total += fp
    fn_total += fn

    f1, prec, recall = _scores(tp, fp, fn)
    f1_scores.append(f1)
    precision.append(prec)
    recalls.append(recall)

  if micro:
    # Previously divided unconditionally and crashed when nothing was
    # predicted or no gold aspects existed
    return _scores(tp_total, fp_total, fn_total)

  return np.mean(np.array(f1_scores)), np.mean(np.array(precision)), \
   np.mean(np.array(recalls))


def eval_model(model, trial_loader, device, micro=False):
    """Evaluate ``model`` on ``trial_loader`` and return (f1, precision, recall).

    The model is moved to ``device`` and switched to eval mode. For every
    batch the arg-max over the last axis of the logits is decoded into
    aspects, as are the batch's labels; the scores are computed once over the
    aspects collected from all batches.
    """
    model.to(device)
    model.eval()

    gold_aspects = []
    predicted_aspects = []
    with torch.no_grad():
      for batch in trial_loader:
        logits = model(batch['tokens'].to(device)).logits.cpu().numpy()
        predicted_labels = np.argmax(logits, axis=-1)
        gold_labels = batch['labels'].numpy()

        # Decode predictions first, then the reference labels
        batch_predicted = labels_to_aspects(predicted_labels)
        batch_gold = labels_to_aspects(gold_labels)

        gold_aspects.extend(batch_gold)
        predicted_aspects.extend(batch_predicted)

    f1, precision, recall = calculate_f1(gold_aspects, predicted_aspects,
                                         micro=micro)
    return f1, precision, recall

## Code for training the model

This is some code for training the model. Nothing too fancy; we add some rudimentary support for early stopping.

In [None]:
def train_model(model, train_loader, optim, num_epochs,
                device, early_stopping=False, val_loader=None, min_delta=0.05):
  """Train ``model`` for up to ``num_epochs`` epochs.

  Args:
    model: model called as ``model(input_ids, labels=labels)``; the first
      element of its output is used as the loss.
    train_loader: iterable of batches with ``'tokens'`` and ``'labels'``.
    optim: optimizer stepping the model parameters.
    num_epochs: maximum number of epochs.
    device: device the model and the batches are moved to.
    early_stopping: if True, evaluate on ``val_loader`` after every epoch and
      stop once the F1 score falls more than ``min_delta`` below the best
      score seen so far.
    val_loader: validation batches; required when ``early_stopping`` is True.
    min_delta: tolerated drop in validation F1 before stopping.

  Returns:
    ``(losses, epochs)``: the per-batch losses and either the zero-based
    index of the epoch that triggered early stopping or ``num_epochs`` when
    training ran to completion.

  Raises:
    ValueError: if ``early_stopping`` is True but no ``val_loader`` is given.
  """
  # Fail fast: without this check a full epoch is trained before the missing
  # validation loader makes the evaluation crash
  if early_stopping and val_loader is None:
    raise ValueError("early_stopping=True requires a val_loader")

  model.to(device)

  best_f1 = 0
  losses = []

  for epoch in range(num_epochs):
    pbar = tqdm(train_loader)
    for batch in pbar:
      # eval_model() switches the model to eval mode, so re-enable training
      model.train()
      # Do train step
      optim.zero_grad()
      input_ids = batch['tokens'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, labels=labels)
      loss = outputs[0]
      loss.backward()
      optim.step()

      # Update progress bar
      loss_value = loss.item()
      pbar.set_description("Epoch {}/{} loss {:.2f}".format(
          epoch + 1, num_epochs, loss_value))

      # Add loss
      losses.append(loss_value)
    if early_stopping:
      f1, _, _ = eval_model(model, val_loader, device)
      print("Val F1 {:.4f}".format(f1))
      if f1 < (best_f1 - min_delta):
        return losses, epoch
      elif f1 > best_f1:
        best_f1 = f1
  return losses, num_epochs

## DistilBERT - Laptops and Restaurants

In [None]:
# Shared tokenizer and the four datasets built with it
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
(laptops_train_ds, laptops_trial_ds,
 restaurants_train_ds, restaurants_trial_ds) = create_datasets_tokenizer(tokenizer)

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Laptops
train_loader = DataLoader(laptops_train_ds, batch_size=16, shuffle=True)
trial_loader = DataLoader(laptops_trial_ds, batch_size=16, shuffle=True)

# Fresh model with one output class per sequence label
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=NUM_LABELS)
model.to(device)
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
_, num_epochs = train_model(model, train_loader, optim,
                            num_epochs=15, device=device)

In [None]:
# Score the model trained on laptops against the laptop trial loader
f1, prec, recall = eval_model(model=model, trial_loader=trial_loader,
                              device=device)
print("Laptops f1, prec, recall trial", f1, prec, recall)

In [None]:
# Restaurants
train_loader = DataLoader(restaurants_train_ds, batch_size=16, shuffle=True)
trial_loader = DataLoader(restaurants_trial_ds, batch_size=16, shuffle=True)

# Start again from the pretrained weights rather than reusing the laptop model
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=NUM_LABELS)
model.to(device)
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
_, num_epochs = train_model(model, train_loader, optim,
                            num_epochs=20, device=device)

In [None]:
# Score the model trained on restaurants against the restaurant trial loader
f1, prec, recall = eval_model(model=model, trial_loader=trial_loader,
                              device=device)
print("Restaurants f1, prec, recall trial", f1, prec, recall)

## Dataset statistics

In [None]:
# Produce plots detailing distributions of data points

# Create the tokenizer before its first use so this cell does not depend on
# an earlier cell having defined it
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Test split: report sizes and parse failures only. The results are kept in
# *_test names; they used to be stored in variables called *_train.
laptops_test, fail_lt = parse_file("Laptop_Test.xml", tokenizer)
restaurants_test, fail_rt = parse_file("Restaurants_Test.xml", tokenizer)

print("Total number test samples laptop", len(laptops_test))
print("Total number test samples restaurant", len(restaurants_test))
print("Total number failed to parse", fail_lt + fail_rt)

# Training split: also used for the polarity distributions plotted below
laptops_train, fail_ltr = parse_file("Laptop_Train.xml", tokenizer)
restaurants_train, fail_rtr = parse_file("Restaurants_Train.xml", tokenizer)

print("Total number training samples laptop", len(laptops_train))
print("Total number training samples restaurant", len(restaurants_train))
print("Total number failed to parse", fail_ltr + fail_rtr)

# Number of laptop training aspects per polarity
aspect_distribution_laptop = {
    "positive" : 0,
    "negative" : 0,
    "conflict" : 0,
    "neutral" : 0
}

for example in laptops_train:
  for aspect in example.aspects:
    aspect_distribution_laptop[strings_polarities[aspect.polarity]] += 1

# Materialize the items view so the rows are a plain list of (name, count)
df_aspect_dist_laptop = pd.DataFrame(list(aspect_distribution_laptop.items()),
                                     columns=["Aspect", "Number of items"])
sns.barplot(data=df_aspect_dist_laptop, x="Aspect", y="Number of items")

In [None]:
# Number of restaurant training aspects per polarity, starting from zero
aspect_distribution_restaurant = dict.fromkeys(
    ("positive", "negative", "conflict", "neutral"), 0)

for example in restaurants_train:
  for aspect in example.aspects:
    polarity_name = strings_polarities[aspect.polarity]
    aspect_distribution_restaurant[polarity_name] += 1

# One row per polarity for the bar plot
df_aspect_dist_restaurants = pd.DataFrame(
    aspect_distribution_restaurant.items(),
    columns=["Aspect", "Number of items"])
sns.barplot(data=df_aspect_dist_restaurants, x="Aspect", y="Number of items")

In [None]:
def parse_file_baseline(filename):
  """Parse an ABSA XML file for the baseline, without any tokenization.

  Args:
    filename: path of the XML file to read.

  Returns:
    A list with one ``ABSATrainExample`` per sentence. Its text is the raw
    sentence string and each aspect uses the integer ``from``/``to``
    attributes of the XML as start and end.
  """
  sentences = ET.parse(filename).getroot()
  train_examples = []
  for sentence in sentences:
    aspect_list = []
    text = sentence.find("text").text
    aspects = sentence.find("aspectTerms")
    # Compare against None explicitly: truth-testing an Element is deprecated
    # and reflects the number of children rather than whether it exists
    if aspects is not None:
      for aspect in aspects:
        polarity = polarities[aspect.get("polarity")]
        start = int(aspect.get("from"))
        end = int(aspect.get("to"))
        aspect_list.append(ABSAAspect(polarity, start, end))
    train_examples.append(ABSATrainExample(text, aspect_list))
  return train_examples

## Baseline classifier

In [None]:
def create_baseline_classifier(dataset):
  """Build a lookup from aspect text to its most frequent polarity.

  For every aspect, the slice ``example.text[aspect.start:aspect.end]`` is
  used as key and the aspect's polarity is counted for it. The result maps
  each key to the index with the highest count.
  """
  counts_by_term = {}
  for example in dataset:
    for aspect in example.aspects:
      term = example.text[aspect.start:aspect.end]
      # Four counters per term, indexed by the integer value of the polarity
      counts = counts_by_term.setdefault(term, np.zeros(4))
      counts[int(aspect.polarity)] += 1

  return {term: np.argmax(counts) for term, counts in counts_by_term.items()}

In [None]:
def get_confusion_matrix_by_aspect(true_aspects, pred_aspects):
  """Count true positives, false positives and false negatives per polarity.

  Aspects are grouped by polarity and compared as sets within each group.

  Returns:
    ``(tps, fps, fns)``: three lists of length four, indexed by the integer
    value of the polarity.
  """
  grouped_true = group_by_polarities(true_aspects)
  grouped_pred = group_by_polarities(pred_aspects)

  tps, fps, fns = [0] * 4, [0] * 4, [0] * 4

  for polarity, gold_items in grouped_true.items():
    gold = set(gold_items)
    predicted = set(grouped_pred[polarity])
    slot = int(polarity)

    # predicted and gold -> hit, predicted only -> false alarm, gold only -> miss
    tps[slot] = len(predicted & gold)
    fps[slot] = len(predicted - gold)
    fns[slot] = len(gold - predicted)

  return tps, fps, fns

def extract_aspects_baseline(dataset, baseline_classifier):
  """Score the lookup baseline on ``dataset``.

  For every datapoint, each key of ``baseline_classifier`` is searched in the
  datapoint's text with ``str.find``; the first occurrence, if any, becomes a
  predicted aspect with the polarity stored for that key. Per-polarity counts
  are summed over all datapoints before the scores are computed.

  Args:
    dataset: iterable of examples with ``text`` (a string) and ``aspects``.
    baseline_classifier: mapping from aspect text to a polarity value.

  Returns:
    ``(f1, precision, recall)``, each averaged over the four polarities. A
    score whose denominator is zero counts as 0.
  """
  fp_matrix = []
  tp_matrix = []
  fn_matrix = []

  for datapoint in dataset:
    found_aspects = []
    for k, v in baseline_classifier.items():
      idx_substr = datapoint.text.find(k)
      if idx_substr != -1:
        found_aspects.append(
            ABSAAspect(Polarity(v), idx_substr, idx_substr + len(k)))
    tps, fps, fns = get_confusion_matrix_by_aspect(datapoint.aspects, found_aspects)

    fp_matrix.append(fps)
    tp_matrix.append(tps)
    fn_matrix.append(fns)

  # Sum the per-datapoint rows into one count per polarity
  tp_matrix = np.sum(np.array(tp_matrix), axis=0)
  fp_matrix = np.sum(np.array(fp_matrix), axis=0)
  fn_matrix = np.sum(np.array(fn_matrix), axis=0)

  # 0/0 yields NaN; map it to 0 for all three scores (previously only F1 was
  # cleaned, so precision and recall could come back as NaN)
  with np.errstate(divide='ignore', invalid='ignore'):
    prec_matrix = np.nan_to_num(tp_matrix / (tp_matrix + fp_matrix))
    recall_matrix = np.nan_to_num(tp_matrix / (tp_matrix + fn_matrix))
    f1_matrix = np.nan_to_num(
        2 * ((prec_matrix * recall_matrix) / (prec_matrix + recall_matrix)))

  # The third value used to be the mean F1 again instead of the mean recall
  return np.mean(f1_matrix), np.mean(prec_matrix), np.mean(recall_matrix)

In [None]:
# Laptop baseline: build the lookup from the training file, score it on the test file
laptop_baseline = parse_file_baseline(filename="Laptop_Train.xml")
laptop_test = parse_file_baseline(filename="Laptop_Test.xml")
class_laptop = create_baseline_classifier(dataset=laptop_baseline)
extract_aspects_baseline(dataset=laptop_test, baseline_classifier=class_laptop)

In [None]:
# Restaurant baseline: build the lookup from the training file, score it on the test file
rest_baseline = parse_file_baseline(filename="Restaurants_Train.xml")
rest_test = parse_file_baseline(filename="Restaurants_Test.xml")
class_rest = create_baseline_classifier(dataset=rest_baseline)
extract_aspects_baseline(dataset=rest_test, baseline_classifier=class_rest)