# NLBSE2022 Tool Competition Submission

For the NLBSE2022 Tool Competition, our team from the *University of Bari* built two classifiers. This is the first one, based on RoBERTa fine-tuning.

## Set up persistent storage

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Root folder on the mounted drive; the data and model folders used by later
# cells are built from it.
prefix_dir = '/content/drive/MyDrive/'

## Check GPU

In [None]:
# Print the GPU attached to this runtime, if any.
!nvidia-smi

## Install dependencies

In [None]:
# Third-party packages imported by the next cell: ekphrasis (text
# normalisation), transformers (model and tokenizer), imblearn (undersampling).
!pip install ekphrasis
!pip install transformers
!pip install imblearn

In [None]:
import os
import time
import numpy as np
import pandas as pd
import sklearn.metrics
from tqdm.auto import tqdm
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
from keras.preprocessing.sequence import pad_sequences
from _datetime import datetime as dt
import random
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
import datetime
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
import json

## Download the dataset

In [None]:
# Download and unpack the training set unless the CSV is already present in
# the working directory.
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

# Load the training issues into a DataFrame.
trainset = pd.read_csv("github-labels-top3-803k-train.csv")

In [None]:
# Preview the first five rows of the training set.
trainset.head(5)

## Dataset distribution

In [None]:
# Number of training issues per label.
trainset.groupby("issue_label").size()

## Setup Preprocessing

In [None]:
# Reminder: markdown constructs such as images, i.e. !\[(.*)\]\(.*\), are not
# handled by ekphrasis; they are matched by the regexes defined right after
# this preprocessor.

# ekphrasis pipeline applied to every issue text by `clean_text`.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice; presumably one occurrence is enough.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

# Markdown patterns replaced by placeholder tokens in `clean_text`.
# Raw strings are used so that `\[`, `\(` and `\s` reach the regex engine
# untouched instead of being (invalid) string escape sequences.

# Inline image: ![alt](target) -- group 1 is the alt text.
img_1 = re.compile(r'!\[(.*)\]\(.*\)')
# Inline link: [text](target) -- group 1 is the link text.
link_1 = re.compile(r'\[(.*)\]\(.*\)')
# Reference-style link definition: [label]: target -- group 1 is the label.
link_2 = re.compile(r'\[(.*)\]: [^\s]+')
# Inline code span or fenced code block.
# NOTE(review): `(:?` is a capturing group starting with an optional colon;
# presumably `(?:` (non-capturing group) was intended. Kept as-is so that the
# preprocessing output does not change -- confirm before altering.
code_1 = re.compile(r'(:?`[^`]+`|```[^`]*```)')

def preprocess(row):
  """Build the cleaned input text for one issue.

  The title and the body of `row` are converted with `str()` (so a missing
  value such as NaN becomes the literal text "nan"), joined by a single space
  and passed through `clean_text`.
  """
  title = str(row.issue_title)
  body = str(row.issue_body)
  return clean_text(title + " " + body)

def clean_text(text):
  """Replace markdown images, links and code with placeholders, then normalise.

  Images become "<alt text> <img>", links become "<text> <url>", code spans and
  fenced blocks become "<code>". The result is processed by the ekphrasis
  `text_processor` and its tokens are joined back with single spaces.
  """
  # Order matters: images must be rewritten before the generic link pattern,
  # which would otherwise match the "[alt](target)" part of an image.
  substitutions = (
      (img_1, r'\1 <img>'),
      (link_1, r'\1 <url>'),
      (link_2, r'\1 <url>'),
      (code_1, '<code>'),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return " ".join(text_processor.pre_process_doc(text))


In [None]:
def encode_text(corpus, tokenizer, max_length=128):
    """Tokenize every text of `corpus` and return a padded matrix of token ids.

    Args:
        corpus: iterable of already preprocessed texts.
        tokenizer: Hugging Face tokenizer; its `pad_token_id` is used as the
            padding value.
        max_length: maximum number of tokens per text, special tokens
            included (the original experiments used 512 before settling on
            the default of 128).

    Returns:
        Array of token ids with one row per text and `max_length` columns;
        shorter sequences are padded at the beginning ("pre" padding).
    """
    # `truncation=True` makes the cut at `max_length` explicit; passing only
    # `max_length` relies on a deprecated fallback that emits a warning.
    input_ids = [
        tokenizer.encode(
            sent,
            add_special_tokens = True,
            truncation = True,
            max_length = max_length)
        for sent in tqdm(corpus)
    ]

    input_ids = pad_sequences(input_ids, maxlen = max_length, dtype = "long",
                                    value = tokenizer.pad_token_id, truncating = "pre", padding = "pre")

    return input_ids

In [None]:
def create_attention_masks(input_ids, pad_token_id=None):
    """Build the attention masks for a padded matrix of token ids.

    Args:
        input_ids: rectangular sequence of token-id rows, as returned by
            `encode_text`.
        pad_token_id: id used for padding. When omitted, the id of the
            notebook-level `tokenizer` is used, i.e. the same value
            `encode_text` pads with.

    Returns:
        List of lists with 0 at padding positions and 1 everywhere else.
    """
    if pad_token_id is None:
        # The padding id is model specific, so it must not be assumed to be 0.
        # NOTE(review): for roberta-base the pad id is presumably 1 and id 0 is
        # the <s> token, so a `token_id > 0` test would attend to the padding
        # and mask out <s> -- confirm against the tokenizer vocabulary.
        pad_token_id = tokenizer.pad_token_id

    # A single vectorised comparison replaces the per-token Python loop.
    ids = np.asarray(input_ids)
    return (ids != pad_token_id).astype(int).tolist()

## Utilities to get the data

In [None]:
def get_data(df):
  """Extract cleaned texts, labels and row indices from an issue DataFrame.

  Returns three parallel lists: the preprocessed text of every row, its
  `issue_label`, and the DataFrame index of the row.
  """
  text, labels, ids = [], [], []
  records = tqdm(df.itertuples(), desc="Getting data...", total=len(df))
  for record in records:
    text.append(preprocess(record))
    labels.append(record.issue_label)
    # Position 0 of an `itertuples` record is the DataFrame index.
    ids.append(record[0])
  return text, labels, ids

def get_labels(df):
  """Return the `issue_label` of every row of `df` as a list."""
  records = tqdm(df.itertuples(), desc="Getting data...", total=len(df))
  return [record.issue_label for record in records]

## Pick a seed for reproducible experiments

In [None]:
# One seed shared by every random number generator used in the notebook.
seed_val = 42

# PyTorch (CPU and all GPUs)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# NumPy and the Python standard library
np.random.seed(seed_val)
random.seed(seed_val)

print(f'Using seed: {seed_val}')

In [None]:
# Timestamp of this run (spaces replaced by underscores); used later in the
# name of the output model directory.
dateTimeObj = str(dt.now()).replace(" ", "_")

model_name = "roberta-base"
# Change model_dir if you want to load a custom model
model_dir = model_name

# ---------------------------------
# Load the pre-trained RoBERTa model
# ---------------------------------
config_class = AutoConfig
model_class = AutoModelForSequenceClassification
tokenizer_class = AutoTokenizer


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Loading model")

# Load the tokenizer that belongs to `model_dir` (a model name or a local directory)
tokenizer = tokenizer_class.from_pretrained(model_dir)

# Load the pre-trained encoder with a sequence-classification head on top
model = model_class.from_pretrained(
     model_dir, # Model name or directory of the pre-trained weights
     num_labels = 3, # Number of output labels (presumably the three issue labels
                     # of the "top3" dataset -- confirm against the data).
     output_attentions = False, # Whether the model returns attentions weights.
     output_hidden_states = False, # Whether the model returns all hidden-states.
     ignore_mismatched_sizes=True,
)

# Move the model to the selected device (GPU, or CPU if no GPU is present)
model.to(device)
print("Model loaded!")

## Setup directory for storing data

In [None]:
# Folder on the mounted drive for the serialized preprocessing output.
data_folder = os.path.join(*[prefix_dir, 'NLBSE', 'data'])

## Preprocess data

In [None]:
# Clean the training issues; the row indices returned last are not needed here.
train_set, train_labs, _ = get_data(trainset)

# Turn the cleaned texts into padded token ids and the matching attention masks.
train_ids = encode_text(train_set, tokenizer)
train_masks = create_attention_masks(train_ids)

## Serialize preprocessing output
Preprocessing can take some time, so serialization is good if you have to run multiple experiments.



In [None]:
# Serialize the cleaned texts and their (still textual) labels.
# The folder is created first, and the file names are joined with a path
# separator: plain string concatenation produced ".../datatrain_set.json".
os.makedirs(data_folder, exist_ok=True)
with open(os.path.join(data_folder, 'train_set.json'), 'w') as f:
    json.dump(train_set, f)
with open(os.path.join(data_folder, 'train_labs.json'), 'w') as f:
    json.dump(train_labs, f)

Labels are textual, so we encode them

In [None]:
# Map the textual labels to integer class ids; `lenc` is reused later to
# encode the test labels with the same mapping.
lenc = LabelEncoder()
train_labs = lenc.fit_transform(train_labs)

## Undersampling
Undersampling can be enabled, but the best performance is achieved without it.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
undersampling = False # @param ["True","False"]
# Fraction of the training data held out when a validation set is requested.
test_size =  0.1

if undersampling:
  rus = RandomUnderSampler(sampling_strategy = 'not minority', random_state = seed_val)
  # Resample once and reuse the selected row indices for ids, masks and labels,
  # so that the three stay aligned by construction rather than by running the
  # sampler twice with the same seed.
  rus.fit_resample(train_ids, train_labs)
  keep = rus.sample_indices_
  train_ids = np.asarray(train_ids)[keep]
  train_masks = np.asarray(train_masks)[keep]
  train_labs = np.asarray(train_labs)[keep]


## Set this to `True` if you want to use a validation set

In [None]:
# When True, `test_size` of the training data is held out and evaluated after
# every epoch; when False, all the data is used for training.
validation = False

In [None]:
if validation:
  # One stratified split over ids, masks and labels together: every array is
  # cut with the same indices.
  (train_inputs, validation_inputs,
   train_masks, validation_masks,
   train_labels, validation_labels) = train_test_split(
      train_ids, train_masks, train_labs,
      random_state=seed_val, test_size=test_size, stratify=train_labs
      )
else:
  # No hold-out: train on everything (the masks are used as they are).
  train_inputs = train_ids
  train_labels = train_labs

In [None]:
# Convert the training arrays to PyTorch tensors.
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

# The validation arrays exist only when a hold-out split was made.
if validation:
  validation_inputs = torch.tensor(validation_inputs)
  validation_masks = torch.tensor(validation_masks)
  validation_labels = torch.tensor(validation_labels)


## Create directory for saving the model

In [None]:
# Suffix that marks runs trained on undersampled data.
und_str = 'und' if undersampling else ''

# Run-specific folder (model name / timestamp + seed) in which the model is
# saved along with its reports.
output_model_dir = os.path.join(
    prefix_dir,
    'NLBSE',
    'models',
    model_name.replace('/', ''),
    dateTimeObj + "_SEED_" + str(seed_val) + und_str,
)

# Create the parent folder of the run directory.
os.makedirs(os.path.dirname(output_model_dir), exist_ok=True)

## Create Dataloaders for training

In [None]:
# Number of examples per batch, for both training and validation.
batch_size = 32

# Create the DataLoader for our training set; batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

if validation: 
  # Create the DataLoader for our validation set; batches are read in order.
  validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
  validation_sampler = SequentialSampler(validation_data)
  # The validation set is processed in batches too, so per-batch results have
  # to be aggregated at the end of the validation pass.
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# --------------------------------------------------------------------
# -------------- Optimizer and Learning Rate Scheduler ---------------
# --------------------------------------------------------------------

# NOTE(review): this is the AdamW class imported from `transformers`;
# presumably it is deprecated in recent releases in favour of
# `torch.optim.AdamW` (whose defaults may differ) -- confirm before upgrading.
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8
                )

# Number of training epochs
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: no warmup steps, `total_steps` training
# steps in total.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose arg-max equals the label.

    `preds` holds one row of class scores per example; `labels` is a NumPy
    array with the gold class ids (any shape, it is flattened).
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / len(expected)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    `datetime.timedelta`, e.g. "0:01:05".
    '''
    # Drop the fractional part by rounding to the nearest whole second.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Average training loss of every epoch, kept for plotting the learning curve.
loss_values = []

# For each epoch...
for epoch_i in tqdm(range(0, epochs), desc="Training"):

    # ========================================
    #               Training
    # ========================================

    # Gold and predicted labels collected over the whole validation pass of
    # this epoch; both stay empty when `validation` is False.
    gold_labels = []
    predicted_labels = []

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    model.train()

    # tqdm wraps the dataloader itself (not the `enumerate` generator) so that
    # it knows the number of batches and can draw a real progress bar.
    for step, batch in enumerate(tqdm(train_dataloader, desc="Batch")):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three tensors, copied to the training device:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients of the previous step; PyTorch accumulates them
        # otherwise.
        model.zero_grad()

        # Forward pass. Because `labels` is provided, the first element of the
        # output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # `.item()` extracts the Python number from the single-value tensor.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then the learning rate.
        optimizer.step()
        scheduler.step()

    # Average loss over the training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    if validation:

        # ========================================
        #               Validation
        # ========================================
        # After each training epoch, measure the performance on the
        # validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Evaluation mode: dropout layers behave differently than in training.
        model.eval()

        # Sum of the per-batch accuracies and number of batches seen.
        eval_accuracy = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            # Move the batch to the GPU/CPU and unpack it.
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            # No gradients are needed for evaluation; this saves memory and time.
            with torch.no_grad():
                # No labels are passed, so the first output is the logits.
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
            logits = outputs[0]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Accumulate the accuracy of this batch and count the batch.
            eval_accuracy += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1

            # Keep gold and predicted labels for the epoch-level report.
            gold_labels.extend(label_ids.flatten())
            predicted_labels.extend(np.argmax(logits, axis=1).flatten())

        # Report the accuracy (mean of the per-batch accuracies) of this run.
        print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

        # One classification report per validation epoch, computed over the
        # whole validation set rather than printed once per batch.
        print("")
        print("Evaluation on validation set")
        print("Gold labels" + str(len(gold_labels)))
        print("Predicted labels" + str(len(predicted_labels)))
        print(classification_report(gold_labels, predicted_labels, digits=4))
        cr = classification_report(gold_labels, predicted_labels, digits=4, output_dict=True)
        print(cr)
        f1_m = f1_score(gold_labels, predicted_labels, average='micro')
        print('f1_micro = {}'.format(f1_m))

    # ------------------------------------------------
    # Checkpoint of this epoch: model, tokenizer, optimizer state
    # ------------------------------------------------
    chkpt_path = os.path.join(output_model_dir, 'chkpt' + str(epoch_i+1))
    optimizer_path = os.path.join(chkpt_path, "optimizer")
    # Creating the nested optimizer folder creates the checkpoint folder too.
    os.makedirs(optimizer_path, exist_ok=True)

    # The validation report exists only when a validation pass was run.
    if validation:
        with open(os.path.join(chkpt_path, 'val_cr_report.json'), 'w') as f:
            json.dump(cr, f)

    torch.save(optimizer.state_dict(), os.path.join(optimizer_path, 'optimizer_state.pt'))
    # Unwrap the model when it is wrapped in an object exposing `.module`.
    model_to_save = (model.module if hasattr(model, "module") else model)
    model_to_save.save_pretrained(chkpt_path)
    tokenizer.save_pretrained(chkpt_path)


print("")
print("Training complete!")

# Save the final model and tokenizer in the run directory itself.
print("Saving model to: " + output_model_dir)
model_to_save = (
    model.module if hasattr(model, "module") else model
)
model_to_save.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

## Load test set

In [None]:
# Download and unpack the test set unless the CSV is already present in the
# working directory.
if not os.path.isfile("github-labels-top3-803k-test.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

# Load the test issues into a DataFrame.
testset = pd.read_csv("github-labels-top3-803k-test.csv")

In [None]:
# Number of test issues per label.
testset.groupby("issue_label").size()

## Load model to test

In [None]:
# Evaluate the model that was just fine-tuned; point `model_dir` to another
# directory to test a different saved model.
model_dir = output_model_dir

config_class = AutoConfig
model_class = AutoModelForSequenceClassification
tokenizer_class = AutoTokenizer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading model from: '" + model_dir + "', it may take a while...")

# Tokenizer and model are both read from the same saved directory.
tokenizer = tokenizer_class.from_pretrained(model_dir)

model = model_class.from_pretrained(
    model_dir,
    num_labels = 3,  
    output_attentions = False,
    output_hidden_states = False, 
)

model.to(device)
print("Model has been loaded!")

# NOTE(review): the model is already on `device` at this point; as the last
# expression of the cell this call presumably only makes the notebook display
# the model architecture -- confirm whether that is intended.
model.to(device)

## Preprocess test set

In [None]:
# Clean the test issues and turn them into padded token ids and attention masks.
test_tweets, test_labs, tweet_ids = get_data(testset)
test_ids = encode_text(test_tweets, tokenizer)
test_masks = create_attention_masks(test_ids)

# Serialize the cleaned texts and their (still textual) labels.
# The folder is created first, and the file names are joined with a path
# separator: plain string concatenation produced ".../datatest_set.json".
os.makedirs(data_folder, exist_ok=True)
with open(os.path.join(data_folder, 'test_set.json'), 'w') as f:
    json.dump(test_tweets, f)
with open(os.path.join(data_folder, 'test_labs.json'), 'w') as f:
    json.dump(test_labs, f)

# Encode the test labels with the mapping fitted on the training labels.
test_labs = lenc.transform(test_labs)

prediction_inputs = torch.tensor(test_ids)

prediction_masks = torch.tensor(test_masks)

# The row ids travel through the DataLoader as integer codes; this encoder
# maps the codes back to the original ids after prediction.
label_encoder = LabelEncoder()
targets = label_encoder.fit_transform(tweet_ids)

prediction_ids = torch.as_tensor(targets)

## Set up Dataloaders for testing

In [None]:
# Set the batch size.
batch_size = 32

# Each test batch carries input ids, attention masks and the encoded row ids;
# batches are read in order.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_ids)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## Run testing phase

In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# (row id, predicted class id) pairs, in dataset order
predictions = []

# Predict
for batch in tqdm(prediction_dataloader):
    # Move the batch to the GPU/CPU and unpack it.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_tweet_ids = batch

    # No gradients are needed for prediction; this saves memory and time.
    with torch.no_grad():
        # Forward pass; without labels the first output is the logits.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits to CPU and keep the highest-scoring class of every example.
    logits = logits.detach().cpu().numpy()
    flat_logits = np.argmax(logits, axis=1).flatten()

    # Map the encoded ids back to the original row ids.
    ids = label_encoder.inverse_transform(b_tweet_ids.cpu().numpy())
    predictions.extend(list(zip(ids, flat_logits)))

# Predicted class ids only, in dataset order, for the evaluation cells below.
pred_ = [label for _, label in predictions]

# Write "<row id>,<predicted class id>" lines inside the model directory.
# The path is joined with a separator: plain string concatenation wrote the
# file next to the model directory instead of into it.
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, 'output_test.csv'), 'w') as out_file:
    for row_id, label in predictions:
        out_file.write(str(row_id) + "," + str(label) + '\n')

## Results:

In [None]:
# Human-readable report on the test set.
print(classification_report(test_labs, pred_, digits=4))

# The same report as a dictionary, dumped as JSON next to the model.
cr = classification_report(test_labs, pred_, digits=4, output_dict=True)
with open(os.path.join(model_dir, "cr_report.csv"), 'w') as f:
  json.dump(cr, f)

# Micro-averaged F1 over all the test examples.
f1_m = f1_score(test_labs, pred_, average='micro')
print(f'f1_micro = {f1_m}')

In [None]:
# Confusion matrix of the test predictions (first argument: gold labels,
# second argument: predictions).
cm = confusion_matrix(test_labs, pred_)
print(cm)