# NLBSE2022 Tool Competition Submission

For the NLBSE2022 Tool Competition, our team from the *University of Bari* built two classifiers. This is the second one: a *Multi-layer Perceptron* that takes RoBERTa embeddings concatenated with the one-hot-encoded *issue-author association*, a non-textual feature.

## Set up persistent storage

In [None]:
from google.colab import drive
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Path of the Drive root inside the mount.
# NOTE(review): no cell visible in this notebook reads prefix_dir — confirm it is still needed.
prefix_dir = '/content/drive/MyDrive/'

## Check GPU

In [None]:
# Shell out to nvidia-smi to show which GPU (if any) is attached to this runtime.
!nvidia-smi

## Install dependencies

In [None]:
!pip install ekphrasis
!pip install transformers
!pip install pandas
!pip install sklearn
!pip install numpy
!pip install keras
!pip install torch
!pip install transformers

## Set up preprocessing

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re

# Shared ekphrasis pre-processor used by clean_text() to normalise, annotate
# and tokenise every issue text.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' appears twice in this list; whether the second
    # entry has any effect depends on ekphrasis internals — confirm before
    # removing it.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

# Markdown patterns replaced by placeholder tags in clean_text().
# Raw strings are used so that sequences such as "\[" and "\s" reach the regex
# engine untouched instead of relying on Python tolerating invalid string
# escapes (which newer interpreters warn about). The patterns themselves are
# unchanged.
img_1 = re.compile(r'!\[(.*)\]\(.*\)')    # inline image:  ![alt](target)
link_1 = re.compile(r'\[(.*)\]\(.*\)')    # inline link:   [text](target)
link_2 = re.compile(r'\[(.*)\]: [^\s]+')  # reference link definition: [text]: target
# Inline `code` or a ``` fenced ``` block.
# NOTE(review): "(:?" is a capturing group that starts with an optional colon;
# a non-capturing "(?:" was probably intended. Left as-is so the text fed to
# the already fine-tuned encoder does not change.
code_1 = re.compile(r'(:?`[^`]+`|```[^`]*```)')

def preprocess(row):
    """Build the cleaned text of a single issue.

    The title, the body and the author association (wrapped in angle
    brackets) are joined with single spaces, and the resulting string is
    handed to ``clean_text``.

    :param row: Record exposing ``issue_title``, ``issue_body`` and
        ``issue_author_association`` attributes.
    :return: The value returned by ``clean_text`` for the joined string.
    """
    pieces = (
        str(row.issue_title),
        str(row.issue_body),
        "<" + str(row.issue_author_association) + ">",
    )
    return clean_text(" ".join(pieces))

def clean_text(text):
    """Replace markdown constructs with tags, then run the ekphrasis pipeline.

    Images, inline links, reference links and code spans are substituted (in
    that order) using the module-level patterns; the result is tokenised by
    ``text_processor`` and the tokens are re-joined with single spaces.

    :param text: Raw text to clean.
    :return: The processed tokens joined into one string.
    """
    substitutions = (
        (img_1, r'\1 <img>'),
        (link_1, r'\1 <url>'),
        (link_2, r'\1 <url>'),
        (code_1, '<code>'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)


In [None]:
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
def get_data(df):
    """Collect cleaned texts, labels and row identifiers from a DataFrame.

    :param df: DataFrame whose rows provide the fields read by ``preprocess``
        plus an ``issue_label`` column.
    :return: Three lists of equal length: the preprocessed texts, the
        ``issue_label`` values, and the first field of each ``itertuples()``
        record (the DataFrame index with the default arguments).
    """
    progress = tqdm(df.itertuples(), desc="Getting data...", total=len(df))
    records = [(preprocess(rec), rec.issue_label, rec[0]) for rec in progress]
    text = [cleaned for cleaned, _, _ in records]
    labels = [label for _, label, _ in records]
    ids = [row_id for _, _, row_id in records]
    return text, labels, ids

def get_labels(df):
    """Return the ``issue_label`` of every row of ``df`` as a list.

    :param df: DataFrame with an ``issue_label`` column.
    :return: List of labels in row order.
    """
    progress = tqdm(df.itertuples(), desc="Getting data...", total=len(df))
    return [record.issue_label for record in progress]

def encode_text(corpus, tokenizer, max_length=128):
    """Tokenise every document and pad the id sequences to a common length.

    :param corpus: Iterable of strings to encode.
    :param tokenizer: Tokenizer exposing ``encode`` and ``pad_token_id``.
    :param max_length: Number of token ids kept per document, special tokens
        included. Defaults to 128 (an earlier comment in this notebook
        mentions 512 as the original value).
    :return: The array produced by ``pad_sequences``: one row of
        ``max_length`` ids per document, padded with
        ``tokenizer.pad_token_id``.
    """
    input_ids = [
        tokenizer.encode(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            # Ask for truncation explicitly instead of relying on the
            # tokenizer's implicit fallback when only max_length is given.
            truncation=True,
        )
        for sent in tqdm(corpus)
    ]

    # NOTE(review): padding="pre" puts the padding at the START of each row.
    # Confirm this matches how the encoder was fine-tuned, since the pooled
    # output is extracted from these rows later on.
    return pad_sequences(input_ids, maxlen=max_length, dtype="long",
                         value=tokenizer.pad_token_id,
                         truncating="pre", padding="pre")

def create_attention_masks(input_ids, pad_token_id=0):
    """Build one attention mask per row of token ids.

    A position gets mask value 0 when its token id equals ``pad_token_id``
    and 1 otherwise.

    :param input_ids: Iterable of token-id sequences.
    :param pad_token_id: Id that marks padding. Defaults to 0, which keeps the
        historical behaviour of this function for non-negative ids.
    :return: List of lists of 0/1 integers with the same layout as
        ``input_ids``.

    NOTE(review): ``encode_text`` pads with ``tokenizer.pad_token_id``. If
    that id is not 0 for the tokenizer in use (RoBERTa vocabularies usually
    reserve 0 for ``<s>`` and 1 for ``<pad>`` — verify), callers should pass
    ``pad_token_id=tokenizer.pad_token_id``; otherwise padding is attended to
    and the id-0 token is masked out.
    """
    return [
        [int(token_id != pad_token_id) for token_id in sent]
        for sent in tqdm(input_ids)
    ]

## Get the data

In [None]:
import os
import pandas as pd

# Download and unpack the training archive only when the CSV is not already
# present in the working directory.
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

# Load the training issues into a DataFrame.
trainset = pd.read_csv("github-labels-top3-803k-train.csv")

In [None]:
# Download and unpack the test archive only when the CSV is not already
# present in the working directory.
if not os.path.isfile("github-labels-top3-803k-test.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

# Load the test issues into a DataFrame.
testset = pd.read_csv("github-labels-top3-803k-test.csv")

## Load the model

In [None]:
# Clone the repository that the model directory used below is read from.
!git clone https://github.com/collab-uniba/Issue-Report-Classification-Using-RoBERTa

In [None]:
# Location handed to from_pretrained() for both the encoder and the tokenizer.
model_name = '/content/Issue-Report-Classification-Using-RoBERTa/RoBERTa finetuned - NLBSE2022 - archive/'  # can be a dir

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer
import torch


config_class = AutoConfig
model_class = AutoModel
tokenizer_class = AutoTokenizer

# Load the encoder from `model_name`; hidden states are requested in the
# output and size mismatches with the checkpoint are tolerated.
bert_model = model_class.from_pretrained(
    model_name,
    output_hidden_states=True,
    ignore_mismatched_sizes=True,
)

# Use the GPU when one is available and fall back to the CPU otherwise, the
# same way the training and evaluation cells choose their device. Hard-coding
# "cuda" makes the .to(device) call fail on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
print(device)

# Tokenizer stored alongside the model.
tokenizer = tokenizer_class.from_pretrained(model_name)

## Preprocess data

In [None]:
# Clean the training issues, turn them into padded token-id rows and derive
# the attention masks from those rows.
data, labels, _ = get_data(trainset)
inputs = encode_text(data, tokenizer)
# NOTE(review): create_attention_masks() treats id 0 as padding, while
# encode_text() pads with tokenizer.pad_token_id — confirm the two agree for
# this tokenizer.
masks = create_attention_masks(inputs)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the training labels and replace them with their
# integer codes; `lenc` is reused later to encode the test labels.
lenc = LabelEncoder()
labels = lenc.fit_transform(labels)

In [None]:
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset

# Number of documents sent through the encoder per forward pass.
batch_size = 32

# Convert ids, labels and masks to tensors.
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)
masks = torch.tensor(masks)

# Create the DataLoader for our dataset
# Only ids and masks are batched. The sampler is sequential, so batches follow
# the row order, which extract_embeddings() relies on to name its output files.
data = TensorDataset(inputs, masks)
sampler = SequentialSampler(data)
dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size, num_workers=2)

## Function to extract the embeddings given directory and dataloader

In [None]:
from tqdm import tqdm
import os

def extract_embeddings(embed_dir, dataloader):
    """Run the encoder over ``dataloader`` and save one embedding per example.

    Uses the module-level ``bert_model`` and ``device``. The pooled output of
    example number ``k`` (counting in iteration order) is written to
    ``<embed_dir>/<k>.pt``.

    :param embed_dir: Existing directory that receives the ``.pt`` files.
    :param dataloader: Iterable yielding ``(input_ids, attention_mask)``
        batches of tensors.
    """
    next_index = 0
    for batch in tqdm(dataloader):
        b_input_ids, b_input_mask = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = bert_model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask)

        # Bring the whole batch to the CPU once instead of once per row.
        pooled = outputs["pooler_output"].cpu()
        for offset, row in enumerate(pooled):
            # clone() gives the row its own storage. Saving a view would
            # serialise the storage of the entire batch into every file.
            embedding = torch.squeeze(row).clone()
            torch.save(embedding, os.path.join(embed_dir, str(next_index + offset) + '.pt'))

        # Advance by the real batch size. This also stays well-defined when a
        # batch is empty, unlike reusing the inner loop variable.
        next_index += pooled.shape[0]

## Function to get the one hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

def get_one_hot(column, enc=None):
    """One-hot encode a single categorical column.

    :param column: Sequence of category values; it is reshaped to one column.
    :param enc: Encoder to reuse. When ``None`` a new ``OneHotEncoder`` that
        ignores unknown categories is fitted on ``column``.
    :return: ``(encoded_array, encoder)`` when a new encoder was fitted,
        otherwise only the encoded array.
    """
    as_column = np.reshape(column, (-1, 1))
    if enc is not None:
        return enc.transform(as_column).toarray()

    enc = OneHotEncoder(handle_unknown='ignore')
    encoded = enc.fit_transform(as_column).toarray()
    return encoded, enc

## Define model architecture

In [None]:
import torch
import torch.nn as nn

# Input layout: the encoder embedding followed by the one-hot vector.
bert_output_size = 768
# NOTE(review): assumes the one-hot author-association vector has 6 columns —
# confirm against the fitted encoder.
one_hot_size = 6
input_size = bert_output_size + one_hot_size
num_classes = 3

# Widths of the first (hidden_size_2) and second (hidden_size_1) hidden layers.
hidden_size_2 = 256
hidden_size_1 = 128


class MLP(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output."""

    def __init__(self, input_size, num_classes):
        """
        Build the layers.
        :param input_size: Number of input features.
        :param num_classes: Number of output scores.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size_2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size_2, hidden_size_1)
        self.fc3 = nn.Linear(hidden_size_1, num_classes)

    def forward(self, x):
        """
        Compute the un-normalised class scores for ``x``.
        :param x: Input tensor whose last dimension is ``input_size``.
        :return: Tensor whose last dimension is ``num_classes``.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

## Define dataset class

In [None]:
import os
import torch

class EmbedDataset(torch.utils.data.Dataset):
    """Dataset pairing saved embeddings with one-hot features and labels.

    Item ``i`` is built from the file ``<embed_path>/<i>.pt``, row ``i`` of
    ``issue_author_association`` and ``labels[i]``.
    """

    def __init__(self, embed_path, issue_author_association, labels):
        """
        :param embed_path: Directory containing one ``<i>.pt`` file per example.
        :param issue_author_association: Already one-hot encoded rows (NumPy),
            one per example.
        :param labels: Indexable collection of labels, one per example.
        :raises ValueError: If the number of directory entries or of
            association rows differs from the number of labels.
        """
        self.embed_path = embed_path
        self.issue_author_association = issue_author_association  # already one hot encoded
        self.labels = labels

        # Explicit checks instead of `assert`, which is stripped under -O.
        n_files = len(os.listdir(self.embed_path))
        if n_files != len(labels):
            raise ValueError(
                "Found %d entries in %r but %d labels" % (n_files, embed_path, len(labels))
            )
        if len(issue_author_association) != len(labels):
            raise ValueError(
                "Got %d issue_author_association rows but %d labels"
                % (len(issue_author_association), len(labels))
            )

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, i):
        """
        Load example ``i``.
        :return: ``(features, label)`` where ``features`` is the saved
            embedding concatenated with the one-hot association row.
        """
        label = self.labels[i]
        association = torch.from_numpy(self.issue_author_association[i])
        embed = torch.load(os.path.join(self.embed_path, str(i) + '.pt'))
        return torch.cat((embed, association)), label

    def get_targets(self):
        """Return the labels collection passed at construction."""
        return self.labels

## Define early stopping class with callback to get the model with best performance after stopping

In [None]:
import torch
import numpy as np


class EarlyStopping:
    """Track the validation loss, checkpoint the best model and signal when to stop."""

    def __init__(self, model, chkpt_path, patience=5, delta=1e-4):
        """
        Create the early-stopping tracker.
        :param model: Model whose ``state_dict`` is checkpointed on improvement.
        :param chkpt_path: File the best state dictionary is written to.
        :param patience: Consecutive non-improving calls tolerated before stopping.
        :param delta: Minimum decrease of the loss that counts as an improvement.
        :raises ValueError: If ``patience`` or ``delta`` is not positive.
        """
        for label, value in (("patience", patience), ("delta", delta)):
            if value <= 0:
                raise ValueError("The %s value must be positive" % label)
        self.model = model
        self.chkpt_path = chkpt_path
        self.patience = patience
        self.delta = delta
        self.best_loss = np.inf
        self.counter = 0

    @property
    def should_stop(self):
        """
        Whether the number of consecutive non-improving calls has reached ``patience``.
        """
        return self.patience <= self.counter

    def get_best_state(self):
        """
        Read back the state dictionary stored in the checkpoint file.
        :return: The object loaded from ``chkpt_path``.
        """
        with open(self.chkpt_path, 'rb') as checkpoint:
            return torch.load(checkpoint)

    def __call__(self, loss):
        """
        Record a new validation loss.
        :param loss: The validation loss measured.
        """
        improved = loss < self.best_loss - self.delta
        if not improved:
            # No sufficient improvement: one more strike.
            self.counter += 1
            return

        self.best_loss = loss
        self.counter = 0

        # Persist the parameters that produced the new best loss.
        with open(self.chkpt_path, 'wb') as checkpoint:
            torch.save(self.model.state_dict(), checkpoint)


class RunningAverageMetric:
    """Running (batched) average metric."""

    def __init__(self, batch_size):
        """
        Initialize a running average metric object.
        :param batch_size: The nominal batch size, used as the sample count of
            every accumulation that does not state its own.
        """
        self.batch_size = batch_size
        self.metric_accumulator = 0.0
        self.n_metrics = 0
        self.n_samples = 0

    def __call__(self, x, n_samples=None):
        """
        Accumulate a metric.
        :param x: The metric value, summed over the samples of one batch.
        :param n_samples: Number of samples ``x`` was summed over. When omitted
            the batch is assumed to be full (``batch_size`` samples), which is
            the historical behaviour. Pass the real count for a partial last
            batch so that ``average()`` is not biased.
        """
        self.metric_accumulator += x
        self.n_metrics += 1
        self.n_samples += self.batch_size if n_samples is None else n_samples

    def average(self):
        """
        Get the metric average.
        :return: The accumulated value divided by the accumulated sample count.
        """
        return self.metric_accumulator / self.n_samples


def get_optimizer(optimizer):
    """
    Map an optimizer name to its torch optimizer class.
    :param optimizer: One of 'sgd', 'adam' or 'rmsprop'.
    :return: The matching class from ``torch.optim``.
    :raises KeyError: If the name is not one of the supported ones.
    """
    optimizers_by_name = dict(
        sgd=torch.optim.SGD,
        adam=torch.optim.Adam,
        rmsprop=torch.optim.RMSprop,
    )
    return optimizers_by_name[optimizer]


## Define training function

In [None]:
import os
import time
import torch
import torchvision
import numpy as np
from tqdm import tqdm


def train_classifier(
        model,
        train_data,
        valid_data,
        lr=1e-5,
        optimizer='adam',
        batch_size=32,
        epochs=100,
        patience=5,
        steps_per_epoch=None,
        weight_decay=0,
        n_workers=2,
        device=None,
        verbose=True,
        chkpt_path='checkpoint.pt',
        random_state=None
):
    """
    Train a classifier with a class-weighted NLL loss and early stopping.

    After training, the parameters stored in the early-stopping checkpoint are
    loaded back into ``model``.

    :param model: The model to train; its outputs are passed through
        ``log_softmax`` over dimension 1.
    :param train_data: Training data. It must expose ``.dataset.get_targets()``
        (as a ``torch.utils.data.Subset`` of ``EmbedDataset`` does); the class
        weights are computed from those targets.
    :param valid_data: Validation data.
    :param lr: Learning rate.
    :param optimizer: Optimizer name understood by ``get_optimizer``.
    :param batch_size: Batch size for both loaders.
    :param epochs: Maximum number of epochs.
    :param patience: Early-stopping patience, in epochs.
    :param steps_per_epoch: Training batches per epoch; defaults to the length
        of the training loader.
    :param weight_decay: Weight decay given to the optimizer.
    :param n_workers: Worker processes per data loader.
    :param device: Device to train on; defaults to CUDA when available.
    :param verbose: Whether to show tqdm progress bars.
    :param chkpt_path: File used for the best-model checkpoint.
    :param random_state: Optional seed passed to ``torch.manual_seed`` before
        the loaders are created. ``None`` leaves the RNG untouched.
    :return: Dict with per-epoch 'loss' and 'accuracy' lists under the keys
        'train' and 'validation'.
    """
    if random_state is not None:
        torch.manual_seed(random_state)

    # Get the device to use
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Train using device: ' + str(device))

    # Setup the data loaders. Training drops the partial last batch, so every
    # training batch holds exactly `batch_size` samples.
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True, num_workers=n_workers, drop_last=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size, shuffle=False, num_workers=n_workers, drop_last=False
    )

    # Compute the class weights (due to dataset im-balance): the rarest class
    # gets weight 1, more frequent classes proportionally less.
    _, class_counts = np.unique(train_data.dataset.get_targets(), return_counts=True)
    class_weights = np.min(class_counts) / class_counts

    # Instantiate the NLL loss (with weights); losses are summed per batch and
    # normalised by the sample count below.
    criterion = torch.nn.NLLLoss(
        weight=torch.tensor(class_weights, dtype=torch.float32, device=device),
        reduction='sum'
    )

    # Move the model to device
    model.to(device)

    # Instantiate the optimizer requested by the caller (previously the name
    # was hard-coded to 'adam' and the argument ignored).
    optimizer_kwargs = dict()
    optimizer_class = get_optimizer(optimizer)
    if optimizer_class == torch.optim.SGD:
        # If using SGD, introduce Nesterov's momentum
        optimizer_kwargs['momentum'] = 0.9
        optimizer_kwargs['nesterov'] = True
    optim = optimizer_class(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr, weight_decay=weight_decay, **optimizer_kwargs
    )

    # Instantiate the early stopping callback
    early_stopping = EarlyStopping(model, chkpt_path, patience=patience)

    # Compute the steps per epoch, if needed
    if steps_per_epoch is None:
        steps_per_epoch = len(train_loader)

    history = {
        'train': {'loss': [], 'accuracy': []},
        'validation': {'loss': [], 'accuracy': []}
    }

    for epoch in range(epochs):
        start_time = time.time()

        # Initialize the tqdm train data loader, if verbose is enabled
        if verbose:
            tk_train = tqdm(
                train_loader, total=steps_per_epoch, leave=False,
                bar_format='{l_bar}{bar:32}{r_bar}', desc='Train Epoch %d/%d' % (epoch + 1, epochs)
            )
        else:
            tk_train = train_loader

        # Make sure the model is set to train mode
        model.train()

        # Training phase
        running_train_loss = RunningAverageMetric(train_loader.batch_size)
        running_train_hits = RunningAverageMetric(train_loader.batch_size)
        for i, (inputs, targets) in enumerate(tk_train):
            if i >= steps_per_epoch:
                break

            inputs, targets = inputs.to(device), targets.to(device)
            optim.zero_grad()
            outputs = torch.log_softmax(model(inputs.float()), dim=1)
            loss = criterion(outputs, targets)
            running_train_loss(loss.item())
            # Back-propagate the per-sample mean of the summed loss.
            (loss / train_loader.batch_size).backward()
            optim.step()
            with torch.no_grad():
                predictions = torch.argmax(outputs, dim=1)
                running_train_hits(torch.eq(predictions, targets).sum().item())

        # Close the tqdm train bar
        if verbose:
            tk_train.close()

        # Initialize the tqdm validation data loader, if verbose is specified
        if verbose:
            tk_val = tqdm(
                valid_loader, leave=False, bar_format='{l_bar}{bar:32}{r_bar}',
                desc='Validation Epoch %d/%d' % (epoch + 1, epochs)
            )
        else:
            tk_val = valid_loader

        # Make sure the model is set to evaluation mode
        model.eval()

        # Validation phase. The last validation batch may be partial
        # (drop_last=False), so the sums are divided by the real number of
        # samples rather than by n_batches * batch_size.
        val_loss_sum, val_hits, val_samples = 0.0, 0, 0
        with torch.no_grad():
            for inputs, targets in tk_val:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = torch.log_softmax(model(inputs.float()), dim=1)
                val_loss_sum += criterion(outputs, targets).item()
                predictions = torch.argmax(outputs, dim=1)
                val_hits += torch.eq(predictions, targets).sum().item()
                val_samples += targets.size(0)

        # Get the average train and validation losses and accuracies and print it
        end_time = time.time()
        train_loss = running_train_loss.average()
        train_accuracy = running_train_hits.average()
        val_loss = val_loss_sum / val_samples
        val_accuracy = val_hits / val_samples
        print('Epoch %d/%d - train_loss: %.4f, validation_loss: %.4f, train_acc: %.1f%%, validation_acc: %.1f%% [%ds]' %
              (epoch + 1, epochs, train_loss, val_loss, train_accuracy*100, val_accuracy*100, end_time - start_time))

        # Append losses and accuracies to history data
        history['train']['loss'].append(train_loss)
        history['train']['accuracy'].append(train_accuracy)
        history['validation']['loss'].append(val_loss)
        history['validation']['accuracy'].append(val_accuracy)

        # Check if training should stop according to early stopping
        early_stopping(val_loss)
        if early_stopping.should_stop:
            print('Early Stopping... Best Loss: %.4f' % early_stopping.best_loss)
            break

    # Reload the best parameters state
    model.load_state_dict(early_stopping.get_best_state())
    return history


## Extract training set embeddings

In [None]:
# Put the encoder in evaluation mode before extracting embeddings.
bert_model.eval()
# Directory that receives one .pt file per training example.
embed_dir = "Embed_Train"
os.makedirs(embed_dir, exist_ok=True)
# `dataloader` is the sequential loader built over the training set above.
extract_embeddings(embed_dir, dataloader)

## Take a part of the training set as validation set

In [None]:
import torch
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

# One-hot encode the author association of the training rows; `enc` is the
# fitted encoder, reused later for the test set.
ohe, enc = get_one_hot(list(trainset['issue_author_association']))
# Integer-encode the training labels with the already fitted label encoder.
labels = lenc.transform(get_labels(trainset))
labels = torch.tensor(labels)
# Dataset over the embeddings saved in `embed_dir`.
train_dataset = EmbedDataset(embed_dir, ohe, labels)

# Stratified 90/10 split of the example indices with a fixed seed; both
# subsets are views over the same underlying dataset.
train_indices, val_indices = train_test_split(list(range(len(train_dataset.get_targets()))), test_size=0.1, stratify=train_dataset.get_targets(), random_state=42)
split_train_dataset = torch.utils.data.Subset(train_dataset, train_indices)
val_dataset = torch.utils.data.Subset(train_dataset, val_indices)

## Train

In [None]:
# File that receives the best MLP parameters found during training.
state_filepath = 'mlp.pt'

# Seed torch's global RNG up front so that both the MLP initialisation and the
# shuffling of the training batches are repeatable. The seed is set here
# rather than passed as a `random_state` keyword in the call.
torch.manual_seed(42)

m = MLP(input_size, num_classes)
train_classifier(m, split_train_dataset, val_dataset, lr=1e-5, patience=5, chkpt_path=state_filepath, n_workers=4)

## Extract test set embeddings

In [None]:
# Repeat the preprocessing pipeline on the test set.
data, labels, _ = get_data(testset)

inputs = encode_text(data, tokenizer)
# NOTE(review): create_attention_masks() treats id 0 as padding, while
# encode_text() pads with tokenizer.pad_token_id — confirm the two agree.
masks = create_attention_masks(inputs)

# Encode the test labels with the encoder fitted on the training labels.
labels = lenc.transform(labels)
batch_size = 32

inputs = torch.tensor(inputs)
labels = torch.tensor(labels)
masks = torch.tensor(masks)

# Create the DataLoader for our dataset
# The sampler is sequential so that the saved embedding files follow row order.
data = TensorDataset(inputs, masks)
sampler = SequentialSampler(data)
dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size, num_workers=2)

In [None]:
# Directory that receives one .pt file per test example. `embed_dir` is
# rebound here, so later cells see the test directory.
embed_dir = 'Embed_Test'
os.makedirs(embed_dir, exist_ok=True)
# `dataloader` now refers to the sequential loader built over the test set.
extract_embeddings(embed_dir, dataloader)

## Load model and evaluate

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import json
# Dataset over the test embeddings, using the one-hot encoder fitted on the
# training set.
test_dataset = EmbedDataset(embed_dir, get_one_hot(list(testset['issue_author_association']), enc), labels)

# Get the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Test using device: ' + str(device))

# Instantiate the model and load the trained parameters. map_location lets a
# checkpoint written on a GPU runtime be restored on a CPU-only one.
m = MLP(input_size, num_classes)
m.load_state_dict(torch.load(state_filepath, map_location=device))

# Move the model to device
m.to(device)

# Make sure the model is set to evaluation mode
m.eval()
metrics_path = 'metrics'
os.makedirs(metrics_path, exist_ok=True)


# Make the predictions for testing the model, one example at a time
y_pred, y_true = [], []
with torch.no_grad():
    for x1, label in tqdm(test_dataset):
        # Add the batch dimension expected by the model.
        x1 = x1.unsqueeze(0).to(device)
        pred = torch.log_softmax(m(x1.float()), dim=1)
        y_pred.append(torch.argmax(pred, dim=1).item())
        # Store plain ints rather than 0-dim tensors so the metric functions
        # receive ordinary Python numbers.
        y_true.append(int(label))

# Obtain the classification report
report = classification_report(y_true, y_pred, output_dict=True)
cm = confusion_matrix(y_true, y_pred)
metrics = {
    'report': report,
    'confusion_matrix': cm.tolist()
}

# Store the metrics in a JSON file
with open(os.path.join(metrics_path, "MLP-metrics.json"), 'w') as file:
    json.dump(metrics, file, indent=4)


## Visualize metrics

In [None]:
# Print the per-class report for the predictions collected above.
print(classification_report(y_true, y_pred))