In [None]:
!pip install nltk
!pip install transformers

# Connect to the Dataset

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read the dataset and write
# checkpoints under /content/drive/MyDrive.
drive.mount('/content/drive')

## Check for hardware and versions

In [None]:
from numba.cuda.cudadrv import enums
from numba import cuda

import transformers

# Optional GPU diagnostic, left disabled: would print every CUDA device attribute
# listed in numba's `enums` module for the current device.
# device = cuda.get_current_device()
# attribs= [name.replace("CU_DEVICE_ATTRIBUTE_", "") for name in dir(enums) if name.startswith("CU_DEVICE_ATTRIBUTE_")]
# for attr in attribs:
#     print(attr, '=', getattr(device, attr))

# Report the installed transformers version.
print(f'transformers version: {transformers.__version__}')


In [None]:
import torch

# Choose the compute device once; later cells move the model and every batch to `device`.
_gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if _gpu_ready else "cpu")

if not _gpu_ready:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

# Setup

In [None]:
import pandas as pd

# Seed reused by the sampling, splitting, shuffling and training cells below.
random_state = 42

# Names of the dataset columns this notebook works with.
OVERALL_RATING = 'overall_rating'
REVIEW_TEXT = 'review_text'

# Load the reviews CSV from the mounted Drive, parsing the submission date column.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/B2W-Reviews01.csv',
    parse_dates=['submission_date'],
    low_memory=False,
)
print(df.dtypes)
print(f'Data size: {len(df)}')

## Pre Processing of Data

### Select the samples

In [None]:
# Number of reviews drawn for this run (alternative kept for reference: 15000).
SAMPLE_SIZE = 3000

# Build the working sample in a single chain:
#   1. drop rows without review text,
#   2. drop 3-star reviews, which are treated as neutral,
#   3. draw SAMPLE_SIZE rows with a fixed seed.
samples = (
    df.dropna(subset=[REVIEW_TEXT])
      .loc[lambda frame: frame[OVERALL_RATING] != 3]
      .sample(n=SAMPLE_SIZE, random_state=random_state)
)

In [None]:
# Sanity check: the sample was drawn with n=SAMPLE_SIZE, so this should print that number.
print(f'samples size is {len(samples)}')

## Prepare Training and Evaluation

### Create X and Y values

In [None]:
# Features: the raw review texts as a NumPy array.
X = samples[REVIEW_TEXT].values

# Labels: 1 and 2 stars become 0 (negative), 4 and 5 stars become 1 (positive).
Y = samples[OVERALL_RATING].map({1: 0, 2: 0, 4: 1, 5: 1}).values

print('X values:', X, sep='\n')
print('\nY values:', Y, sep='\n')

### Separate training and test

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the samples held out for testing.
TEST_PERCENTAGE = 0.2

# Hold out TEST_PERCENTAGE of the samples; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_PERCENTAGE,
    random_state=random_state,
)

### Balance the training data

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle


# Only the training split is balanced; the test split is left untouched.
print(f'X_train length: {len(X_train)}')
print(f'Y_train length: {len(Y_train)}')
print(f'Y_train class counts: {pd.Series(Y_train).value_counts().to_dict()}')

# RandomUnderSampler works on a 2-D feature matrix, so each text becomes a one-column row.
reshaped_X_train = X_train.reshape(-1, 1)

# Use the notebook-wide seed instead of a second hard-coded 42.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=random_state)
X_train_resampled, Y_train_resampled = undersampler.fit_resample(reshaped_X_train, Y_train)

# Back to a 1-D array of texts.
X_train_resampled = X_train_resampled.flatten()

# Shuffle texts and labels in ONE call so both receive the same permutation by
# construction, rather than relying on two separate calls happening to share a seed.
X_train_resampled, Y_train_resampled = shuffle(
    X_train_resampled,
    Y_train_resampled,
    random_state=random_state,
)
assert len(X_train_resampled) == len(Y_train_resampled), 'texts and labels out of sync'

# Show sizes and a short preview only; dumping whole arrays floods the cell output.
print(f'\n\nX_train_resampled length: {len(X_train_resampled)}')
print(X_train_resampled[:3])

print(f'\n\nY_train_resampled length: {len(Y_train_resampled)}')
print(f'Class counts after balancing: {pd.Series(Y_train_resampled).value_counts().to_dict()}')

print(f'\n\nX_test length: {len(X_test)}')
print(X_test[:3])

print(f'\n\nY_test length: {len(Y_test)}')
print(f'Y_test class counts: {pd.Series(Y_test).value_counts().to_dict()}')

## Vectorize data

In [None]:
from transformers import AutoTokenizer, BertTokenizer

# The checkpoint is a *cased* Portuguese model: keep the original casing (and with it the
# accents) so the text matches the vocabulary the model was pre-trained on.
# Lower-casing here would feed the model tokens it never saw during pre-training.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)


## Create train inputs

In [None]:
import transformers
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm


def preprocessing_for_bert(data, max_len=None):
    """Tokenize texts for BERT, padding/truncating every sequence to a fixed length.

    Uses the notebook-level `tokenizer`.

    @param    data (iterable of str): texts to encode.
    @param    max_len (int, optional): target sequence length, special tokens included.
                  Defaults to the notebook-level `MAX_LEN`, looked up at call time.
    @return   input_ids (torch.Tensor): token ids with shape (len(data), max_len)
    @return   attention_masks (torch.Tensor): attention masks with shape (len(data), max_len)
    """
    if max_len is None:
        max_len = MAX_LEN

    # Collect the per-sentence encodings before converting them to tensors
    input_ids = []
    attention_masks = []
    for sent in tqdm(data):
        encoded_sent = tokenizer(
            text=sent,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,             # Length to truncate/pad to
            padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`
            truncation=True,                # Truncate explicitly instead of relying on a fallback
            return_attention_mask=True
        )
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Convert lists to tensors; the reshape keeps a (0, max_len) shape for empty input
    input_ids = torch.tensor(input_ids, dtype=torch.long).reshape(-1, max_len)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long).reshape(-1, max_len)
    return input_ids, attention_masks


# Sequence length every review is padded/truncated to.
# NOTE(review): presumably chosen to stay below BERT's input limit - confirm.
MAX_LEN = 510  # As that is the max length that is generally encoded.

# Smoke test: encode the first review on its own and show its token ids
_first_ids = preprocessing_for_bert([X[0]])[0]
token_ids = list(_first_ids.squeeze().numpy())
print('Original: ', X[0] )
print('Token IDs: ', token_ids)

# Encode the balanced training texts and the test texts
print('Tokenizing Data')
train_inputs, train_masks = preprocessing_for_bert(X_train_resampled)
test_inputs, test_masks = preprocessing_for_bert(X_test)

# Creating PyTorch Dataloader

A `DataLoader` serves the data in mini-batches, so only one batch at a time has to be moved to the GPU. This keeps memory usage low and makes training more efficient.

In [None]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


def _build_loader(inputs, masks, labels, sampler_cls):
    """Wrap three aligned tensors in a TensorDataset and a DataLoader driven by `sampler_cls`.

    Returns the dataset, the sampler and the loader so each can be bound to its own name.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Labels as tensors, alongside the already-tokenized inputs and masks
train_labels = torch.tensor(np.array(Y_train_resampled))
test_labels = torch.tensor(np.array(Y_test))

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size =  16

# Training batches are drawn in random order
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler
)

# Test batches keep the original order, so predictions line up with Y_test
test_data, test_sampler, test_dataloader = _build_loader(
    test_inputs, test_masks, test_labels, SequentialSampler
)

print(len(train_inputs))

# Training the model

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, BertModel

class BertClassifier(nn.Module):
    """
    BERT encoder followed by a small feed-forward head for binary classification.
    """
    def __init__(self, freeze_bert=False):
        """
        Load the pre-trained encoder and build the classification head.

        @param    freeze_bert (bool): Set `True` to freeze the encoder weights and train
                      only the head; leave `False` to fine-tune the whole BERT model.
        """
        super().__init__()

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

        # Input width of the head comes from the loaded encoder instead of a hard-coded
        # 768, so the head still fits if the checkpoint above is swapped for another size.
        D_in = self.bert.config.hidden_size
        # Hidden size of our classifier and number of labels
        H, D_out = 50, 2

        # Feed-forward head: Linear -> ReLU -> Dropout -> Linear
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False


    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` (position 0) for classification
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

# Optimizer

The BERT authors recommend the following hyper-parameters for fine-tuning:

* Batch size: 16 or 32
* Learning rate (Adam): 5e-5, 3e-5 or 2e-5
* Number of epochs: 2, 3 or 4

Hugging Face provides the `run_glue.py` script as an example of how to use the transformers library. That script uses the AdamW optimizer.

In [None]:
# `transformers.AdamW` is deprecated in favour of PyTorch's own implementation.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    Reads the notebook-level `device` and `train_dataloader`.

    @param    epochs (int): number of epochs the scheduler is laid out for. It must match
                  the number of epochs actually trained: the linear schedule reaches a
                  learning rate of 0 after `len(train_dataloader) * epochs` steps.
    @return   (bert_classifier, optimizer, scheduler)
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Move the model to the selected device (GPU when available)
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(
        bert_classifier.parameters(),
        lr=3e-5,            # One of the recommended fine-tuning learning rates
        eps=1e-8,
        weight_decay=0.0    # torch defaults to 0.01; 0.0 keeps the previous no-decay behaviour
    )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler: short linear warm-up, then linear decay to 0
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=10,
        num_training_steps=total_steps
    )
    return bert_classifier, optimizer, scheduler

# Train

In [None]:
import os
import random
import time

# Loss shared by train() and evaluate(): cross-entropy over the classifier logits
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with one value.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """
    Train the BertClassifier model, saving a checkpoint after every epoch.

    Relies on notebook-level names that must exist before the call: `optimizer`,
    `scheduler`, `device`, `loss_fn` and, when `evaluation` is set, `evaluate`.

    If a checkpoint file already exists at `checkpoint_path`, the model and optimizer
    state (and the scheduler state, when the checkpoint contains it) are restored and
    training resumes from the stored epoch. Delete the file to train from scratch.

    @param    model (nn.Module): the classifier to train; called as model(input_ids, attention_mask).
    @param    train_dataloader: yields (input_ids, attention_mask, labels) batches.
    @param    val_dataloader: batches for per-epoch evaluation; required when `evaluation` is True.
    @param    epochs (int): index of the last epoch to run (1-based).
    @param    evaluation (bool): evaluate on `val_dataloader` after every epoch.
    @raise    ValueError: if `evaluation` is True but no `val_dataloader` was given.
    """
    # Fail before training starts rather than after the first full epoch
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    start_epoch = 0
    checkpoint_path = "/content/drive/MyDrive/Datasets/bert_checkpoint.pth"

    # Check if there is a saved checkpoint
    if os.path.isfile(checkpoint_path):
        # map_location lets a checkpoint written on GPU be resumed on a CPU runtime
        checkpoint = torch.load(checkpoint_path, map_location=device)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # Checkpoints written before the scheduler was saved do not have this key
        if 'scheduler_state_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print(f'Loaded checkpoint from epoch {start_epoch}\n')
        if start_epoch >= epochs:
            print(f'Checkpoint already covers {epochs} epoch(s); nothing left to train.\n')

    steps_per_epoch = len(train_dataloader)

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(start_epoch, epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-" * 70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == steps_per_epoch - 1):
                # Calculate time elapsed since the last report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        # (max(..., 1) avoids a ZeroDivisionError on an empty dataloader)
        avg_train_loss = total_loss / max(steps_per_epoch, 1)

        print("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Time taken by the whole epoch, evaluation included
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

        # Save everything needed to resume; the scheduler state keeps the learning-rate
        # schedule where it left off instead of restarting the warm-up
        torch.save({
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }, checkpoint_path)

    print("Training complete!")


def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance on our validation set.

    Loss and accuracy are averaged over examples rather than over batches, so a smaller
    final batch does not get the same weight as a full one.

    Relies on the notebook-level `device` and `loss_fn`; the batch loss is scaled by the
    batch size on the assumption that `loss_fn` returns a per-batch mean.

    @param    model (nn.Module): called as model(input_ids, attention_mask), returns logits.
    @param    val_dataloader: yields (input_ids, attention_mask, labels) batches.
    @return   (val_loss, val_accuracy): mean loss per example and accuracy in percent;
                  both are NaN when the dataloader yields no examples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over all examples seen
    loss_sum = 0.0
    correct = 0
    seen = 0

    # No gradients are needed anywhere in evaluation
    with torch.no_grad():
        # For each batch in our validation set...
        for batch in val_dataloader:
            # Load batch to the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Compute logits
            logits = model(b_input_ids, b_attn_mask)

            n_examples = b_labels.size(0)

            # Turn the batch-mean loss back into a sum over the batch's examples
            loss_sum += loss_fn(logits, b_labels).item() * n_examples

            # Get the predictions and count the correct ones
            preds = torch.argmax(logits, dim=1).flatten()
            correct += (preds == b_labels).sum().item()
            seen += n_examples

    if seen == 0:
        return float('nan'), float('nan')

    # Compute the average loss and accuracy over the validation set.
    val_loss = loss_sum / seen
    val_accuracy = 100.0 * correct / seen

    return val_loss, val_accuracy

In [None]:
import time

# One epoch count for BOTH the scheduler and the training loop. If the scheduler is built
# for fewer epochs than are trained, its linear decay reaches a learning rate of 0 early
# and the remaining epochs no longer update the model.
NUM_EPOCHS = 4

set_seed(random_state)    # Set seed for reproducibility
t1 = time.time()
bert_classifier, optimizer, scheduler = initialize_model(epochs=NUM_EPOCHS)
train(bert_classifier, train_dataloader, test_dataloader, epochs=NUM_EPOCHS, evaluation=True)
t2 = time.time()
print(f'\nTook: {t2 - t1} seconds to run')

# Release cached GPU memory held by the allocator
torch.cuda.empty_cache()

# Evaluation on the test set

In [None]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run the trained model over a dataloader and return class probabilities.

    Only the first two tensors of every batch (input ids and attention mask) are fed to
    the model; the result is the row-wise softmax of the logits as a NumPy array.
    """
    # Evaluation mode: dropout layers are disabled
    model.eval()

    collected = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move the batch to the selected device; anything after the mask is ignored
            token_ids, mask, *_ = (t.to(device) for t in batch)
            collected.append(model(token_ids, mask))

    # Stack the per-batch logits, then turn them into probabilities
    stacked = torch.cat(collected, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn

def evaluate_roc(probs, y_true):
    """
    - Print AUC, accuracy, macro recall and macro F1 on the test set
    - Plot the ROC curve and the confusion matrix
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    # Probability of the positive class (column 1)
    preds = probs[:, 1]
    fpr, tpr, _ = roc_curve(y_true, preds)   # thresholds are not needed here
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')

    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy of BERT model: {accuracy*100:.2f}%')
    recall = recall_score(y_true, y_pred, average='macro')
    print(f'Recall of BERT model: {recall}')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f'F1 Score of BERT model: {f1}')


    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    # Plot Confusion Matrix. `labels=[0, 1]` forces a 2x2 matrix even if one class is
    # absent, so it always matches the two tick labels passed below.
    data = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plot_confusion_matrix(data, labels=['negative', 'positive'])


def plot_confusion_matrix(data, labels, output_filename=None):
    """Plot confusion matrix using heatmap.
    Args:
        data (list of list): List of lists with confusion matrix data (integer counts).
        labels (list): Labels which will be plotted across x and y axis.
        output_filename (str, optional): Path to output file. When given, the figure is
            also saved there before being shown; by default nothing is written.
    """
    # Note: this changes seaborn/matplotlib settings for the rest of the session.
    seaborn.set(font_scale=1.4, color_codes=True)
    plt.figure(figsize=(9, 6))

    plt.title("Confusion Matrix")

    # fmt='d' prints the cell counts as integers
    ax = seaborn.heatmap(data, annot=True, fmt='d', cmap="YlGnBu", cbar_kws={'label': 'Scale'})

    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    ax.set(ylabel="True Label", xlabel="Predicted Label")

    # Save before show(): the figure may be closed once it has been displayed
    if output_filename is not None:
        plt.savefig(output_filename, bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# Compute predicted probabilities on the test set
# (bert_predict returns the softmax of the logits: one row per example, one column per class)
t1 = time.time()
probs = bert_predict(bert_classifier, test_dataloader)
t2 = time.time()
print(f'\nTook: {t2 - t1} seconds to run')

In [None]:
# Evaluate the Bert classifier
t1 = time.time()
evaluate_roc(probs, Y_test)
t2 = time.time()
print(f'\nTook: {t2 - t1} seconds to run')

In [None]:
# Persist only the fine-tuned weights (the state_dict, not the whole module) to Drive.
torch.save(bert_classifier.state_dict(), "/content/drive/MyDrive/Datasets/bert_news_sentiment.pth")