<a href="https://colab.research.google.com/github/danielkorat/BERT-ABSA/blob/master/Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup environment

## Select backend and pre-trained model

In [1]:
# Deep-learning framework used by every later cell: 'tf' (TensorFlow) or 'pt' (PyTorch).
BACKEND = 'tf' # or 'pt'
# Later cells test `BACKEND == 'tf'` and `BACKEND == 'pt'` independently, so a
# typo would silently skip both branches and only fail much later with a
# NameError. Fail fast here instead.
if BACKEND not in ('tf', 'pt'):
    raise ValueError(f"BACKEND must be 'tf' or 'pt', got {BACKEND!r}")

# Name of the pre-trained checkpoint passed to `from_pretrained` for both the
# tokenizer and the model.
MODEL_NAME = 'distilbert-base-uncased'

# Switches for the fine-tuning and inference cells at the end of the notebook.
DO_TRAIN = True
DO_PREDICT = True

# When True, the datasets are shrunk and the number of training steps is capped
# so the whole notebook finishes quickly.
SMOKE_TEST = True

## Install dependencies

In [None]:
# `sys.executable` is the interpreter running this kernel; invoking pip through
# it installs the packages into the kernel's own environment.
from sys import executable as python
# Library versions are pinned so the notebook keeps working as released.
!{python} -m pip install -q -q transformers==4.4.2

# Install only the framework selected by BACKEND in the configuration cell.
if BACKEND == 'tf':
    !{python} -m pip install -q -q tensorflow==2.4.1
if BACKEND == 'pt':
    !{python} -m pip install -q -q torch==1.8.0

In [None]:
from pathlib import Path

# The backend-specific classes are imported under common aliases
# (`model_cls`, `trainer_cls`, `training_args_cls`) so the cells below can be
# written once and work with either framework.
if BACKEND == 'tf':
    import tensorflow as tf
    from tensorflow.data import Dataset
    # Only show TensorFlow log messages at ERROR level.
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    from transformers import TFAutoModelForSequenceClassification as model_cls
    from transformers import TFTrainer as trainer_cls
    from transformers import TFTrainingArguments as training_args_cls

if BACKEND == 'pt':
    import torch

    from transformers import AutoModelForSequenceClassification as model_cls
    from transformers import Trainer as trainer_cls
    from transformers import TrainingArguments as training_args_cls

import transformers
# Raise the transformers library's log level to INFO.
transformers.logging.set_verbosity_info()

# Download dataset

In [None]:
! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xf aclImdb_v1.tar.gz

# Read dataset

In [None]:
def read_imdb_split(split_dir):
    """Read one split of the IMDb dataset from disk.

    Args:
        split_dir: Path (str or Path) of a directory that contains a ``pos``
            and a ``neg`` sub-directory, each holding one text file per review.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts`` holds the
        file contents and ``labels`` holds 1 for files under ``pos`` and 0 for
        files under ``neg``. All ``pos`` entries come before all ``neg``
        entries; within a label, files are ordered by path.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` tests object identity and only happens to work
        # for interned string literals.
        label = 0 if label_dir == "neg" else 1
        # `iterdir()` yields entries in arbitrary order; sorting makes the
        # result (and any subset taken from it) reproducible across runs.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default encoding.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)
    return texts, labels

train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')


def _take_evenly_spaced(texts, labels, size):
    """Return at most `size` (text, label) pairs spread evenly over the inputs.

    `texts` and `labels` are parallel lists; the same indices are taken from
    both so the pairs stay aligned. If fewer than `size` items are available,
    everything is returned.
    """
    step = max(1, len(texts) // size)
    return texts[::step][:size], labels[::step][:size]


if SMOKE_TEST:
    # `read_imdb_split` returns every "pos" review before any "neg" review, so
    # a plain head slice (`[:500]`) would contain a single class. Sampling at a
    # regular stride keeps both classes in the reduced datasets.
    train_texts, train_labels = _take_evenly_spaced(train_texts, train_labels, 500)
    test_texts, test_labels = _take_evenly_spaced(test_texts, test_labels, 200)

# Preprocess

## Tokenize

In [None]:
# Load the tokenizer that belongs to the selected pre-trained checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# Encode the training split first, then the test split, with identical
# settings: truncation and padding are both enabled.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, test_texts)
)

## Convert to tensor dataset

In [None]:
if BACKEND == 'tf':
    # Slice the encodings (as a plain dict) together with the labels into a
    # tf.data dataset, once for the training split and once for the test split.
    train_dataset, test_dataset = (
        Dataset.from_tensor_slices((dict(split_encodings), split_labels))
        for split_encodings, split_labels in (
            (train_encodings, train_labels),
            (test_encodings, test_labels),
        )
    )
elif BACKEND == 'pt':
    class IMDbDataset(torch.utils.data.Dataset):
        """Torch dataset pairing tokenizer encodings with their labels."""

        def __init__(self, encodings, labels):
            self.labels = labels
            self.encodings = encodings

        def __len__(self):
            # One sample per label.
            return len(self.labels)

        def __getitem__(self, idx):
            # Build one sample: a tensor per encoding field plus the label
            # stored under the 'labels' key.
            sample = {}
            for field, values in self.encodings.items():
                sample[field] = torch.tensor(values[idx])
            sample['labels'] = torch.tensor(self.labels[idx])
            return sample

    train_dataset = IMDbDataset(train_encodings, train_labels)
    test_dataset = IMDbDataset(test_encodings, test_labels)

# Training arguments

In [None]:
training_args = training_args_cls(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training length: `max_steps` overrides `num_train_epochs` when it is not
    # -1, which is how the smoke test caps training at 20 steps.
    num_train_epochs=5,
    max_steps=-1 if not SMOKE_TEST else 20,
    # Optimisation settings: per-device batch size, learning-rate warmup
    # steps and weight-decay strength.
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Load pre-trained model

In [None]:
# Instantiate the pre-trained model named by MODEL_NAME for the chosen backend.
if BACKEND == 'pt':
    model = model_cls.from_pretrained(MODEL_NAME)
elif BACKEND == 'tf':
    # For TensorFlow the model is created inside the scope of the strategy
    # exposed by the training arguments.
    with training_args.strategy.scope():
        model = model_cls.from_pretrained(MODEL_NAME)

In [None]:
# Bundle the model loaded above, the training arguments and the training
# dataset into the backend's trainer object.
trainer = trainer_cls(model=model, args=training_args, train_dataset=train_dataset)

# Fine-tune

In [None]:
# Fine-tune only when enabled in the configuration cell at the top.
if DO_TRAIN:
    trainer.train()

# Inference

In [None]:
# Run inference only when enabled in the configuration cell at the top.
if DO_PREDICT:
    # Keep the result: an expression inside an `if` body is not displayed by
    # the notebook, so without the assignment the predictions would be lost.
    prediction_output = trainer.predict(test_dataset)
    print(prediction_output.metrics)

    # The test dataset carries labels, so accuracy can be reported directly:
    # the predicted class is the index of the largest logit.
    if prediction_output.label_ids is not None:
        predicted_labels = prediction_output.predictions.argmax(axis=-1)
        accuracy = (predicted_labels == prediction_output.label_ids).mean()
        print(f'Test accuracy: {accuracy:.4f}')