In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.6 MB/s[0m eta [36m0:00:0

In [3]:
import re
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from tqdm import tqdm, trange
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer


In [4]:
PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index

BATCH_SIZE = 16
LEARNING_RATE_MODEL = 1e-5
LEARNING_RATE_CLASSIFIER = 1e-3
WARMUP_STEPS = 0
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
SEED = 42
NO_CUDA = False


In [8]:
def rpad(array, n):
    current_len = len(array)
    if current_len > n:
        return array[:n]
    extra = n - current_len
    return array + ([0] * extra)


def convert_to_embedding(tokenizer, sentences_with_labels):
    for sentence, label in sentences_with_labels:
        tokens = tokenizer.tokenize(sentence)
        tokens = tokens[:250]
        bert_sent = rpad(tokenizer.convert_tokens_to_ids(["CLS"] + tokens + ["SEP"]), n=256)
        yield torch.tensor(bert_sent), torch.tensor(label, dtype=torch.int64)


def parse_line(line):
    line = line.strip().lower()
    line = line.replace("&nbsp;", " ")
    line = re.sub(r'<br(\s\/)?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces into one

    return line


def read_data(filename):
    data = []
    for line in open(filename, 'r', encoding="utf-8"):
        data.append(parse_line(line))

    return data


def prepare_dataloader(tokenizer, sampler=RandomSampler, train=False):
    filename = "/content/gdrive/MyDrive/sentiment classifier/train.txt" if train else "/content/gdrive/MyDrive/sentiment classifier/test.txt"

    data = read_data(filename)
    y = np.append(np.zeros(216), np.ones(7660))
    sentences_with_labels = zip(data, y.tolist())

    dataset = list(convert_to_embedding(tokenizer, sentences_with_labels))
    sampler_func = sampler(dataset) if sampler is not None else None
    dataloader = DataLoader(dataset, sampler=sampler_func, batch_size=BATCH_SIZE)

    return dataloader


In [10]:
class Transformers:
    model = None

    def __init__(self, tokenizer):
        self.pad_token_label_id = PAD_TOKEN_LABEL_ID
        self.device = torch.device("cuda" if torch.cuda.is_available() and not NO_CUDA else "cpu")
        self.tokenizer = tokenizer

    def predict(self, sentence):
        if self.model is None or self.tokenizer is None:
            self.load()

        embeddings = list(convert_to_embedding([(sentence, -1)]))
        preds = self._predict_tags_batched(embeddings)
        return preds

    def evaluate(self, dataloader):
        from sklearn.metrics import classification_report
        y_pred = self._predict_tags_batched(dataloader)
        y_true = np.append(np.zeros(216), np.ones(7660))

        score = classification_report(y_true, y_pred)
        print(score)

    def _predict_tags_batched(self, dataloader):
        preds = []
        self.model.eval()
        for batch in tqdm(dataloader, desc="Computing NER tags"):
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                outputs = self.model(batch[0])
                _, is_neg = torch.max(outputs[0], 1)
                preds.extend(is_neg.cpu().detach().numpy())

        return preds

    def train(self, dataloader, model, epochs):
        assert self.model is None  # make sure we are not training after load() command
        model.to(self.device)
        self.model = model

        t_total = len(dataloader) // GRADIENT_ACCUMULATION_STEPS * epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        optimizer_grouped_parameters = [
            {"params": model.bert.parameters(), "lr": LEARNING_RATE_MODEL},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

        # Train!
        print("***** Running training *****")
        print("Training on %d examples", len(dataloader))
        print("Num Epochs = %d", epochs)
        print("Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(epochs, desc="Epoch")
        self._set_seed()
        for _ in train_iterator:
            epoch_iterator = tqdm(dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.device) for t in batch)
                outputs = model(batch[0], labels=batch[1])
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / GRADIENT_ACCUMULATION_STEPS

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

                    scheduler.step()  # Update learning rate schedule
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1

        self.model = model

        return global_step, tr_loss / global_step

    def _set_seed(self):
        torch.manual_seed(SEED)
        if self.device == 'gpu':
            torch.cuda.manual_seed_all(SEED)

    def load(self, model_dir='weights/'):
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)


In [12]:
def train(epochs=10, output_dir="weights/"):
    num_labels = 2  # negative and positive reviews
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

    dataloader = prepare_dataloader(tokenizer, train=True)
    predictor = Transformers(tokenizer)
    predictor.train(dataloader, model, epochs)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def evaluate(model_dir="weights/"):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    dataloader = prepare_dataloader(tokenizer, train=False, sampler=None)
    predictor = Transformers(tokenizer)
    predictor.load(model_dir=model_dir)
    predictor.evaluate(dataloader)



path = '/content/gdrive/MyDrive/sentiment classifier/weights/'
os.makedirs(path, exist_ok=True)
train(epochs=10, output_dir=path)
evaluate(model_dir=path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


***** Running training *****
Training on %d examples 493
Num Epochs = %d 10
Total optimization steps = %d 4930


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 1/493 [00:00<06:01,  1.36it/s][A
Iteration:   0%|          | 2/493 [00:01<05:47,  1.41it/s][A
Iteration:   1%|          | 3/493 [00:02<05:40,  1.44it/s][A
Iteration:   1%|          | 4/493 [00:02<05:41,  1.43it/s][A
Iteration:   1%|          | 5/493 [00:03<05:43,  1.42it/s][A
Iteration:   1%|          | 6/493 [00:04<05:45,  1.41it/s][A
Iteration:   1%|▏         | 7/493 [00:05<05:50,  1.39it/s][A
Iteration:   2%|▏         | 8/493 [00:05<05:55,  1.36it/s][A
Iteration:   2%|▏         | 9/493 [00:06<05:52,  1.37it/s][A
Iteration:   2%|▏         | 10/493 [00:07<05:52,  1.37it/s][A
Iteration:   2%|▏         | 11/493 [00:07<05:58,  1.34it/s][A
Iteration:   2%|▏         | 12/493 [00:08<05:46,  1.39it/s][A
Iteration:   3%|▎         | 13/493 [00:09<05:43,  1.40it/s][A
Iteration:   3%|▎         | 14/493 [00:10<05:40,  1.41it/s][A
Iteration:   3%|▎         | 15/493 [00:10<05:35,  1.42it/s][A
Iteration:   3%|▎

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       216
         1.0       0.97      1.00      0.99      7660

    accuracy                           0.97      7876
   macro avg       0.49      0.50      0.49      7876
weighted avg       0.95      0.97      0.96      7876




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
