# SENTIMENT ANALYSIS USING ROMANIAN BERT

First we need to install dependencies.

In [None]:
!pip3 install transformers tokenizers pytorch-lightning torch

Import all dependencies

In [27]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import *
import logging
import os
from functools import lru_cache
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from imbalanced_sampler import ImbalancedDatasetSampler
from argparse import Namespace
from model import SentimentModel
from data import SentimentDataset
from imbalanced_sampler import ImbalancedDatasetSampler
import matplotlib
matplotlib.style.use('ggplot')

Load dataset.

In [60]:
# Load Data
df = pd.read_csv("./ro/train.csv")
df.dropna(inplace=True)
df.label = df.label.astype('category')
train_df = df.iloc[:15000]
validation_df = df.iloc[15000:]
test_df = pd.read_csv("./ro/test.csv")
test_df.dropna(inplace=True)

label2int = {
  "meta_minus_m": 0,
  "meta_plus_m": 1,
  "meta_zero": 2,
  "meta_amb": 3
}

In [55]:
train_df = train_df.iloc[:100]
validation_df = validation_df.iloc[:10]
test_df = test_df.iloc[:10]

Load pre-trained model  and tokenizer. Declare hparams

In [56]:
name = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(name)
config = BertConfig.from_pretrained(name, output_hidden_states=True)
model = AutoModel.from_pretrained(name, config=config)

hparams = Namespace(
    train_df=train_df,
    test_df=test_df,
    val_df=validation_df,
    batch_size=16,
    warmup_steps=100,
    epochs=1,
    lr=5e-4,
    accumulate_grad_batches=1,
)
module = TrainingModule(hparams)

INFO:transformers.tokenization_utils:Model name 'dumitrescustefan/bert-base-romanian-cased-v1' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming 'dumitrescustefan/bert-base-romanian-cased-v1' is a path or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file dumitrescustefan/bert-base-romanian-cased-v1/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file dumitrescustefan/bert-base-romanian-cased-v1

Define pytorch-lightning module

In [57]:
class TrainingModule(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.model = SentimentModel(model, tokenizer, output_size=2)
        self.loss = nn.CrossEntropyLoss()
        self.hparams = hparams
        
    def step(self, batch, step_name="train"):
        X, y = batch
        loss = self.loss(self.forward(X), y)
        loss_key = f"{step_name}_loss"
        tensorboard_logs = {loss_key: loss}
        print({ ("loss" if step_name == "train" else loss_key): loss, 'log': tensorboard_logs,
               "progress_bar": {loss_key: loss}})
        return { ("loss" if step_name == "train" else loss_key): loss, 'log': tensorboard_logs,
               "progress_bar": {loss_key: loss}}
    
    def forward(self, X):
        return self.model(X)
    
    def training_step(self, batch, batch_idx):
        return self.step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch, "valid")
    
    def validation_end(self, outputs):
        loss = torch.stack([x["valid_loss"] for x in outputs]).mean()
        return {"valid_loss": loss}
        
    def test_step(self, batch, batch_idx):
        return self.step(batch, "test")
    
    def train_dataloader(self):
        return self.create_data_loader(mode="train", df=self.hparams.train_df)
    
    def val_dataloader(self):
        return self.create_data_loader(mode="val", df=self.hparams.val_df)

    def test_dataloader(self):
        return self.create_data_loader(mode="test", df=self.hparams.test_df)
    
    def create_data_loader(self, mode, df, shuffle=False):
        if mode == "train":
            return DataLoader(
                    SentimentDataset(df),
                    batch_size=self.hparams.batch_size,
                    shuffle=False,
                    #sampler=ImbalancedDatasetSampler(train_df)
                    )
        else:
            return DataLoader(
                    SentimentDataset(df),
                    batch_size=self.hparams.batch_size,
                    shuffle=False
                    )
    
    @lru_cache()
    def total_steps(self):
        return len(self.train_dataloader()) // self.hparams.accumulate_grad_batches * self.hparams.epochs

    def configure_optimizers(self):
        ## use AdamW optimizer -- faster approach to training NNs
        ## read: https://www.fast.ai/2018/07/02/adam-weight-decay/
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr)
        lr_scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=self.hparams.warmup_steps,
                    num_training_steps=self.total_steps(),
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

Start training. Can be very easily switch from CPU to GPU.

In [58]:
## train roughly for about 10-15 minutes with GPU enabled.
trainer = pl.Trainer(gpus=0, max_epochs=hparams.epochs, progress_bar_refresh_rate=10,
                     accumulate_grad_batches=hparams.accumulate_grad_batches)

trainer.fit(module)

INFO:lightning:GPU available: False, used: False
INFO:lightning:
    | Name                                                    | Type              | Params
------------------------------------------------------------------------------------------
0   | model                                                   | SentimentModel    | 125 M 
1   | model.model                                             | BertModel         | 124 M 
2   | model.model.embeddings                                  | BertEmbeddings    | 38 M  
3   | model.model.embeddings.word_embeddings                  | Embedding         | 38 M  
4   | model.model.embeddings.position_embeddings              | Embedding         | 393 K 
5   | model.model.embeddings.token_type_embeddings            | Embedding         | 1 K   
6   | model.model.embeddings.LayerNorm                        | LayerNorm         | 1 K   
7   | model.model.embeddings.dropout                          | Dropout           | 0     
8   | model.model.encoder

HBox(children=(IntProgress(value=1, bar_style='info', description='Validation sanity check', layout=Layout(fle…

{'valid_loss': tensor(0.8204), 'log': {'valid_loss': tensor(0.8204)}, 'progress_bar': {'valid_loss': tensor(0.8204)}}




HBox(children=(IntProgress(value=1, bar_style='info', description='Training', layout=Layout(flex='2'), max=1, …

{'loss': tensor(0.6227, grad_fn=<NllLossBackward>), 'log': {'train_loss': tensor(0.6227, grad_fn=<NllLossBackward>)}, 'progress_bar': {'train_loss': tensor(0.6227, grad_fn=<NllLossBackward>)}}
{'loss': tensor(0.5953, grad_fn=<NllLossBackward>), 'log': {'train_loss': tensor(0.5953, grad_fn=<NllLossBackward>)}, 'progress_bar': {'train_loss': tensor(0.5953, grad_fn=<NllLossBackward>)}}
{'loss': tensor(0.5897, grad_fn=<NllLossBackward>), 'log': {'train_loss': tensor(0.5897, grad_fn=<NllLossBackward>)}, 'progress_bar': {'train_loss': tensor(0.5897, grad_fn=<NllLossBackward>)}}
{'loss': tensor(0.6056, grad_fn=<NllLossBackward>), 'log': {'train_loss': tensor(0.6056, grad_fn=<NllLossBackward>)}, 'progress_bar': {'train_loss': tensor(0.6056, grad_fn=<NllLossBackward>)}}
{'loss': tensor(0.5477, grad_fn=<NllLossBackward>), 'log': {'train_loss': tensor(0.5477, grad_fn=<NllLossBackward>)}, 'progress_bar': {'train_loss': tensor(0.5477, grad_fn=<NllLossBackward>)}}
{'loss': tensor(0.5359, grad_fn=<Nl

HBox(children=(IntProgress(value=1, bar_style='info', description='Validating', layout=Layout(flex='2'), max=1…

{'valid_loss': tensor(1.0602), 'log': {'valid_loss': tensor(1.0602)}, 'progress_bar': {'valid_loss': tensor(1.0602)}}





1

In [65]:
with torch.no_grad():
    progress = ["/", "-", "\\", "|", "/", "-", "\\", "|"]
    module.eval()
    true_y, pred_y = [], []
    for i, batch_ in enumerate(module.test_dataloader()):
        X, y = batch_
        batch = X
        y_pred = torch.argmax(module(batch), dim=1)
        true_y.extend(y)
        pred_y.extend(y_pred)
print("\n" + "_" * 80)
print(classification_report(true_y, pred_y, target_names=["Results"], digits=2))


________________________________________________________________________________
              precision    recall  f1-score   support

     Results       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

