<a href="https://colab.research.google.com/github/graviraja/100-Days-of-NLP/blob/applications%2Fclassification/applications/classification/sentiment_classification/Sentimix%20with%20XLM-Roberta-LSTM-Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initial Setup

In [3]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Raw CoNLL-style train and dev splits (read in the "Processing the Data" section).
train_file = '/content/drive/My Drive/train_14k_split_conll.txt'
test_file = '/content/drive/My Drive/dev_3k_split_conll.txt'

# Already-processed data whose transliteration was produced with Google's API.
# Taken from https://github.com/keshav22bansal/BAKSA_IITK
processed_train_file = '/content/drive/My Drive/hinglish_train.txt'
processed_test_file = '/content/drive/My Drive/hinglish_test.txt'

In [5]:
!pip install indic_transliteration -q
!pip install contractions -q
!pip install transformers -q

[K     |████████████████████████████████| 102kB 3.2MB/s 
[K     |████████████████████████████████| 911kB 9.6MB/s 
[K     |████████████████████████████████| 245kB 5.9MB/s 
[K     |████████████████████████████████| 317kB 30.5MB/s 
[?25h  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 778kB 4.4MB/s 
[K     |████████████████████████████████| 3.0MB 19.2MB/s 
[K     |████████████████████████████████| 890kB 42.6MB/s 
[K     |████████████████████████████████| 1.1MB 52.5MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


### Imports

In [6]:
import re
import time
import string
import contractions
import numpy as np
import pandas as pd

from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Processing the Data

You can skip this section and go straight to **Using the Processed sentences**, which loads the already-processed data.

The processed data is taken from [here](https://github.com/keshav22bansal/BAKSA_IITK)

The major difference is that the transliteration of Hinglish words to Hindi is done using Google's API instead of the `indic_transliteration` module.

In [6]:
# Read the raw CoNLL-style files as lists of lines. The encoding is passed
# explicitly because the text contains non-ASCII characters and the platform
# default encoding is not guaranteed to be UTF-8.
with open(train_file, 'r', encoding='utf-8') as f:
    data = f.readlines()

with open(test_file, 'r', encoding='utf-8') as f:
    test_data = f.readlines()

In [7]:
def parse_data(data):
    """Parse CoNLL-style lines into per-sentence words, language tags, labels and ids.

    Each line is stripped and split on tabs, then classified by field count:

    * 2 fields -> a ``word<TAB>lang`` pair belonging to the current sentence.
    * 3 fields -> a sentence header; the second field is the integer uid and the
      last field is the sentiment. A header closes the sentence collected so far.
    * 1 field  -> ignored (this is what a blank separator line produces).
    * anything else -> only updates the pending sentiment/uid.

    Args:
        data: iterable of text lines.

    Returns:
        A tuple ``(sentences, sentences_info, sentiment, uids)`` of four lists of
        equal length: word lists, language-tag lists, sentiment strings and uids.

    Raises:
        ValueError: if the second field of a header line is not an integer.
    """
    uids, sentences, sentences_info, sentiment = [], [], [], []

    single_sentence, single_sentence_info = [], []
    sent = ""
    uid = 0
    # True once any header has been read. Without this flag, a first header that
    # is not on the very first line (e.g. after a leading blank line) would emit
    # a spurious empty sentence with sentiment "" and uid 0.
    header_seen = False

    for each_line in data:
        tokens = each_line.strip().split('\t')
        num_tokens = len(tokens)

        if num_tokens == 2:
            # word and its language tag
            single_sentence.append(tokens[0])
            single_sentence_info.append(tokens[1])
            continue

        if num_tokens == 1:
            # blank separator line after a sentence
            continue

        if num_tokens == 3:
            # A new header closes the previous sentence, if there is one.
            if header_seen or single_sentence:
                sentences.append(single_sentence)
                sentences_info.append(single_sentence_info)
                sentiment.append(sent)
                uids.append(uid)
            single_sentence = []
            single_sentence_info = []

        sent = tokens[-1]
        uid = int(tokens[1])
        header_seen = True

    # for the last sentence
    if len(single_sentence) > 0:
        sentences.append(single_sentence)
        sentences_info.append(single_sentence_info)
        sentiment.append(sent)
        uids.append(uid)

    assert len(sentences) == len(sentences_info) == len(sentiment) == len(uids)
    return sentences, sentences_info, sentiment, uids

In [8]:
sentences, sentences_info, sentiment, uids = parse_data(data)

In [9]:
test_sentences, test_sentences_info, test_sentiment, test_uids = parse_data(test_data)

In [10]:
list(zip(sentences[0], sentences_info[0]))

[('nen', 'Eng'),
 ('á', 'O'),
 ('vist', 'Eng'),
 ('bolest', 'Eng'),
 ('vztek', 'Eng'),
 ('smutek', 'Eng'),
 ('zmatek', 'Hin'),
 ('osam', 'Hin'),
 ('ě', 'O'),
 ('lost', 'Eng'),
 ('beznad', 'Eng'),
 ('ě', 'O'),
 ('j', 'Hin'),
 ('a', 'Eng'),
 ('nakonec', 'Eng'),
 ('jen', 'Hin'),
 ('klid', 'Hin'),
 ('Asi', 'Hin'),
 ('takhle', 'Hin'),
 ('vypad', 'Hin'),
 ('á', 'O'),
 ('m', 'Hin'),
 ('ů', 'O'),
 ('j', 'Eng'),
 ('life', 'Eng'),
 ('...', 'O')]

In [11]:
# Quick check of the ITRANS -> Devanagari transliteration on a few tokens.
# A dedicated name is used so the raw training lines held in `data` are not
# overwritten, which keeps the earlier parsing cell re-runnable.
sample_text = "jen klid takhle vypad"
transliterate(sample_text, sanscript.ITRANS, sanscript.DEVANAGARI)

'जेन् क्लिद् तख्ले व्य्पद्'

In [12]:
def translate(sentences, sentences_info):
    """Transliterate the words tagged "Hin" from ITRANS to Devanagari.

    Args:
        sentences: list of sentences, each a list of words.
        sentences_info: list of per-sentence language-tag lists, paired
            positionally with ``sentences``.

    Returns:
        A new list of word lists in which every word whose tag is "Hin" has
        been transliterated; all other words are copied unchanged.
    """
    def _convert(word, lang):
        # Only words tagged as Hindi are transliterated.
        if lang == "Hin":
            return transliterate(word, sanscript.ITRANS, sanscript.DEVANAGARI)
        return word

    return [
        [_convert(word, lang) for word, lang in zip(words, langs)]
        for words, langs in zip(sentences, sentences_info)
    ]

In [13]:
translated_sentences = translate(sentences, sentences_info)
test_translated_sentences = translate(test_sentences, test_sentences_info)

In [39]:
# Regex sources, kept as plain strings under their original names.
url_pattern = r'https(.*)/\s[\w\u0900-\u097F]+'
# NOTE: special_chars is not referenced by preprocess_data below.
special_chars = r'[_…\*\[\]\(\)&“]'
names_with_numbers = r'([A-Za-z\u0900-\u097F]+)\d{3,}'
apostee = r"([\w]+)\s'\s([\w]+)"
names = r"@[\s]*[\w\u0900-\u097F]+[\s]*[_]+[\s]*[\w\u0900-\u097F]+|@[\s]*[\w\u0900-\u097F]+"
hashtags = r"#[\s]*[\w\u0900-\u097F]+[\s]*"

# Compiled once at definition time instead of on every preprocess_data call.
_URL_RE = re.compile(url_pattern)
_NAMES_WITH_NUMBERS_RE = re.compile(names_with_numbers)
_APOSTEE_RE = re.compile(apostee)
_NAMES_RE = re.compile(names)
_HASHTAGS_RE = re.compile(hashtags)


def preprocess_data(sentence_tokens):
    """Clean one tokenized tweet and return it as a single normalized string.

    Steps, in order: join the tokens, drop the ``RT`` marker and ``…``,
    normalize ``’`` to ``'``, remove @-mentions, turn ``_`` into spaces, remove
    hashtags and URLs, re-attach split apostrophes (``it ' s`` -> ``it's``),
    expand contractions, remove alphabetic words that end in three or more
    digits, and collapse whitespace.

    Args:
        sentence_tokens: list of word strings forming one sentence.

    Returns:
        The cleaned sentence with single spaces and no leading/trailing space.
    """
    # The leading space lets " RT " also match an "RT" that opens the sentence.
    sentence = " " + " ".join(sentence_tokens)
    # Replace with a space (not ""), otherwise the words on either side of a
    # mid-sentence "RT" get glued together.
    sentence = sentence.replace(" RT ", " ")
    sentence = sentence.replace("…", "")
    # normalize the typographic apostrophe
    sentence = sentence.replace("’", "'")
    # Remove mentions BEFORE replacing "_": the first alternative of `names`
    # matches "@ user _ name" and can only do so while the underscore exists.
    sentence = _NAMES_RE.sub(" ", sentence)
    # replace _
    sentence = sentence.replace("_", " ")
    # remove hashtags
    sentence = _HASHTAGS_RE.sub(" ", sentence)
    # remove urls
    sentence = _URL_RE.sub("", sentence)
    # combine only ' related words => ... it ' s ... -> ... it's ...
    sentence = _APOSTEE_RE.sub(r"\1'\2", sentence)
    # fix contractions
    sentence = contractions.fix(sentence)
    # drop alphabetic words that end with 3+ digits (the whole match is removed)
    sentence = _NAMES_WITH_NUMBERS_RE.sub(" ", sentence)
    # collapse runs of whitespace; this also trims both ends
    return " ".join(sentence.split())


In [15]:
MODEL_NAME = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [16]:
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)

</s> 2
<s> 0
<pad> 1
<unk> 3


In [33]:
" ".join(sentences[32]), sentiment[32]

('@ IndiaToday Teri kimat dokodi ki ho gayi ... amit shah will capture telegana soon ... kcr will resign ...',
 'negative')

In [34]:
" ".join(translated_sentences[32])

'@ IndiaToday टेरि किमत् दोकोदि कि हो गयि ... अमित् शह् will capture telegana soon ... kcr will resign ...'

In [35]:
preprocess_data(translated_sentences[32])

'टेरि किमत् दोकोदि कि हो गयि ... अमित् शह् will capture telegana soon ... kcr will resign ...'

In [20]:
# Encode one pre-processed sentence as a fixed-length (100 token) example.
encoding = tokenizer.encode_plus(
  preprocess_data(translated_sentences[32]),
  max_length=100,
  add_special_tokens=True, # Add the tokenizer's '<s>' and '</s>' special tokens
  return_token_type_ids=False,
  truncation=True,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [21]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

100


tensor([     0,  46005,  18992,   1682, 154156,  10850,    356,  13551,   1682,
          1253,   5167,  67625,    153, 129069,   4377,   8933,   3849,   4377,
          1221, 141621,   5501,  24869,  33662,    153,    472,  23150,   1221,
        199747,    153,      2,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1])

In [22]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

100


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [23]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['<s>',
 '▁टे',
 'रि',
 '▁कि',
 'मत्',
 '▁दो',
 'को',
 'दि',
 '▁कि',
 '▁हो',
 '▁ग',
 'यि',
 '▁...',
 '▁अमित',
 '्',
 '▁श',
 'ह',
 '्',
 '▁will',
 '▁capture',
 '▁tele',
 'gana',
 '▁soon',
 '▁...',
 '▁k',
 'cr',
 '▁will',
 '▁resign',
 '▁...',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [36]:
" ".join(sentences[29]), sentiment[29]

('Madam @ SushmaSwaraj ji we always miss you as a # videsh _ mantri',
 'positive')

In [37]:
" ".join(translated_sentences[29])

'ंअदम् @ Sउश्मSवरज् जि we always miss you as a # विदेश् _ मन्त्रि'

In [40]:
preprocess_data(translated_sentences[29])

'ंअदम् जि we always miss you as a मन्त्रि'

In [27]:
" ".join(sentences[10]), sentiment[10]

('@ ECISVEEP Can you answer miscalculated votes on each seat ? One vote matters ! # deshkamahatyohar hai aur apne dhji … https // t . co / SuHS4mx6Dm',
 'neutral')

In [28]:
" ".join(translated_sentences[10])

'@ ECISVEEP Can you answer miscalculated votes on each seat ? One vote मत्तेर्स् ! # देश्कमहत्योहर् है और् अप्ने dhji … https // t . cओ / SउःS४म्क्ष्६ड्म्'

In [29]:
preprocess_data(translated_sentences[10])

'Can you answer miscalculated votes on each seat ? One vote मत्तेर्स् ! # देश्कमहत्योहर् है और् अप्ने dhji'

In [41]:
%%time
# Clean every transliterated sentence of the train split ...
processed_sentences = [preprocess_data(tokens) for tokens in translated_sentences]

# ... and of the test split.
test_data = [preprocess_data(tokens) for tokens in test_translated_sentences]

CPU times: user 707 ms, sys: 8 µs, total: 707 ms
Wall time: 708 ms


In [42]:
# Integer class id for each of the three sentiment strings.
sentiment_mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

In [43]:
# Convert the sentiment strings to class ids; a string missing from
# sentiment_mapping raises KeyError.
labels = [sentiment_mapping[sent] for sent in sentiment]
test_label = [sentiment_mapping[sent] for sent in test_sentiment]

### Using the Processed sentences

In [8]:
uids = []
processed_sentences = []
labels = []

# Each row is tab-separated: uid, sentence, integer label.
# The first line is skipped (presumably a header row).
with open(processed_train_file, 'r', encoding='utf-8') as f:
    next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing with IndexError
            continue
        items = line.split('\t')
        uids.append(items[0])
        processed_sentences.append(str(items[1]))
        labels.append(int(items[2]))

In [9]:
test_uids = []
test_data = []
test_label = []

# Each row is tab-separated: uid, sentence, integer label.
# The first line is skipped (presumably a header row).
with open(processed_test_file, 'r', encoding='utf-8') as f:
    next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing with IndexError
            continue
        items = line.split('\t')
        test_uids.append(items[0])
        test_data.append(str(items[1]))
        test_label.append(int(items[2]))

### Train-Val-Test data splits

In [10]:
# Hold out 20% of the training data for validation. A fixed random_state makes
# the split, and therefore the validation metrics, reproducible across runs.
train_uids, val_uids, train_data, val_data, train_label, val_label = train_test_split(
    uids, processed_sentences, labels, test_size=0.2, random_state=42
)

In [11]:
len(train_data), len(val_data), len(test_data)

(11200, 2800, 3131)

### Tokenizer

In [12]:
MAX_LEN = 150

In [13]:
MODEL_NAME = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




### Dataset Wrapper class

In [14]:
class SentiMixDataSet(Dataset):
    """Dataset that tokenizes one sentence per item into fixed-length tensors.

    Args:
        inputs: indexable collection of sentence strings.
        labels: indexable collection of labels convertible with ``int()``;
            its length defines the dataset length.
        tokenizer: object exposing ``encode_plus`` (a HuggingFace tokenizer).
        max_len: length every example is padded / truncated to.
    """

    def __init__(self, inputs, labels, tokenizer, max_len):
        self.sentences = inputs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (taken from the labels)."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label of one example."""
        sentence = self.sentences[item]
        sentiment = int(self.labels[item])

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # replaces the deprecated pad_to_max_length=True
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            "text": sentence,
            # encode_plus returns a leading batch axis of size 1; flatten removes it
            "input_ids": encoding['input_ids'].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": torch.tensor(sentiment, dtype=torch.long)
        }

In [15]:
# Wrap each split; all three share the same tokenizer and MAX_LEN.
train_dataset = SentiMixDataSet(train_data, train_label, tokenizer, MAX_LEN)
val_dataset = SentiMixDataSet(val_data, val_label, tokenizer, MAX_LEN)
test_dataset = SentiMixDataSet(test_data, test_label, tokenizer, MAX_LEN)

### DataLoaders

In [16]:
BATCH_SIZE = 64

In [17]:
# Only the training loader shuffles; validation and test keep the dataset order.
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [18]:
# sample
sample = next(iter(train_data_loader))

In [19]:
sample["input_ids"].shape, sample["attention_mask"].shape, sample["label"].shape

(torch.Size([64, 150]), torch.Size([64, 150]), torch.Size([64]))

### XLM-RoBERTa with Bidirectional LSTM Attention Model

In [20]:
class XLMAttentionModel(nn.Module):
    """XLM-RoBERTa encoder followed by a bidirectional LSTM with dot-product attention.

    The encoder's token embeddings are fed to a BiLSTM; the two direction halves
    of the LSTM outputs (and the two final hidden states) are summed, the summed
    hidden state attends over the summed outputs, and the attended vector is
    projected to ``output_dim`` logits.

    Args:
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output classes.
        dropout: dropout probability applied to the encoder embeddings and to
            the attended vector.
    """

    def __init__(self, hidden_dim, output_dim, dropout=0.3):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.bert = XLMRobertaModel.from_pretrained(MODEL_NAME)
        embedding_size = self.bert.config.hidden_size

        self.lstm = nn.LSTM(embedding_size, hidden_dim, batch_first=True, bidirectional=True)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def attention(self, outputs, hidden, mask=None):
        """Dot-product attention of ``hidden`` over ``outputs``.

        Args:
            outputs: [batch_size, seq_len, hid_dim]
            hidden: [batch_size, hid_dim]
            mask: optional [batch_size, seq_len]; positions equal to 0 receive
                (numerically) zero attention.

        Returns:
            ``(weighted, soft_attn_weights)`` with shapes [batch_size, hid_dim]
            and [batch_size, seq_len].
        """
        hidden = hidden.unsqueeze(2)
        # hidden => [batch_size, hid_dim, 1]

        attn_weights = torch.bmm(outputs, hidden)
        # outputs      => [batch_size, seq_len, hid_dim]
        # hidden       => [batch_size, hid_dim, 1]
        # attn_weights => [batch_size, seq_len, 1]

        attn_weights = attn_weights.squeeze(2)
        # attn_weights => [batch_size, seq_len]

        if mask is not None:
            # a very negative score makes the softmax weight of masked positions ~0
            attn_weights = attn_weights.masked_fill(mask == 0, -1e10)

        soft_attn_weights = F.softmax(attn_weights, dim=1)
        # soft_attn_weights => [batch_size, seq_len]

        soft_attn_weights = soft_attn_weights.unsqueeze(2)
        # soft_attn_weights => [batch_size, seq_len, 1]

        weighted = torch.bmm(outputs.transpose(1, 2), soft_attn_weights)
        # outputs.transpose(1, 2) => [batch_size, hid_dim, seq_len]
        # soft_attn_weights       => [batch_size, seq_len, 1]
        # weighted                => [batch_size, hid_dim, 1]

        weighted = weighted.squeeze(2)
        # weighted => [batch_size, hid_dim]

        return weighted, soft_attn_weights.squeeze(2)

    def forward(self, input_ids, attention_mask):
        """Compute class logits and attention weights for a batch.

        Args:
            input_ids: [batch_size, seq_len]
            attention_mask: [batch_size, seq_len]

        Returns:
            ``(logits, attn_scores)`` with shapes [batch_size, output_dim] and
            [batch_size, seq_len].
        """
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index the first output instead of tuple-unpacking: indexing works both
        # when the encoder returns a plain tuple and when it returns a
        # ModelOutput object (unpacking the latter yields its field names).
        embeddings = bert_outputs[0]
        embeddings = self.dropout(embeddings)
        # embeddings => [batch_size, seq_len, emb_dim]

        outputs, (hidden, cell) = self.lstm(embeddings)
        # outputs => [batch_size, seq_len, hid_dim * 2]
        # hidden, cell => [2, batch_size, hid_dim]

        # sum the two direction halves so the feature size is hid_dim again
        outputs = outputs[:, :, :self.hidden_dim] + outputs[:, :, self.hidden_dim:]
        # outputs => [batch_size, seq_len, hid_dim]

        hidden = hidden[0, :, :] + hidden[1, :, :]
        # hidden => [batch_size, hid_dim]

        weighted, attn_scores = self.attention(outputs, hidden, attention_mask)
        # weighted => [batch_size, hid_dim]
        # attn_scores => [batch_size, seq_len]

        logits = self.out(self.dropout(weighted))
        # logits => [batch_size, output_dim]

        return logits, attn_scores

In [21]:
# LSTM hidden size per direction, and one output per sentiment class.
hidden_dim = 100
output_dim = 3
model = XLMAttentionModel(hidden_dim, output_dim)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




In [22]:
model = model.to(device)

In [23]:
torch.cuda.empty_cache()

In [24]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 278,739,951 trainable parameters


### Loss & Optimizer

In [25]:
# Number of passes over the training data.
EPOCHS = 10

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # group 1: every parameter NOT matched by no_decay -> weight decay 0.01
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # group 2: parameters matched by no_decay -> weight decay 0.0
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)

### Training Method

In [26]:
def train(model, iterator, clip=2.0):
    """Run one training epoch and return the mean loss per batch.

    Relies on the module-level ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        model: network returning ``(logits, attention_scores)``.
        iterator: iterable of batches with "input_ids", "attention_mask" and
            "label" entries; must support ``len()``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["label"].to(device)

        logits, _attn = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(logits, gold)

        # clear stale gradients, back-propagate, clip, then update the weights
        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

### Evaluation Method

In [27]:
def simple_accuracy(preds, labels):
    """Return the fraction of positions at which ``preds`` equals ``labels``.

    Both arguments are sequences of labels; the result is a float.
    """
    matches = np.equal(preds, labels)
    return matches.mean()

In [28]:
def evaluate(model, iterator):
    """Evaluate the model without gradient tracking.

    Relies on the module-level ``loss_fn`` and ``device``.

    Args:
        model: network returning ``(logits, attention_scores)``.
        iterator: iterable of batches with "input_ids", "attention_mask" and
            "label" entries; must support ``len()``.

    Returns:
        ``(mean_loss, accuracy)``: the mean loss per batch and the accuracy
        over all examples seen.
    """
    model.eval()
    total_loss = 0
    predicted_labels, gold_labels = [], []

    with torch.no_grad():
        for batch in iterator:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            gold = batch["label"].to(device)

            logits, _attn = model(input_ids=ids, attention_mask=mask)
            total_loss += loss_fn(logits, gold).item()

            gold_labels.extend(gold.cpu().tolist())
            # index of the largest logit along the class axis
            best_class = torch.max(logits, 1)[1]
            predicted_labels.extend(best_class.cpu().tolist())

    mean_loss = total_loss / len(iterator)
    return mean_loss, simple_accuracy(predicted_labels, gold_labels)

In [29]:
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps (in seconds) into whole
    minutes and whole remaining seconds, returned as ``(minutes, seconds)``."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # what is left after removing the whole minutes, truncated to an int
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

### Training Loop

In [30]:
# Train for EPOCHS epochs and keep the checkpoint with the lowest validation loss.
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_data_loader)
    val_loss, val_acc = evaluate(model, valid_data_loader)
    end_time = time.time()
    # NOTE(review): no scheduler is created in the visible cells, so this call stays disabled.
    # scheduler.step(val_loss)
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f"Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs:.2f}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.3f}")
    
    # Overwrite the saved weights only when the validation loss improves.
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'xlm_roberta.pt')


Epoch: 01 | Time: 5m 41.00s
	Train Loss: 0.955 | Val Loss: 0.851 | Val Acc: 0.610
Epoch: 02 | Time: 5m 45.00s
	Train Loss: 0.837 | Val Loss: 0.828 | Val Acc: 0.626
Epoch: 03 | Time: 5m 45.00s
	Train Loss: 0.796 | Val Loss: 0.866 | Val Acc: 0.630
Epoch: 04 | Time: 5m 44.00s
	Train Loss: 0.751 | Val Loss: 0.845 | Val Acc: 0.642
Epoch: 05 | Time: 5m 44.00s
	Train Loss: 0.716 | Val Loss: 0.888 | Val Acc: 0.636
Epoch: 06 | Time: 5m 44.00s
	Train Loss: 0.680 | Val Loss: 0.882 | Val Acc: 0.630
Epoch: 07 | Time: 5m 44.00s
	Train Loss: 0.634 | Val Loss: 0.891 | Val Acc: 0.636
Epoch: 08 | Time: 5m 44.00s
	Train Loss: 0.586 | Val Loss: 0.931 | Val Acc: 0.634
Epoch: 09 | Time: 5m 44.00s
	Train Loss: 0.546 | Val Loss: 0.995 | Val Acc: 0.631
Epoch: 10 | Time: 5m 44.00s
	Train Loss: 0.498 | Val Loss: 1.027 | Val Acc: 0.626


### Test Data results

In [31]:
# Restore the best checkpoint. map_location lets weights saved on a GPU be
# loaded on whatever `device` is active (e.g. a CPU-only runtime).
model.load_state_dict(torch.load('xlm_roberta.pt', map_location=device))

<All keys matched successfully>

In [32]:
# Collect predictions and gold labels over the whole test set, then print
# per-class precision / recall / F1.
model.eval()
preds, trgs = [], []

with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        outputs, _ = model(input_ids=input_ids, attention_mask=attention_mask)

        # predicted label = index of the largest logit along the class axis
        predicted = torch.max(outputs, 1)[1]

        preds.extend(predicted.cpu().tolist())
        trgs.extend(targets.cpu().tolist())

print(metrics.classification_report(trgs, preds))

              precision    recall  f1-score   support

           0       0.66      0.65      0.66       979
           1       0.57      0.56      0.57      1211
           2       0.68      0.71      0.70       941

    accuracy                           0.63      3131
   macro avg       0.64      0.64      0.64      3131
weighted avg       0.63      0.63      0.63      3131

