# **Sentiment Analysis**: Classifying IMDb movie reviews 👍👎

In [None]:
%%capture

!pip install datasets

In [None]:
import torch

In [None]:
# Seed PyTorch's random number generators so weight initialisation and any
# sampling are repeatable. torch.manual_seed covers the CPU generator as well
# as all CUDA devices; torch.cuda.manual_seed_all alone leaves the CPU
# generator (used when layers are created on CPU) unseeded.
torch.manual_seed(42)

Load our dataset. The IMDb dataset is available through the Hugging Face `datasets` library, so we can fetch it by name with `load_dataset`.

In [None]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb" via datasets.load_dataset.
imdb_dataset = load_dataset("imdb")

# General overview of the dataset: the printout lists each split together
# with its feature names and number of rows.
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Let's load in training and testing splits

In [None]:
# Keep only the first 100 examples of the "train" and "test" splits.
# NOTE(review): `train` and `test` are not referenced by any later cell visible
# in this notebook (the tokenization cell selects from `imdb_dataset`
# directly) — confirm whether these two variables are still needed.
train = imdb_dataset["train"].select(range(100))
test = imdb_dataset["test"].select(range(100))

Load in our pre-trained backbone and its tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModel

# Both objects are loaded from the same checkpoint name so that the
# tokenizer's vocabulary matches the backbone it feeds.
BERT_CHECKPOINT = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
backbone = AutoModel.from_pretrained(BERT_CHECKPOINT)

# What does a tokenizer do?
It breaks up a sentence into its individual components

Tokenizers have the vocabulary of words that the model "knows"

In [None]:
print(f"The vocabulary contains {tokenizer.vocab_size} 'words'")

# Show only a handful of (word, index) pairs: printing the whole vocabulary
# (tens of thousands of entries) floods the notebook output.
vocab_sample = dict(list(tokenizer.vocab.items())[:10])
print(vocab_sample)

The vocabulary contains 30522 'words'


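**What do the numbers next to the words mean?**

A model first looks up a static (context-independent) embedding for every token and then turns those into contextual embeddings. The number next to each word is its index in the vocabulary, i.e. the row of the static embedding matrix that holds that word's embedding.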

In [None]:
# Take the backbone's word-embedding weight matrix as a detached CPU tensor;
# the printed shape is (number of rows, embedding width).
word_embedding_layer = backbone.embeddings.word_embeddings
static_embeddings = word_embedding_layer.weight.detach().cpu()
print(static_embeddings.shape)

# The vocabulary maps a word to the row index of its static embedding.
idx = tokenizer.vocab["computer"]
print(f"Index for the word 'computer' is {idx}")
print("Static embedding for the word 'computer' is: ")
print(static_embeddings[idx])

torch.Size([30522, 768])
Index for the word 'computer' is 3274
Static embedding for the word 'computer' is: 
tensor([-5.9546e-02,  2.4515e-02, -6.7853e-02, -4.0666e-02,  7.4523e-02,
         1.5400e-02, -9.2887e-02,  2.1392e-02, -3.6711e-02, -4.1389e-02,
        -4.9072e-02, -5.4892e-02, -3.1831e-02, -2.8631e-02, -3.0203e-02,
         3.5598e-02, -6.3553e-02,  3.1032e-03, -2.4183e-02,  3.0975e-02,
        -8.5123e-03, -1.4649e-02, -1.4289e-02,  3.2553e-03, -1.0620e-01,
        -6.7314e-02, -4.1345e-02, -3.2856e-02,  2.9507e-02, -6.5036e-02,
         2.4453e-02, -5.1962e-03, -4.3611e-02, -2.0829e-02,  2.6900e-02,
        -6.7784e-02, -1.3402e-02,  1.0268e-02, -1.8625e-02,  1.8153e-02,
         5.7891e-03,  6.4343e-02, -3.4312e-02, -6.0427e-02,  3.6104e-02,
         9.1638e-03, -5.3603e-02, -9.8907e-03,  3.2909e-02, -4.1469e-03,
        -2.0511e-02,  3.0693e-03, -3.6686e-02, -1.1080e-02, -1.0004e-01,
        -2.6868e-02, -7.9305e-03,  9.3706e-04, -2.1024e-02, -3.6096e-02,
        -4.8796

Let's watch the tokenizer split a sentence into tokens (calling `tokenize` directly is rarely needed in practice)

In [None]:
# Split one example sentence into the tokenizer's string tokens.
example_tokens = tokenizer.tokenize("I'd love to learn natural language processing")
print(example_tokens)

['i', "'", 'd', 'love', 'to', 'learn', 'natural', 'language', 'processing']


In [None]:
# Calling the tokenizer itself (rather than .tokenize) returns a mapping;
# its input_ids are the indices we described above
encoded_sentence = tokenizer("I'd love to learn natural language processing")
print(encoded_sentence)

{'input_ids': [101, 1045, 1005, 1040, 2293, 2000, 4553, 3019, 2653, 6364, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# Defining our model

In [None]:
import torch.nn as nn
import torch

# Binary cross-entropy on probabilities (the model applies the sigmoid itself).
loss_fn = nn.BCELoss()


class BertForSentimentCLassification(nn.Module):
    """Binary sentiment classifier: the shared `backbone` plus one linear unit.

    The linear head maps the backbone's `pooler_output` to a single score,
    which is passed through a sigmoid.
    """

    def __init__(self):
        super().__init__()
        # Reuses the module-level `backbone` object (no copy is made).
        self.backbone = backbone
        self.classifier = nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the backbone and the head; compute the loss when labels are given.

        Args:
            input_ids: token id tensor.
                # assumes (batch, seq) or (batch, 1, seq) — TODO confirm
            attention_mask: mask tensor with the same layout as `input_ids`.
            labels: optional 0/1 targets, one per example. When omitted
                (plain inference) no loss is computed.

        Returns:
            dict with "logits" (despite the key name these are sigmoid
            probabilities, one per example) and, only when `labels` is
            given, "loss" (BCE between the probabilities and the labels).
        """
        # Drop a singleton middle dimension only when one really exists.
        # Without the dim() check a 2-D batch of length-1 sequences would be
        # flattened to 1-D and rejected by the backbone.
        if input_ids.dim() == 3 and input_ids.shape[1] == 1:
            input_ids = input_ids.squeeze(1)
            attention_mask = attention_mask.squeeze(1)
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output
        probabilities = torch.sigmoid(self.classifier(pooled)).squeeze(1)

        result = {"logits": probabilities}
        # Labels are optional so the model can also be called for prediction.
        if labels is not None:
            result["loss"] = loss_fn(probabilities, labels.float())
        return result


model = BertForSentimentCLassification()

Tokenizing dataset

In [None]:
def _tokenize_reviews(batch):
    """Tokenize the "text" column, truncating/padding every review to 512 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)


# A fixed shuffle seed makes every run sample the same 1000 / 500 reviews, so
# results are comparable between runs. batched=True lets the tokenizer process
# many reviews per call instead of one at a time.
tokenized_train = imdb_dataset["train"].shuffle(seed=42).select(range(1000)).map(_tokenize_reviews, batched=True)
tokenized_test = imdb_dataset["test"].shuffle(seed=42).select(range(500)).map(_tokenize_reviews, batched=True)

#Converting lists to pytorch tensors
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Define the evaluation metrics: accuracy, plus macro-averaged F1, precision and recall

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Score thresholded model outputs against the reference labels.

    Args:
        eval_pred: pair that unpacks into (model outputs, labels). Outputs
            of at least 0.5 are counted as class 1, everything else as 0.

    Returns:
        dict with "accuracy" and macro-averaged "f1_score", "precision"
        and "recall".
    """
    scores, references = eval_pred
    hard_predictions = (scores >= 0.5).astype(int)

    results = {"accuracy": accuracy_score(references, hard_predictions)}
    macro_scorers = (
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in macro_scorers:
        results[metric_name] = scorer(references, hard_predictions, average='macro')
    return results

# Hyperparameters:

When changing hyperparameters, there are always trade-offs

- **High LR** -> Faster learning, but possibility of overshooting
- **Large batch size** -> Smoother gradient estimates, but fewer updates per epoch and more memory needed
- **More epochs** -> More learning, but chance of overfitting
- **Large weight decay** -> Less overfitting, but chance of underfitting

In [None]:
# Training settings; each one is passed to TrainingArguments in the Trainer cell.
LR = 2e-5                # learning_rate
train_batch_size = 8     # per_device_train_batch_size
valid_batch_size = 8     # per_device_eval_batch_size
num_epoch = 1            # num_train_epochs
weight_decay = 0.01      # weight_decay

Shuffle the training set

In [None]:
# Dataset.shuffle returns a new, shuffled dataset and leaves the original
# untouched, so the result must be assigned back to take effect. The seed
# keeps the order identical between runs.
tokenized_train = tokenized_train.shuffle(seed=42)
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

Training and evaluation made easy with Trainer API

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the sentiment model.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=valid_batch_size,
    num_train_epochs=num_epoch,
    weight_decay=weight_decay,
    logging_dir='./logs',
    # Log the training loss every 25 steps. This run has only 125 optimisation
    # steps, fewer than the default logging interval, which is why the results
    # table showed "No log" for the training loss.
    logging_steps=25,
    report_to=[]
)

# The Trainer runs the training loop and the per-epoch evaluation, scoring
# predictions with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Score,Precision,Recall
1,No log,0.372815,0.908,0.907668,0.907879,0.907486


TrainOutput(global_step=125, training_loss=0.22717373657226564, metrics={'train_runtime': 124.098, 'train_samples_per_second': 8.058, 'train_steps_per_second': 1.007, 'total_flos': 0.0, 'train_loss': 0.22717373657226564, 'epoch': 1.0})

# **Bangla Named Entity Recognition** ⏬

Finding number of classes

In [None]:
import csv

# Collect the distinct tag values found in the last column of the training CSV.
categories = set()
# Explicit UTF-8 so the Bangla text decodes the same way on every platform;
# newline="" is what the csv module recommends for files handed to csv.reader.
with open("/content/train.csv", encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader, None)  # skipping headers (tolerates an empty file)
    for row in reader:
        if not row:  # csv.reader yields [] for blank lines; row[-1] would raise
            continue
        categories.add(row[-1].strip())

print(categories)
print(len(categories))

{'B-GRP', 'B-PER', 'B-CORP', 'I-PER', 'B-LOC', 'B-CW', 'I-CW', 'I-LOC', 'I-GRP', 'I-CORP', 'B-PROD', 'O', 'I-PROD'}
13


Setting numerical mapping for categorical data

In [None]:
# Tag -> integer id. "O" is 0; every entity type then contributes a
# consecutive B-/I- pair, in the order the types are listed here.
str_to_int = {
    tag: index
    for index, tag in enumerate(
        ["O"]
        + [
            f"{prefix}-{entity}"
            for entity in ("CORP", "CW", "GRP", "LOC", "PER", "PROD")
            for prefix in ("B", "I")
        ]
    )
}

In [None]:
import torch, pandas as pd
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer

Constructing dataset for model to train on

In [None]:
# Group the token-per-row training CSV into sentences. Each sentence becomes a
# dict with "id", "tags" and "tokens".
train_dataset = []
tokens = []
tags = []
# Explicit UTF-8 so the Bangla text decodes identically on every platform.
with open("/content/train.csv", encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader, None)  # skipping headers
    for row in reader:
        if not row:  # skip blank lines instead of failing on row[0]
            continue
        tokens.append(row[0])
        # Strip the tag, as the category scan does, so it matches the keys of
        # the tag -> id mapping exactly.
        tags.append(row[-1].strip())
        # A token containing the danda (U+0964) closes the sentence. The escaped
        # code point keeps this check independent of how this file itself is
        # encoded; a mis-encoded literal would never match the decoded text.
        if "\u0964" in row[0]:
            train_dataset.append({
                "id": len(train_dataset),
                "tags": tags,
                "tokens": tokens
            })
            tags = []
            tokens = []

# Keep trailing tokens that were never closed by a danda rather than
# silently dropping them.
if tokens:
    train_dataset.append({
        "id": len(train_dataset),
        "tags": tags,
        "tokens": tokens
    })

In [None]:
train_dataset[1]

{'id': 1,
 'tags': ['O',
  'O',
  'O',
  'O',
  'B-GRP',
  'I-GRP',
  'I-GRP',
  'I-GRP',
  'I-GRP',
  'I-GRP',
  'I-GRP',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'tokens': ['‡¶¨‡ßç‡¶∞‡¶æ‡¶Ç‡¶Æ‡ßç‡¶Ø‡¶æ‡¶®',
  '‡¶°‡¶æ‡¶Æ‡ßç‡¶™‡¶∏‡¶®',
  '‡ßß‡ß™‡ß¶‡ß¶',
  '‡¶∏‡¶æ‡¶≤‡ßá',
  '‡¶Ü‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶ú‡¶æ‡¶§‡¶ø‡¶ï',
  '‡¶∞‡ßá‡¶°',
  '‡¶ï‡ßç‡¶∞‡¶∏',
  '‡¶ì',
  '‡¶∞‡ßá‡¶°',
  '‡¶ï‡ßç‡¶∞‡¶ø‡¶∏‡ßá‡¶®‡ßç‡¶ü',
  '‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®',
  '‡¶è‡¶∞',
  '‡¶∏‡ßç‡¶¨‡ßá‡¶ö‡ßç‡¶õ‡¶æ‡¶∏‡ßá‡¶¨‡¶ï',
  '‡¶π‡¶ø‡¶∏‡ßá‡¶¨‡ßá',
  '‡¶∂‡ßÅ‡¶∞‡ßÅ',
  '‡¶ï‡¶∞‡ßá‡¶õ‡¶ø‡¶≤‡ßá‡¶®‡•§']}

Constructing dataset for model to test on

In [None]:
# Group the token-per-row test CSV into sentences. Each sentence becomes a
# dict with "id", "tags" and "tokens".
test_dataset = []
tokens = []
tags = []
# Explicit UTF-8 so the Bangla text decodes identically on every platform.
with open("/content/test.csv", encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader, None)  # skipping headers
    for row in reader:
        if not row:  # skip blank lines instead of failing on row[0]
            continue
        tokens.append(row[0])
        # Strip the tag, as the category scan does, so it matches the keys of
        # the tag -> id mapping exactly.
        tags.append(row[-1].strip())
        # A token containing the danda (U+0964) closes the sentence. The escaped
        # code point keeps this check independent of how this file itself is
        # encoded; a mis-encoded literal would never match the decoded text.
        if "\u0964" in row[0]:
            test_dataset.append({
                "id": len(test_dataset),
                "tags": tags,
                "tokens": tokens
            })
            tags = []
            tokens = []

# Keep trailing tokens that were never closed by a danda rather than
# silently dropping them.
if tokens:
    test_dataset.append({
        "id": len(test_dataset),
        "tags": tags,
        "tokens": tokens
    })

In [None]:
# Keep only the first 1000 training sentences and the first 100 test sentences.
train_dataset, test_dataset = train_dataset[:1000], test_dataset[:100]

Ensuring subwords and special tokens are correctly labelled

In [None]:
def tokenize_and_align(entry):
    """Tokenize one pre-split sentence and align its word tags to sub-tokens.

    Args:
        entry: dict with "tokens" (list of words) and "tags" (one tag string
            per word, each a key of `str_to_int`).

    Returns:
        The tokenizer output with an added "labels" list, one value per
        sub-token: the tag id on the first sub-token of each word, and -100
        on special tokens and on the remaining sub-tokens of a word.
    """
    tokenized_entry = tokenizer(entry["tokens"], truncation=True, is_split_into_words=True)
    word_tags = entry["tags"]

    aligned_labels = []
    previous_word_id = None
    for word_id in tokenized_entry.word_ids():
        if word_id is None or word_id == previous_word_id:
            # Special token, or a continuation piece of the previous word.
            aligned_labels.append(-100)
        else:
            # Look the tag up by word id rather than by a running counter:
            # if a word produces no sub-tokens its id never appears here, and
            # a counter would shift every later tag onto the wrong word.
            aligned_labels.append(str_to_int[word_tags[word_id]])
        previous_word_id = word_id

    tokenized_entry["labels"] = aligned_labels
    return tokenized_entry

Mappings between categorical form and numeric form of labels

In [None]:
# Human-readable label names, listed in id order (position == id).
id2label = dict(enumerate([
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]))
# The reverse lookup is derived from id2label so the two can never disagree.
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

Loading in model and tokenizer

In [None]:
# Model and tokenizer come from the same checkpoint. Note that `model` and
# `tokenizer` are rebound here, replacing the objects used in the sentiment part.
XLMR_CHECKPOINT = 'xlm-roberta-base'

model = AutoModelForTokenClassification.from_pretrained(
    XLMR_CHECKPOINT,
    num_labels=13,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(XLMR_CHECKPOINT)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Tokenizing the dataset

In [None]:
# Run every sentence through tokenize_and_align, keeping the original order.
tokenized_dataset_train = [tokenize_and_align(sentence) for sentence in train_dataset]
tokenized_dataset_test = [tokenize_and_align(sentence) for sentence in test_dataset]

The data collator pads the examples in each batch to a common length

In [None]:
# Collator built around the current tokenizer; it is handed to the Trainer
# through its `data_collator` argument.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Hyperparameters

In [None]:
# Training settings for the NER run; each one is passed to TrainingArguments.
# These rebind the names used for the sentiment run (same values).
LR = 2e-5                # learning_rate
train_batch_size = 8     # per_device_train_batch_size
valid_batch_size = 8     # per_device_eval_batch_size
num_epoch = 1            # num_train_epochs
weight_decay = 0.01      # weight_decay

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(p):
    """Macro-averaged token-level scores, ignoring positions labelled -100.

    Args:
        p: pair that unpacks into (per-class scores, label ids). The
            predicted class is the argmax over axis 2 of the scores.

    Returns:
        dict with "macro_f1", "precision" and "recall" (all macro-averaged).
    """
    preds, labels = p
    preds = np.argmax(preds, axis=2)

    true_labels = labels.flatten()
    pred_labels = preds.flatten()

    # Drop positions marked -100 (special tokens and non-first sub-tokens).
    mask = true_labels != -100
    true_labels = true_labels[mask]
    pred_labels = pred_labels[mask]

    # zero_division=0 scores a class with no predictions / no true samples as
    # 0 — the same value as before, without the undefined-metric warning.
    f1 = f1_score(true_labels, pred_labels, average='macro', zero_division=0)
    precision = precision_score(true_labels, pred_labels, average='macro', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='macro', zero_division=0)

    # Precision and recall were previously computed and then thrown away;
    # report them next to the F1 score.
    return {"macro_f1": f1, "precision": precision, "recall": recall}

In [None]:
# Training configuration for the NER model.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=valid_batch_size,
    num_train_epochs=num_epoch,
    weight_decay=weight_decay,
    logging_dir='./logs',
    # Log the training loss every 25 steps. This run has only 125 optimisation
    # steps, fewer than the default logging interval, which is why the results
    # table showed "No log" for the training loss.
    logging_steps=25,
    report_to=[]
)

# The data collator batches the variable-length tokenized sentences;
# compute_metrics scores the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.709746,0.069098


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=125, training_loss=0.9223403930664062, metrics={'train_runtime': 54.9968, 'train_samples_per_second': 18.183, 'train_steps_per_second': 2.273, 'total_flos': 28382099767056.0, 'train_loss': 0.9223403930664062, 'epoch': 1.0})