# Beto model for sentiment index

In this notebook we trained a sentiment model using some spanish Google playstore data

The training consists on a pretrained model based on [BERT](https://huggingface.co/transformers/model_doc/bert.html#bertfornextsentenceprediction) using the transformers library.

<img src = "https://vignette.wikia.nocookie.net/muppet/images/e/e1/Bert_smile.png/revision/latest/window-crop/width/200/x-offset/0/y-offset/0/window-width/700/window-height/700?cb=20110630173259">
<img src="https://www.disruptivestatic.com/wp-content/uploads/2019/07/SENTIMENT.jpg" width=400>

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |▍                               | 10kB 23.2MB/s eta 0:00:01[K     |▊                               | 20kB 2.9MB/s eta 0:00:01[K     |█▏                              | 30kB 3.8MB/s eta 0:00:01[K     |█▌                              | 40kB 4.1MB/s eta 0:00:01[K     |█▉                              | 51kB 3.4MB/s eta 0:00:01[K     |██▎                             | 61kB 3.8MB/s eta 0:00:01[K     |██▋                             | 71kB 4.1MB/s eta 0:00:01[K     |███                             | 81kB 4.4MB/s eta 0:00:01[K     |███▍                            | 92kB 4.7MB/s eta 0:00:01[K     |███▊                            | 102kB 4.6MB/s eta 0:00:01[K     |████                            | 112kB 4.6MB/s eta 0:00:01[K     |████▌                           | 122kB 4.6M

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
device

'cuda'

## Loading the data

In [5]:
df = pd.read_csv("reviews.csv").loc[:,["contenido", "sentimiento"]]
df["sentimiento"] = df["sentimiento"].map({"bajo":0, "neutro":1, "alto":2})
df.head()

Unnamed: 0,contenido,sentimiento
0,Echo de menos las subtareas que tenía la aplic...,2
1,He encontrado un fallo. En el widget de Androi...,2
2,"Muy buena app, me encanta los detalles que tie...",2
3,Esta muy buena la app pero ultimamente me esta...,2
4,Es una excelente herramienta adaptando el méto...,2


In [6]:
df["sentimiento"].value_counts()

2    33000
1    33000
0    33000
Name: sentimiento, dtype: int64

## Some parameters

In [7]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242120.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




## Data loader

In [8]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.contenido

        self.targets = self.data.sentimiento
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [9]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (99000, 2)
TRAIN Dataset: (79200, 2)
TEST Dataset: (19800, 2)


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Model for classification

This model uses the beto embeddings and add a dense layer for classification at the end

In [11]:
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
config = AutoConfig.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", config=config)

model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=441944381.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Loss and optimizer

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [13]:
optimizer = torch.optim.Adam(params = filter(lambda p: p.requires_grad, model.parameters()) , lr=LEARNING_RATE)

# Trainning

In [14]:
def train(epoch, optimizer):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        
        outputs = model(ids, mask, token_type_ids)[0]

        optimizer.zero_grad()
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [15]:
for epoch in range(EPOCHS):
    train(epoch, optimizer)



Epoch: 0, Loss:  1.0947604179382324


## Validation

In [16]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)[0]
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [17]:
outputs, targets = validation(epoch)
predictions = np.array(outputs).argmax(axis=1)
accuracy = metrics.accuracy_score(targets, predictions)
f1_score_micro = metrics.f1_score(targets, predictions, average='micro')
f1_score_macro = metrics.f1_score(targets, predictions, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")



Accuracy Score = 0.9833333333333333
F1 Score (Micro) = 0.9833333333333333
F1 Score (Macro) = 0.9833473132800936


## Saving the model

In [18]:
torch.save(model, "spanish_model.pkl")