# roBERTo Classifier

This notebook inspired in BERT/roBERTa models andd attaches an output layer that works as multiclassification. As well, this model is trained with a spanish corpus.

In [2]:
# !pip install -r https://raw.githubusercontent.com/ernestomancebo/roberto-mlm/master/requirements.txt

Collecting datasets==1.11.0
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 19.3 MB/s eta 0:00:01[K     |██▌                             | 20 kB 11.9 MB/s eta 0:00:01[K     |███▊                            | 30 kB 9.5 MB/s eta 0:00:01[K     |█████                           | 40 kB 8.7 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 5.2 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 5.6 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 5.5 MB/s eta 0:00:01[K     |██████████                      | 81 kB 6.1 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 5.6 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 5.4 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████                | 133 kB 5.4 MB/s eta 0

In [None]:
!pip install -r requirements.txt

In [3]:
from datasets import load_dataset

dataset = load_dataset("muchocine")
dataset = dataset['train']

print(f'Features: { dataset.features }. Instances: { dataset.num_rows }')


Downloading:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset muchocine/default (download: 52.98 MiB, generated: 11.31 MiB, post-processed: Unknown size, total: 64.29 MiB) to /root/.cache/huggingface/datasets/muchocine/default/1.1.1/3ed5582584cd84ef722606a3d725ef18fd4647d63195fef05c47683e5a056ccd...


Downloading:   0%|          | 0.00/55.6M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset muchocine downloaded and prepared to /root/.cache/huggingface/datasets/muchocine/default/1.1.1/3ed5582584cd84ef722606a3d725ef18fd4647d63195fef05c47683e5a056ccd. Subsequent calls will reuse this data.
Features: {'review_body': Value(dtype='string', id=None), 'review_summary': Value(dtype='string', id=None), 'star_rating': Value(dtype='int32', id=None)}. Instances: 3872


In [4]:
import numpy as np

In [5]:
summary_len = [len(str(x).split()) for x in dataset['review_summary']]
body_len = [len(str(x).split()) for x in dataset['review_body']]


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

sns.set_style('darkgrid')

In [None]:
sns.displot(summary_len)


In [12]:
np.unique(dataset['star_rating'])


array([1, 2, 3, 4, 5])

In [12]:
SEQ_MAX_LEN = 50

In [7]:
Xids = np.zeros((len(dataset), SEQ_MAX_LEN))
Xmask = np.zeros((len(dataset), SEQ_MAX_LEN))

Xids.shape

(3872, 50)

---

In [8]:
from transformers import BertTokenizer

In [9]:
model_name = 'Geotrend/bert-base-es-cased'

tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

In [13]:
def tokenize_sequence(sequence):
    tokens = tokenizer.encode_plus(sequence,
                                   max_length=SEQ_MAX_LEN,
                                   truncation=True,
                                   padding='max_length',
                                   add_special_tokens=True,
                                   return_tensors='tf')
    return tokens


tokenize_sequence('hola, buen dia')


{'input_ids': <tf.Tensor: shape=(1, 50), dtype=int32, numpy=
array([[   11, 26300,   214,    25, 13900,   626,    12,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 50), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 50), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=int32)>}

In [11]:
for i, seq in enumerate(dataset['review_summary']):
    tokens = tokenize_sequence(seq)

    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']


In [12]:
Xids[2]

array([[1.1000e+01, 6.6000e+02, 2.5000e+01, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1000e+01, 1.0850e+04, 8.2900e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1000e+01, 2.4499e+04, 2.1400e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.1000e+01, 1.5980e+03, 1.2980e+03, ..., 2.0500e+02, 2.0104e+04,
        1.2000e+01],
       [1.1000e+01, 6.1980e+03, 5.8350e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1000e+01, 2.5800e+02, 2.8150e+03, ..., 1.7193e+04, 5.3450e+03,
        1.2000e+01]])

In [13]:
Xmask[2]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
np.unique(dataset['star_rating'])

array([1, 2, 3, 4, 5])

In [15]:
label_arr = dataset['star_rating']

labels = np.zeros((len(label_arr), np.max(label_arr)))
labels.shape

(3872, 5)

In [16]:
labels[np.arange(len(label_arr)), np.subtract(label_arr,  1)] = 1

labels[:5]


array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [17]:
import os


def serialize_tensors(tensor, name, path):
    with open(f'{os.sep.join([path, name])}', 'wb') as file:
        np.save(file, tensor)


In [24]:
!mkdir -p tensors/classifier 

In [25]:
tensors_path =  os.sep.join(['.', 'tensors', 'classifier'])

serialize_tensors(Xids, 'Xids.npy', tensors_path)
serialize_tensors(Xmask, 'Xmask.npy', tensors_path)
serialize_tensors(labels, 'labels.npy', tensors_path)

!ls ./tensors/classifier


labels.npy  Xids.npy  Xmask.npy


In [None]:
del dataset, Xids, Xmask, label_arr, labels

In [26]:
Xids[:2]


array([[4.0000e+00, 1.5820e+03, 1.0170e+03, 1.0670e+03, 2.3680e+03,
        1.2000e+03, 1.1530e+03, 2.5570e+03, 1.0640e+03, 5.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00],
       [4.0000e+00, 1.1840e+03, 1.0447e+04, 1.0360e+03, 1.0300e+03,
        6.1290e+03, 1.0080e+03, 1.0490e+03, 9.8200e+02, 1.3480e+03,
        9.8100e+02, 1.0947e+04, 3.0931e+04, 5.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0

In [18]:
import numpy as np


def load_tensors(name, path):
    with open(os.sep.join([path, name]), 'rb') as f:
        return np.load(f)


In [None]:
Xids = load_tensors('Xids.npy', os.sep.join(['.', 'tensors', 'classifier']))
Xmask = load_tensors('Xmask.npy', os.sep.join(['.', 'tensors', 'classifier']))
labels = load_tensors('labels.npy', os.sep.join(
    ['.', 'tensors', 'classifier']))


---

## Input pipeline

In [19]:
import tensorflow as tf

devices = tf.config.experimental.list_physical_devices('GPU')
devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))


In [21]:
dataset.take(1)

<TakeDataset shapes: ((50,), (50,), (5,)), types: (tf.float64, tf.float64, tf.float64)>

In [22]:
def map_tensor(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels


In [23]:
dataset = dataset.map(map_tensor)

dataset.take(1)


<TakeDataset shapes: ({input_ids: (50,), attention_mask: (50,)}, (5,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [24]:
dataset = dataset.shuffle(10_000).batch(16, drop_remainder=False)

dataset.take(1)


<TakeDataset shapes: ({input_ids: (None, 50), attention_mask: (None, 50)}, (None, 5)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [25]:
DS_LEN = len(list(dataset))
DS_LEN


242

In [26]:
SPLIT = 0.85

train = dataset.take(round(DS_LEN * SPLIT))
val = dataset.skip(round(DS_LEN * SPLIT))

del dataset


---

### Training the model

In [27]:
from transformers import TFAutoModel

In [28]:
bert = TFAutoModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some layers from the model checkpoint at Geotrend/bert-base-es-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at Geotrend/bert-base-es-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [29]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  106285056 
Total params: 106,285,056
Trainable params: 106,285,056
Non-trainable params: 0
_________________________________________________________________


In [30]:
input_ids = tf.keras.layers.Input(
    shape=(SEQ_MAX_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(
    shape=(SEQ_MAX_LEN,), name='attention_mask', dtype='int32')

embedings = bert.bert(input_ids, attention_mask=mask)[1]

# x = tf.keras.layers.Flatten()(embedings)
x = tf.keras.layers.Dense(128, activation='relu')(embedings)

y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.layers[2].trainable = False


In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 50)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 106285056   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          98432       bert[0][1]                   

In [32]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])


In [39]:
hist = model.fit(
    train,
    validation_data=val,
    epochs=4,
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [40]:
model.save('sentiment_roberto')



INFO:tensorflow:Assets written to: sentiment_roberto/assets


INFO:tensorflow:Assets written to: sentiment_roberto/assets


In [41]:
!zip -r sentiment_roberto.zip sentiment_roberto

  adding: sentiment_roberto/ (stored 0%)
  adding: sentiment_roberto/assets/ (stored 0%)
  adding: sentiment_roberto/saved_model.pb (deflated 92%)
  adding: sentiment_roberto/keras_metadata.pb (deflated 95%)
  adding: sentiment_roberto/variables/ (stored 0%)
  adding: sentiment_roberto/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: sentiment_roberto/variables/variables.index (deflated 76%)


---

In [42]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [43]:
!cp sentiment_roberto.zip "/content/gdrive/MyDrive/proyectos/machine learning/sentiment_analysis"

---

In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  from google.colab import drive

  drive.mount('/content/gdrive')

  !cp "/content/gdrive/MyDrive/proyectos/machine learning/sentiment_analysis/sentiment_roberto.zip" .

Mounted at /content/gdrive


In [2]:
!unzip sentiment_roberto.zip

Archive:  sentiment_roberto.zip
   creating: sentiment_roberto/
   creating: sentiment_roberto/assets/
  inflating: sentiment_roberto/saved_model.pb  
  inflating: sentiment_roberto/keras_metadata.pb  
   creating: sentiment_roberto/variables/
  inflating: sentiment_roberto/variables/variables.data-00000-of-00001  
  inflating: sentiment_roberto/variables/variables.index  


In [3]:
!pip install -r https://raw.githubusercontent.com/ernestomancebo/roberto-mlm/master/requirements.txt

Collecting datasets==1.11.0
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 22.0 MB/s eta 0:00:01[K     |██▌                             | 20 kB 24.6 MB/s eta 0:00:01[K     |███▊                            | 30 kB 12.5 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.7 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 5.4 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 6.0 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████                      | 81 kB 6.4 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████                | 133 kB 5.2 MB/s eta 

In [4]:
import tensorflow as tf

In [6]:
model = tf.keras.models.load_model('sentiment_roberto')

In [28]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 50)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
bert (Custom>TFBertMainLayer)   {'pooler_output': (N 106285056   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          98432       bert[0][1]                   

In [7]:
model_name = 'Geotrend/bert-base-es-cased'

In [9]:
from transformers import BertTokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

In [16]:
def prepare_data(text):
  tokens = tokenize_sequence(text)
  return {'input_ids': tf.cast(tokens['input_ids'], tf.float64),
          'attention_mask': tf.cast(tokens['attention_mask'], tf.float64)}

In [40]:
input_tensor = prepare_data('Pudo haber sido mejor')

probs = model.predict(input_tensor)[0]
probs

array([0.10558031, 0.23227973, 0.3313494 , 0.21261229, 0.11817823],
      dtype=float32)

In [18]:
import numpy as np

In [41]:
np.argmax(probs)

2

In [36]:
input_tensor = prepare_data('Me pareció muy buena ')

probs = model.predict(input_tensor)[0]
probs

array([0.09737257, 0.26411855, 0.3125822 , 0.2078706 , 0.11805607],
      dtype=float32)

In [35]:
probs[2]

0.33177188

In [37]:
np.argmax(probs)

2

In [47]:
frases = ['No me gusto para nada', 'Estuvo muy buena, excelente',
          'Creo pudo mejorar', 'Estuvo totalmente maravillosa', 
          'Horrible', 'Bien', 'No tiene nada que ver el resultado',
          'La viera otra vez']

for f in frases:
  input = prepare_data(f)
  p = model.predict(input)[0]

  print(f'Highest: {np.argmax(p)} from {p}')

Highest: 2 from [0.09779301 0.23596473 0.30954877 0.22503732 0.13165626]
Highest: 2 from [0.10167223 0.21692628 0.27523494 0.26554176 0.14062475]
Highest: 2 from [0.12497162 0.22514386 0.29618528 0.21158129 0.14211792]
Highest: 2 from [0.08251777 0.20117098 0.39224193 0.22311923 0.10095011]
Highest: 3 from [0.12589279 0.19004318 0.24804491 0.30254513 0.13347396]
Highest: 3 from [0.11276485 0.23152219 0.21121958 0.28117046 0.16332294]
Highest: 2 from [0.09095988 0.23589207 0.35352877 0.20633574 0.11328353]
Highest: 2 from [0.11157091 0.27421793 0.29390407 0.20497024 0.11533682]
