# Deep: NLP With Transformer -  Section 6 Sentimento Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_copom = pd.read_csv('df_copom_label.csv')

In [3]:
df_copom.head()

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type
0,2006/03/08,16.5,117.0,-0.75,decrease,dovish,decrease,"In the March Meeting, the Banco Central do Br...",Statement
1,2006/04/19,15.75,118.0,-0.75,decrease,dovish,decrease,"In the April Meeting, the Monetary Policy Com...",Statement
2,2006/05/31,15.25,119.0,-0.5,decrease,dovish,decrease,"In the May Meeting, the Monetary Policy Commi...",Statement
3,2006/07/19,14.75,120.0,-0.5,decrease,dovish,decrease,"In the July Meeting, the Copom unanimously de...",Statement
4,2006/08/30,14.25,121.0,-0.5,decrease,dovish,decrease,"In the August Meeting, the Copom unanimously ...",Statement


In [4]:
df_copom.shape

(159, 9)

### Split test and train

In [5]:
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [6]:
X = df_copom.copy()
y = df_copom['label_hawk_dove']

In [7]:
#Perform train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
texts = X_train['Text'].tolist()
labels = y_train.tolist()

In [69]:
len(texts)

127

In [9]:
len(labels)

127

In [10]:
seq_len = 512 # number of the tokens tokenizer will create
num_samples = len (texts)
num_samples, seq_len

(127, 512)

#### Tokenizer

In [11]:
from transformers import BertTokenizer

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [13]:
tokens = tokenizer(texts, 
                   max_length=seq_len,
                   truncation=True, 
                   padding = 'max_length', 
                   add_special_tokens=True,
                   return_tensors='np')

In [14]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [15]:
tokens['input_ids']

array([[  101,  1109,  3291, ...,     0,     0,     0],
       [  101,  1130,  1157, ...,  1112, 17162,   102],
       [  101,  1130,  1103, ...,     0,     0,     0],
       ...,
       [  101,  1249,  8830, ...,     0,     0,     0],
       [  101,  1109,  3291, ...,  1103,  5880,   102],
       [  101,  1109,  3291, ..., 16642,  1879,   102]])

In [16]:
tokens['attention_mask']

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

#### Save to file

In [17]:
with open('en-text-ids.npy', 'wb') as f:
    np.save(f, tokens['input_ids'])
with open('en-text-masks.npy', 'wb') as f:
    np.save(f, tokens['attention_mask'])

#### Convert labels to one-hot encoded vectors

In [18]:
# labels = df_copom['label_hawk_dove']

In [19]:
label_classes = list(set(labels))
num_classes = len(label_classes)
label_classes, num_classes

(['dovish', 'neutral', 'hawkish'], 3)

In [20]:
label_to_index = {label: index for index, label in enumerate(label_classes)}
index_to_label = {index: label for label, index in label_to_index.items()}
label_to_index, index_to_label

({'dovish': 0, 'neutral': 1, 'hawkish': 2},
 {0: 'dovish', 1: 'neutral', 2: 'hawkish'})

In [21]:
labels_encoded = np.array([label_to_index[label] for label in labels])
labels_one_hot = np.eye(num_classes)[labels_encoded]
labels_one_hot[:5]

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [22]:
with open('en-labels.npy', 'wb') as f:
    np.save(f, labels_one_hot)

## Pipeline

In [23]:
with open('en-text-ids.npy', 'rb') as f:
    Xids = np.load(f, allow_pickle=True)
with open('en-text-masks.npy', 'rb') as f:
    Xmasks = np.load(f, allow_pickle=True)
with open('en-labels.npy', 'rb') as f:
    labels = np.load(f, allow_pickle=True)

In [24]:
Xids, Xids.shape

(array([[  101,  1109,  3291, ...,     0,     0,     0],
        [  101,  1130,  1157, ...,  1112, 17162,   102],
        [  101,  1130,  1103, ...,     0,     0,     0],
        ...,
        [  101,  1249,  8830, ...,     0,     0,     0],
        [  101,  1109,  3291, ...,  1103,  5880,   102],
        [  101,  1109,  3291, ..., 16642,  1879,   102]]),
 (127, 512))

In [25]:
labels.shape

(127, 3)

#### Create a TensorFlow datatype object

In [26]:
Xmasks, Xmasks.shape

(array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]]),
 (127, 512))

In [27]:
labels.shape

(127, 3)

In [28]:
import tensorflow as tf

In [29]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmasks, labels))

In [30]:
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [31]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids,
            'attention_mask': masks}, labels

In [32]:
dataset = dataset.map(map_func)

In [33]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

#### Batch size, shuffle, split

In [34]:
batch_size = 16

In [35]:
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

In [36]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))>

In [37]:
split = 0.8

In [38]:
size = int(Xids.shape[0]/batch_size * split)
size

6

In [39]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [40]:
train_ds, val_ds

(<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))>,
 <SkipDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))>)

In [41]:
tf.data.Dataset.save(train_ds, 'train')
tf.data.Dataset.save(train_ds, 'val')

In [42]:
train_ds.element_spec

({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)},
 TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))

In [43]:
# ds = tf.data.Dataset.load('train', element_spec=train_ds.element_spec)

## Build

In [44]:
from transformers import TFAutoModel

In [45]:
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [46]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [49]:
# import tensorflow as tf

In [50]:
# two imputs
input_ids = tf.keras.layers.Input(shape=(512,),
                                  name='input_ids', dtype='int32')
masks = tf.keras.layers.Input(shape=(512,),
                              name='attention_mask', dtype='int32')

# transformers
embeddings = bert.bert(input_ids, attention_mask=masks)[1]

# classifier head
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(x)

In [51]:
model = tf.keras.Model(inputs=[input_ids, masks], outputs=y)

In [52]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [53]:
# freeze bert layer
model.layers[2].trainable = False

In [54]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [55]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [56]:
element_spec = ({'input_ids': tf.TensorSpec(shape=(16, 512), dtype=tf.int32, name=None),
                 'attention_mask': tf.TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)},
                 tf.TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))

In [57]:
train_ds = tf.data.Dataset.load('train', element_spec=element_spec)
val_ds = tf.data.Dataset.load('val', element_spec=element_spec)

In [58]:
train_ds.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))>

In [59]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [60]:
model.save('en_sentiment_model')



INFO:tensorflow:Assets written to: en_sentiment_model\assets


INFO:tensorflow:Assets written to: en_sentiment_model\assets


# Load and predictions

In [None]:
model = tf.keras.models.load_model('en_sentiment_model/')

In [61]:
from transformers import BertTokenizer

In [62]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [63]:
def prep_data(text):
    tokens = tokenizer(text, max_length=512, truncation=True,
                       padding='max_length', add_special_tokens=True,
                       return_tensors='tf')
    return {'input_ids': tokens['input_ids'],
            'attention_mask': tokens['attention_mask']}

In [64]:
probs = model.predict(prep_data('The Brazil Central Bank decided to increase the Selic interest rate by 0.5 percentage point, to 14%'))[0]
probs



array([0.38154268, 0.26313284, 0.35532445], dtype=float32)

In [65]:
np.argmax(probs)

0

In [66]:
X_test['Sentiment'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [67]:
for i, row in X_test.iterrows():
    tokens = prep_data(row['Text'])
    probs = model.predict(tokens)
    pred = np.argmax(probs)
    X_test.at[i, 'Sentiment'] = pred



In [68]:
X_test.head()

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type,Sentiment
78,2016/03/02,14.25,197.0,0.0,mantain,neutral,mantain,The Copom released the following note to the ...,Statement,2
155,2023/03/22,13.75,253.0,0.0,mantain,neutral,mantain,A) Update of economic outlook and Copom’s scen...,Minutes,2
128,2021/06/16,4.25,239.0,0.75,increase,hawkish,increase,A) Update of economic outlook and Copom’s base...,Minutes,0
55,2013/04/17,7.5,174.0,0.25,increase,hawkish,increase,The Copom released the following note to the ...,Statement,2
94,2018/03/21,6.5,213.0,-0.25,decrease,dovish,mantain,The Copom unanimously decided to reduce the Se...,Statement,0
