# Deep: NLP With Transformer - Section 7: Long Text

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Labelled COPOM texts, read from a CSV file in the working directory.
copom_csv = 'df_copom_label.csv'
df_copom = pd.read_csv(copom_csv)

In [3]:
# Preview the first rows to check which columns and labels were loaded.
df_copom.head()

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,label_hawk_dove,label_next_meet,Text,Type
0,2006/03/08,16.5,117.0,-0.75,decrease,dovish,decrease,"In the March Meeting, the Banco Central do Br...",Statement
1,2006/04/19,15.75,118.0,-0.75,decrease,dovish,decrease,"In the April Meeting, the Monetary Policy Com...",Statement
2,2006/05/31,15.25,119.0,-0.5,decrease,dovish,decrease,"In the May Meeting, the Monetary Policy Commi...",Statement
3,2006/07/19,14.75,120.0,-0.5,decrease,dovish,decrease,"In the July Meeting, the Copom unanimously de...",Statement
4,2006/08/30,14.25,121.0,-0.5,decrease,dovish,decrease,"In the August Meeting, the Copom unanimously ...",Statement


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df_copom.shape

(159, 9)

## Split test and train

In [5]:
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [6]:
# The target is the hawk/dove label column; the features keep every column of
# the frame, taken as an independent copy of df_copom.
target_col = 'label_hawk_dove'
y = df_copom[target_col]
X = df_copom.copy()

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Plain Python lists of the training documents and their matching labels.
texts = list(X_train['Text'])
labels = list(y_train)

In [9]:
# Number of training labels (one per training document).
len(labels)

127

In [10]:
# Target number of tokens per tokenized sequence.
seq_len = 512
# One sample per training document.
num_samples = len(texts)
(num_samples, seq_len)

(127, 512)

## Initializing

In [11]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [12]:
# Load the tokenizer and the sequence classifier from the same pretrained
# checkpoint so that their vocabularies match.
checkpoint = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

#### Text inputs too long for this BERT model

In [13]:
# Use one training document as the worked example and preview its first
# 100 characters.
example_idx = 120
test_text = texts[example_idx]
test_text[:100]

'A) Update of economic outlook and Copom’s scenario11. The global environment remains challenging. Th'

In [14]:
# Tokenize the whole document in one pass. No special tokens are added here,
# and nothing is truncated, so the result can exceed the model's length limit.
test_tokens = tokenizer.encode_plus(
    test_text,
    add_special_tokens=False,
    return_tensors='pt',
)

Token indices sequence length is longer than the specified maximum sequence length for this model (3840 > 512). Running this sequence through the model will result in indexing errors


In [15]:
# Token count of the example document: input_ids is (1, n_tokens), so the
# second dimension is the sequence length.
test_tokens['input_ids'].shape[1]

3840

In [16]:
# Inspect the full encoding returned by the tokenizer.
test_tokens

{'input_ids': tensor([[ 1037,  1007, 10651,  ...,  3567, 12146,  1012]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [17]:
# The token ids of the single encoded document (first row of the batch).
test_tokens['input_ids'][0]

tensor([ 1037,  1007, 10651,  ...,  3567, 12146,  1012])

#### Window strategy: split the tokens into fixed-size chunks

In [23]:
# Maximum sequence length fed to the model per window, special tokens included.
chunk_size = 512

# Every window is laid out as [CLS] + tokens + [SEP] (+ padding), so only
# chunk_size - 2 positions are available for tokens of the original text.
input_id_chunks = list(test_tokens['input_ids'][0].split(chunk_size - 2))
mask_chunks = list(test_tokens['attention_mask'][0].split(chunk_size - 2))

for i in range(len(input_id_chunks)):
    ids = input_id_chunks[i]
    mask = mask_chunks[i]

    # Only a chunk shorter than chunk_size - 2 (normally the last one) needs
    # padding; for full chunks pad_len is 0 and the padding tensors are empty.
    pad_len = chunk_size - 2 - ids.shape[0]

    # The new_* constructors inherit dtype and device from the chunk, so
    # torch.cat never mixes float and integer tensors. The special-token ids
    # come from the tokenizer instead of being hard-coded.
    input_id_chunks[i] = torch.cat([
        ids.new_tensor([tokenizer.cls_token_id]),
        ids,
        ids.new_tensor([tokenizer.sep_token_id]),
        ids.new_full((pad_len,), tokenizer.pad_token_id),
    ])
    # Attend to the special tokens and the real tokens, but not to padding.
    mask_chunks[i] = torch.cat([
        mask.new_ones(1),
        mask,
        mask.new_ones(1),
        mask.new_zeros(pad_len),
    ])

# Stack the windows into one batch of shape (n_windows, chunk_size).
input_ids = torch.stack(input_id_chunks)
attention_mask = torch.stack(mask_chunks)

input_dict = {
    'input_ids': input_ids.long(),
    'attention_mask': attention_mask.int()
}

In [36]:
# Inference only: turn autograd off so no computation graph is built or kept
# in memory for the batch of windows.
with torch.no_grad():
    outputs = model(**input_dict)

# outputs[0] holds the classification logits, one row per window. Softmax
# turns each row into class probabilities.
chunk_probs = torch.nn.functional.softmax(outputs[0], dim=-1)

# Average over the windows to get one probability vector for the document.
probs = chunk_probs.mean(dim=0)

In [37]:
# Document-level class probabilities, averaged over all windows.
# NOTE(review): the class order is presumably given by model.config.id2label — confirm.
probs

tensor([0.2564, 0.0428, 0.7008], grad_fn=<MeanBackward1>)