Reference: [How to Apply Transformers to Any Length of Text](https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f)

## IMPORT FIN-BERT MODEL

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# The tokenizer and the classification model are loaded from the same
# pretrained checkpoint id, so name it once and reuse it for both.
finbert_checkpoint = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)

## IMPORT DATAFRAME WITH ALL TEXTS

In [None]:
from utils import *

# Load the filings (df_from_filings comes from the project-local `utils`
# module; presumably it returns a DataFrame with a 'text' column -- verify).
df = df_from_filings()
# Take the first row's text by position. `df['text'][0]` is a label lookup
# and raises KeyError whenever the index does not contain the label 0
# (e.g. after filtering, sorting or sampling).
txt = df['text'].iloc[0]
# Bare expression kept last so the notebook renders the DataFrame.
df

In [None]:
def predict_finbert(txt, *, chunksize=512, verbose=True):
    """Score a text of any length with FinBERT by averaging over chunks.

    The text is tokenized without special tokens, split into windows of
    ``chunksize - 2`` tokens, and each window is wrapped in [CLS] ... [SEP]
    and padded to ``chunksize``. All windows are run through the
    module-level ``model`` as one batch; the per-window softmax
    probabilities are then averaged.

    Args:
        txt: The text to classify.
        chunksize: Total length of each model input, including the two
            special tokens. Defaults to 512 (presumably the model's maximum
            sequence length -- confirm before raising it).
        verbose: When True (the default), print the averaged probabilities
            before returning them.

    Returns:
        A list of averaged class probabilities, one per model label.
        According to the original author's note the order is
        positive, negative, neutral.

    Raises:
        ValueError: If ``chunksize`` is too small to hold [CLS], [SEP] and
            at least one real token.
    """
    if chunksize < 3:
        raise ValueError("chunksize must be at least 3 ([CLS] + 1 token + [SEP])")

    # Tokenize the whole text in one go. Special tokens are added per chunk
    # below, so they must not be inserted here.
    encoded = tokenizer.encode_plus(txt, add_special_tokens=False,
                                    return_tensors='pt')
    # Force integer dtype up front so every concatenation below stays integral.
    token_ids = encoded['input_ids'][0].long()
    token_mask = encoded['attention_mask'][0].long()

    # Read the special-token ids from the tokenizer rather than hard-coding
    # the vocabulary positions.
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    # Two slots per chunk are reserved for [CLS] and [SEP].
    window = chunksize - 2

    id_rows = []
    mask_rows = []
    for id_chunk, mask_chunk in zip(token_ids.split(window),
                                    token_mask.split(window)):
        # Only the last chunk can be short; pad_len is 0 for full chunks and
        # the resulting empty tensors are harmless in torch.cat.
        pad_len = window - id_chunk.shape[0]
        id_rows.append(torch.cat([
            torch.tensor([cls_id]),
            id_chunk,
            torch.tensor([sep_id]),
            torch.full((pad_len,), pad_id, dtype=torch.long),
        ]))
        mask_rows.append(torch.cat([
            torch.tensor([1]),
            mask_chunk,
            torch.tensor([1]),
            torch.zeros(pad_len, dtype=torch.long),
        ]))

    input_dict = {
        'input_ids': torch.stack(id_rows).long(),
        'attention_mask': torch.stack(mask_rows).int(),
    }

    # Inference only: without no_grad an autograd graph is retained for
    # every chunk, which wastes memory on long documents.
    with torch.no_grad():
        outputs = model(**input_dict)

    # outputs[0] holds the logits; softmax per chunk, then average chunks.
    probs = torch.nn.functional.softmax(outputs[0], dim=-1).mean(dim=0).tolist()
    if verbose:
        print(probs)
    return probs

In [None]:
# Reload the filings, then keep a random subset of five rows so the
# scoring cell below stays quick to run.
df = df_from_filings()
df = df.sample(n=5)

In [None]:
%%time
# predict_finbert returns one list of three scores per row, so `apply`
# yields a Series with one list per filing. Unpacking that Series directly
# iterates over rows rather than score components; zip(*...) transposes it
# into three per-column sequences in row order.
finbert_scores = df['text'].apply(predict_finbert)
df['finbert_positive'], df['finbert_negative'], df['finbert_neutral'] = zip(*finbert_scores)