Reference: [Apply Transformers to Any Length of Text](https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f)

## IMPORT FIN-BERT MODEL

In [30]:
from utils import *
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the FinBERT tokenizer and classifier once at notebook level; the
# predict_finbert function reads both of these module-level names.
FINBERT_CHECKPOINT = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [31]:
def predict_finbert(txt, chunksize=512, verbose=True):
    """Score a text of arbitrary length with FinBERT, averaging over chunks.

    The text is tokenized once without truncation and without special tokens,
    cut into windows of ``chunksize - 2`` tokens, and each window is wrapped in
    the tokenizer's [CLS]/[SEP] ids and padded to ``chunksize``.  All windows
    are passed to the module-level ``model`` as one batch; the per-window
    softmax probabilities are then averaged with equal weight per window.

    Parameters
    ----------
    txt : str
        Text to classify.
    chunksize : int, default 512
        Length of every model input row, including the two special tokens.
        NOTE(review): 512 is presumably the model's maximum sequence length;
        confirm before passing a larger value.
    verbose : bool, default True
        When true, print the averaged probabilities (the original behavior).

    Returns
    -------
    list of float
        Mean class probabilities, one entry per model output label.  The
        original code comment gives the order as positive, negative, neutral.

    Raises
    ------
    ValueError
        If ``chunksize`` leaves no room for any text token.
    """
    if chunksize <= 2:
        raise ValueError("chunksize must be greater than 2 to fit [CLS] and [SEP]")

    # No special tokens and no truncation here: they are added per window below.
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,
                                   return_tensors='pt')

    # Two positions per window are reserved for [CLS] and [SEP].
    window = chunksize - 2
    id_chunks = tokens['input_ids'][0].long().split(window)
    mask_chunks = tokens['attention_mask'][0].long().split(window)

    # Pre-fill with padding so every row keeps an integer dtype throughout
    # (concatenating float padding would silently promote the ids to float).
    input_ids = torch.full((len(id_chunks), chunksize),
                           tokenizer.pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((len(id_chunks), chunksize), dtype=torch.int)

    for row, (ids, mask) in enumerate(zip(id_chunks, mask_chunks)):
        sep_pos = ids.shape[0] + 1  # index right after the window's tokens
        input_ids[row, 0] = tokenizer.cls_token_id
        input_ids[row, 1:sep_pos] = ids
        input_ids[row, sep_pos] = tokenizer.sep_token_id
        # The special tokens must be attended to; padding stays at 0.
        attention_mask[row, 0] = 1
        attention_mask[row, 1:sep_pos] = mask
        attention_mask[row, sep_pos] = 1

    # Inference only: skip autograd bookkeeping so long documents do not
    # accumulate a computation graph for every window.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] holds the logits; average the per-window distributions.
    probs = torch.nn.functional.softmax(outputs[0], dim=-1).mean(dim=0).tolist()
    if verbose:
        print(probs)
    return probs

In [40]:
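# One-time setup (kept for reference): load the frame with df_from_filings(), add empty FinBERT score columns, and save it as the checkpoint pickle.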

#df = df_from_filings()
#df['finbert_positive'], df['finbert_negative'], df['finbert_neutral'] = zip(*df.text.apply(lambda x : [None, None, None]))
#df.to_pickle('df_finbert_predictions.pkl')

In [32]:
# df = load_pkl('df_finbert_predictions.pkl')

In [33]:
# %%time
# df['finbert_positive'], df['finbert_negative'], df['finbert_neutral'] = zip(*df.text.apply(lambda txt: predict_finbert(txt)))

In [42]:
%%time
step = 5
for i in range (0, 1300, step):
    df = load_pkl('df_finbert_predictions.pkl')
    mask = df.finbert_positive.isna()
    mask = mask & mask.nlargest(step)
    df.loc[mask,'finbert_positive'], df.loc[mask,'finbert_negative'], df.loc[mask,'finbert_neutral'] = zip(*df[mask].apply(lambda row: predict_finbert(row['text']) if not row['finbert_positive'] else [row['finbert_positive'], row['finbert_negative'], row['finbert_neutral']], axis=1))
    df.to_pickle('df_finbert_predictions.pkl')

[0.06187935173511505, 0.12440332770347595, 0.8137173056602478]
[0.05668359994888306, 0.15130901336669922, 0.7920073866844177]
CPU times: user 5min 47s, sys: 45.3 s, total: 6min 32s
Wall time: 5min 34s


In [43]:
# Display the frame to see which rows already have FinBERT scores filled in.
df

Unnamed: 0,cik,report_type,report_identity,year,file,text,finbert_positive,finbert_negative,finbert_neutral
0,0001567101,N-CSR,0001193125-16-696397,2016,full-submission.txt,UNITED STATES SECURITIES AND EXCHANGE COMMISS...,0.061879,0.124403,0.813717
1,0001567101,N-CSR,0001193125-20-232534,2020,full-submission.txt,UNITED STATES SECURITIES AND EXCHANGE COMMISS...,0.056684,0.151309,0.792007
2,0001567101,N-CSR,0001193125-19-232573,2019,full-submission.txt,UNITED STATES SECURITIES AND EXCHANGE COMMISS...,,,
3,0001567101,N-CSR,0001193125-21-260598,2021,full-submission.txt,UNITED STATES SECURITIES AND EXCHANGE COMMISS...,,,
4,0001567101,N-CSR,0001193125-17-271972,2017,full-submission.txt,UNITED STATES SECURITIES AND EXCHANGE COMMISS...,,,
...,...,...,...,...,...,...,...,...,...
1235,0001845809,N-CSRS,0000928816-22-000501,2022,full-submission.txt,Putnam Sustainable Future ETF Semiannual repo...,,,
1236,0001845809,N-CSRS,0000928816-22-000498,2022,full-submission.txt,Putnam Focused Large Cap Growth ETF Semiannua...,,,
1237,0001845809,N-CSRS,0000928816-22-000499,2022,full-submission.txt,Putnam Focused Large Cap Value ETF Semiannual...,,,
1238,0001849998,N-CSR,0001623632-22-000554,2022,full-submission.txt,United States Securities and Exchange Commiss...,,,
