Reference: [How to Apply Transformers to Any Length of Text](https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f)

## IMPORT FIN-BERT MODEL

In [None]:
from utils import *
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Single source of truth for the checkpoint name, so the tokenizer and the
# classifier can never be loaded from two different checkpoints by accident.
finbert_checkpoint = 'ProsusAI/finbert'

# Sequence-classification model and its matching WordPiece tokenizer; both
# are used as module-level globals by the cells below.
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)

In [None]:
def predict_finbert(txt, chunksize=512):
    """Score a text of arbitrary length with the module-level FinBERT model.

    The text is tokenized without special tokens, cut into windows of
    ``chunksize - 2`` tokens, and every window is wrapped as
    ``[CLS] ... [SEP]`` and padded up to ``chunksize``. All windows are
    classified in one batch and the per-window softmax probabilities are
    averaged (unweighted) into a single distribution.

    Relies on the globals ``tokenizer`` and ``model`` defined earlier in
    the notebook.

    Args:
        txt: Text to classify.
        chunksize: Length of each model input, including the two special
            tokens. Must be at least 3. Presumably it must also not exceed
            the model's maximum sequence length (512 for BERT) - confirm
            before raising it.

    Returns:
        list[float]: Mean class probabilities in the order of the model's
        output logits (noted in the original code as positive, negative,
        neutral).

    Raises:
        ValueError: If ``chunksize`` leaves no room for text tokens.
    """
    if chunksize < 3:
        raise ValueError(
            f'chunksize must be at least 3 to fit [CLS], one token and [SEP]; got {chunksize}'
        )

    # Tokenize the whole text at once; special tokens are added per window.
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,
                                   return_tensors='pt')

    # Two positions of every window are reserved for [CLS] and [SEP].
    window = chunksize - 2
    id_chunks = tokens['input_ids'][0].long().split(window)
    mask_chunks = tokens['attention_mask'][0].long().split(window)

    # Ask the tokenizer for the special-token IDs instead of hard-coding
    # the BERT vocabulary positions (101 / 102 / 0).
    cls_id = torch.tensor([tokenizer.cls_token_id])
    sep_id = torch.tensor([tokenizer.sep_token_id])
    pad_id = tokenizer.pad_token_id
    attended = torch.tensor([1])

    padded_ids, padded_masks = [], []
    for ids, mask in zip(id_chunks, mask_chunks):
        # Only the last window can be short; pad_len is 0 for full windows.
        pad_len = window - ids.shape[0]
        # Padding is created as int64 directly, so concatenation never
        # promotes the ID tensors to float.
        padded_ids.append(torch.cat([
            cls_id, ids, sep_id,
            torch.full((pad_len,), pad_id, dtype=torch.long),
        ]))
        padded_masks.append(torch.cat([
            attended, mask, attended,
            torch.zeros(pad_len, dtype=torch.long),
        ]))

    # One row per window; moved to wherever the model's weights live.
    input_dict = {
        'input_ids': torch.stack(padded_ids).to(model.device),
        'attention_mask': torch.stack(padded_masks).to(model.device),
    }

    # Inference only: skip autograd bookkeeping to save memory on long texts.
    with torch.no_grad():
        outputs = model(**input_dict)

    # outputs[0] holds the logits; softmax per window, then average windows.
    probs = torch.nn.functional.softmax(outputs[0], dim=-1)
    probs = probs.mean(dim=0).tolist()
    print(probs)
    return probs

In [None]:
# Pickle file that stores the dataframe with the FinBERT predictions; it
# lives in the sentiment-analysis folder under the data directory.
_sentiment_dir = os.path.join('data', 'sentiment_analysis')
path_finbert_df = os.path.join(_sentiment_dir, 'df_finbert_predictions.pkl')

In [None]:
# Reset FINBERT analysis results

#df = df_from_filings()
#df['finbert_positive'], df['finbert_negative'], df['finbert_neutral'] = zip(*df.text.apply(lambda x : [None, None, None]))
#df.to_pickle(path_finbert_df)

In [None]:
# Load pickle file with analysis results of FINBERT
# df = load_pkl(path_finbert_df)

In [None]:
# %%time
# df['finbert_positive'], df['finbert_negative'], df['finbert_neutral'] = zip(*df.text.apply(lambda txt: predict_finbert(txt)))

In [None]:
%%time
from IPython.display import clear_output

# Resumable scoring: rows whose 'finbert_positive' is still missing are
# scored `step` at a time, and the dataframe is pickled after every batch so
# an interrupted run can pick up where it stopped.
step = 5          # rows scored between two checkpoints
max_texts = 2000  # upper bound on rows scored by one run of this cell
score_columns = ['finbert_positive', 'finbert_negative', 'finbert_neutral']

# Loaded once: every batch is written back to the same file, so re-reading
# it on each iteration would only return what is already in memory.
df = load_pkl(path_finbert_df)

for _ in range(0, max_texts, step):
    pending = df['finbert_positive'].isna()
    nb_analyzed = int((~pending).sum())
    print(f'Number of analyzed texts : {nb_analyzed}')

    # Stop cleanly once everything is scored instead of failing on an
    # empty selection.
    if not pending.any():
        break

    # Keep only the first `step` pending rows: the running count of pending
    # rows is 1..step exactly on those rows.
    batch_mask = pending & (pending.cumsum() <= step)

    # Every selected row is unscored by construction, so it is always
    # predicted; no truthiness test on the stored value (NaN is truthy).
    scores = [predict_finbert(text) for text in df.loc[batch_mask, 'text']]
    for column, values in zip(score_columns, zip(*scores)):
        df.loc[batch_mask, column] = list(values)

    # Checkpoint after each batch, then wipe the cell output so the
    # progress line does not pile up.
    df.to_pickle(path_finbert_df)
    clear_output(wait=False)

In [None]:
# The raw text is no longer needed once the scores are computed; removing
# that column keeps the predictions file small.
df = df.drop(columns=['text'])
save_pkl(df, path_finbert_df)