# Longformer 3.0
We are going to try to set up a Longformer model to handle the SEC filings because of BERT's sequence length limitation. 

The model we are going to set up is the LongformerModelForSequenceClassification by HuggingFace: 
https://huggingface.co/transformers/model_doc/longformer.html#longformerforsequenceclassification

Original repository: 
https://github.com/allenai/longformer





In [1]:
!pip install transformers
import re
import random
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from tqdm.notebook import tqdm
from transformers import (
    LongformerTokenizer,
    LongformerForSequenceClassification,
    LongformerConfig,
    LongformerModel,
    AutoConfig,
    AdamW,
    get_linear_schedule_with_warmup
)
from fastai.text import *
from fastai.metrics import *

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 17.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 46.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=8fd1ab7e17

## Loading data

In [2]:
df = pd.read_csv('/content/consolidated-data')
df.head()

Unnamed: 0.1,Unnamed: 0,CIK Code,Filing,Date Filed_x,Ticker,CompanyName,Previous close,Next Open
0,0,1364954,An investment in our securities involves a hig...,2021-02-27,CHGG,"CHEGG, INC",96.529999,95.870003
1,1,1738758,An investment in our securitiesis speculative ...,2020-09-14,GSMG,GLORY STAR NEW MEDIA GROUP HOLDINGS Ltd,3.01,3.0
2,2,1738758,An investment in our securitiesis speculative ...,2020-09-14,GSMG,GLORY STAR NEW MEDIA GROUP HOLDINGS Ltd,3.42,3.6
3,3,1674930,Investing in our common stock involves risk. B...,2020-08-12,FLGT,"Fulgent Genetics, Inc.",30.25,30.610001
4,4,1422892,Beforeyou make a decision to invest in our sec...,2018-02-16,SINO,"Sino-Global Shipping America, Ltd.",2.02,2.08


In [3]:
labels = []
for i in range(len(df)):
  if df['Next Open'][i] > df['Previous close'][i]:
    labels.append(1)
  else:
    labels.append(0)
df.insert(8, 'Label', labels)
df.drop(columns = 'Unnamed: 0', inplace = True)
df.head()

Unnamed: 0,CIK Code,Filing,Date Filed_x,Ticker,CompanyName,Previous close,Next Open,Label
0,1364954,An investment in our securities involves a hig...,2021-02-27,CHGG,"CHEGG, INC",96.529999,95.870003,0
1,1738758,An investment in our securitiesis speculative ...,2020-09-14,GSMG,GLORY STAR NEW MEDIA GROUP HOLDINGS Ltd,3.01,3.0,0
2,1738758,An investment in our securitiesis speculative ...,2020-09-14,GSMG,GLORY STAR NEW MEDIA GROUP HOLDINGS Ltd,3.42,3.6,1
3,1674930,Investing in our common stock involves risk. B...,2020-08-12,FLGT,"Fulgent Genetics, Inc.",30.25,30.610001,1
4,1422892,Beforeyou make a decision to invest in our sec...,2018-02-16,SINO,"Sino-Global Shipping America, Ltd.",2.02,2.08,1


In [4]:
x = df.Filing.values
y = df.Label.values

data = {
    'Filing': x,
    'Label': y
}

exp = pd.DataFrame(data)
exp.head()

Unnamed: 0,Filing,Label
0,An investment in our securities involves a hig...,0
1,An investment in our securitiesis speculative ...,0
2,An investment in our securitiesis speculative ...,1
3,Investing in our common stock involves risk. B...,1
4,Beforeyou make a decision to invest in our sec...,1


In [5]:
train_ratio = 0.75
val_ratio = 0.15
test_ratio = 0.10

x_train, x_test, y_train, y_test = train_test_split(exp.Filing.values,
                                                    exp.Label.values,
                                                    test_size = 1 - train_ratio,
                                                    random_state = 32)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                test_size = test_ratio/(test_ratio + val_ratio))

In [6]:
exp.head()

Unnamed: 0,Filing,Label
0,An investment in our securities involves a hig...,0
1,An investment in our securitiesis speculative ...,0
2,An investment in our securitiesis speculative ...,1
3,Investing in our common stock involves risk. B...,1
4,Beforeyou make a decision to invest in our sec...,1


## Longformer

In [7]:
model_name = 'allenai/longformer-base-4096'
config = AutoConfig.from_pretrained(model_name,
                                    num_labels = 2,
                                    overwrite_cache = True,
                                    gradient_checkpointing = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=694.0, style=ProgressStyle(description_…




In [9]:
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                            config = config)
configuration = model.config

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [None]:
configuration

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.4.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = model.to(device)

## Tokenizer

In [11]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096', max_length = 1024,
                                                do_lower_case = True)
#tokenizer.model_max_length = model.config.max_position_embeddings

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [12]:
train_data_enc = tokenizer.batch_encode_plus(
    x_train,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    return_tensors = 'pt'
)

val_data_enc = tokenizer.batch_encode_plus(
    x_val,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    return_tensors = 'pt'
)

Token indices sequence length is longer than the specified maximum sequence length for this model (4319 > 4096). Running this sequence through the model will result in indexing errors


In [13]:
train_input_ids = train_data_enc['input_ids']
train_att_masks = train_data_enc['attention_mask']
train_labels = torch.tensor(y_train)

In [None]:
# global_attention_mask1 = torch.zeros_like(input_ids_train)
# global_attention_mask1[:,0] = 1
# output = model(input_ids_train[0],attention_masks_train[0],labels_train[0])

In [14]:
val_input_ids = val_data_enc['input_ids']
val_att_masks = val_data_enc['attention_mask']
val_labels = torch.tensor(y_val)

train_set = TensorDataset(train_input_ids,
                          train_att_masks,
                          train_labels)
val_set = TensorDataset(val_input_ids,
                        val_att_masks,
                        val_labels)

In [15]:
batch_size = 1 ## using batch size 1 because of the CUDA OOM problem

train_dataloader = DataLoader(train_set,
                              sampler = RandomSampler(train_set),
                              batch_size = batch_size)

val_dataloader = DataLoader(val_set,
                            sampler = RandomSampler(val_set),
                            batch_size = batch_size)


In [16]:
optimizer = optim.SGD(model.parameters(), lr = 0.001)

epochs = 1

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = len(train_dataloader)*epochs)

## Metrics

In [17]:
def f1_score_func(preds, labels):
  preds_flat = np.argmax(preds, axis = 1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average = 'weighted')

def accuracy(preds, labels):
  label_dict_inverse = {v: k for k,v in label_dict.items()}
  preds_flat = np.argmax(preds, axis = 1).flatten()
  labels_flat = labels.flatten()
  for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')
        print(f'Percentage: {(len(y_preds[y_preds==label]))*100/len(y_true)}\n')

In [18]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        #take out input values 
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        #evaluating without gradient
        with torch.no_grad():        
            outputs = model(**inputs).to_fp16() ## read somewhere that using fp16 may save some memory,
                                                ## not really sure how to set it up

        #Calculate everything, no need for optimizing weights,gradients,sheduler etc  
        loss = outputs[0]

        logits = outputs[1]
        print('eval logits')
        print(logits)

        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [19]:

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(train_dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        
        #take out inputs
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 } 
   
        #insert the input into the model and get the result
        outputs = model(**inputs).to_fp16()

        # pooled_output of shape [batch_size, 768] with representations for the entire input sequences
        # sequence_output of shape [batch_size, max_seq_length, 768] with representations for each input token (in context).
        loss = outputs[0]

        #calculate loss
        loss_train_total += loss.item()

        #this will calculate the gradients
        loss.backward()
        # for preventening gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        #this will update the weights 
        optimizer.step()

        #this will empty the gradients from the previous iterations
        model.zero_grad() 

        #optimizing learning rate
        scheduler.step()

        
           

        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    #torch.save(model.state_dict(), f'/content/Gdrive/My Drive/finetuned_longformer_epoch_{epoch}.model')
    #torch.save(model.state_dict(), f'checkpoint{epoch}.pth')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(train_dataloader)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(val_dataloader)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    #(tensor(1.3484, device='cuda:0', grad_fn=<NllLossBackward>), tensor([[ 0.1061, -0.4233,  0.6779,  0.0332],

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=177.0, style=ProgressStyle(description_widt…

RuntimeError: ignored