<a href="https://colab.research.google.com/github/chriszxy/DL_NLP_EIB_Project/blob/main/BERT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 7.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.0 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.1.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting unin

In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

# Models
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training
import torch.optim as optim

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
import pandas as pd
import numpy as np

corpus = pd.read_csv('/content/gdrive/My Drive/real whole corpus.csv', names = ['company', 'text'])
score = pd.read_csv("/content/gdrive/My Drive/real BB company.csv", index_col = 0)

data = corpus.merge(score, left_on = 'company', right_on = 'Name')
data = data.drop('Name', axis=1).dropna()

In [6]:
data = data[['text','ENV_DISCLOSURE_SCORE']]
data

Unnamed: 0,text,ENV_DISCLOSURE_SCORE
0,dear sharehold busi wa mark by signific challe...,5.357143
2,document de référenc le rapport financi annue...,43.410853
4,of from the corpor sustain stakehold goal su...,27.131783
6,ocado corpor respons report ocado way i m deli...,18.604651
8,car typeagricultureconstruct metalsconsum prod...,1.550388
...,...,...
786,syneo health sustain snapshot tabl of contents...,1.550388
787,safeharbor s we our kadant orth compani future...,8.527132
788,whichcouldinfluencecustomertrend consumertraff...,3.125000
789,integr report see integr report engi s contrib...,51.162791


In [7]:
train = data[:int(0.5*len(data))]
valid = data[int(0.5*len(data)):int(0.7*len(data))]
test = data[int(0.7*len(data)):]
train.to_csv('train.csv',encoding='utf_8_sig')
valid.to_csv('valid.csv',encoding='utf_8_sig')
test.to_csv('test.csv',encoding='utf_8_sig')

In [8]:
train 

Unnamed: 0,text,ENV_DISCLOSURE_SCORE
0,dear sharehold busi wa mark by signific challe...,5.357143
2,document de référenc le rapport financi annue...,43.410853
4,of from the corpor sustain stakehold goal su...,27.131783
6,ocado corpor respons report ocado way i m deli...,18.604651
8,car typeagricultureconstruct metalsconsum prod...,1.550388
...,...,...
387,sustain report i am veri pleas with the contin...,1.550388
389,liveabl simpli report group report of leg immo...,26.356589
390,sustain report creat valu over time requir a c...,13.953488
391,to my fellow stockhold the advansix purpos is ...,19.379845


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = 512
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX, truncate_first=True)
#truncate_first=True : truncate text below length of 512

fields = [('ENV_DISCLOSURE_SCORE', label_field),('text', text_field), ]

# TabularDataset

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
train, valid, test = TabularDataset.splits(path='', train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)



Token indices sequence length is longer than the specified maximum sequence length for this model (20366 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Iterators

train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                            train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                             train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, train=False, shuffle=False, sort=False)

In [12]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea

In [13]:
def save_checkpoint(save_path, model, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    
    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [22]:
def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = '/train_file',
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, text) in train_iter:
            labels = labels.type(torch.LongTensor)           
            labels = labels.to()
            text = text.type(torch.LongTensor)  
            text = text.to()
            output = model(text, labels)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    

                    # validation loop
                    for (text, labels), _ in valid_loader:
                        #labels = labels.type(torch.LongTensor)           
                        #labels = labels.to()
                        #text = text.type(torch.LongTensor)  
                        #text = text.to()
                        output = model(text, labels)
                        loss, _ = output
                        
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

model = BERT().to()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer, file_path = '/train_file')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<class 'tuple'>
<class 'NoneType'>


AttributeError: ignored

In [25]:
for i in train_iter:
  print(i)
  break


[torchtext.legacy.data.batch.Batch of size 16]
	[.ENV_DISCLOSURE_SCORE]:[torch.FloatTensor of size 16]
	[.text]:[torch.LongTensor of size 16x512]


In [26]:
for (text, labels) in train_iter:
    print(text)
    break

(tensor([312., 318., 280., 279., 264., 263., 252., 251., 241., 240., 132., 131.,
         76.,  75.,  69.,  68.]), tensor([[  101, 14910, 16558,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        ...,
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]]))


In [15]:
for (text, labels) in train_iter:
    print(text)

(tensor([312., 318., 280., 279., 264., 263., 252., 251., 241., 240., 132., 131.,
         76.,  75.,  69.,  68.]), tensor([[  101, 14910, 16558,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        ...,
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]]))
(tensor([221., 180., 234., 272.,  88., 167., 354., 334., 357., 254., 375., 349.,
         94., 330., 136., 368.]), tensor([[  101,  5142,  2024,  ...,     0,     0,     0],
        [  101,  3067,  2203,  ...,     0,     0,     0],
        [  101,  2238,  1998,  ...,     0,     0,     0],
        ...,
        [  101,  1037,  8754,  ...,     0,     0,     0],
        [  101,  4685,  5435,  ...,     0,     0,     0],
        [  101, 13058,  2953,  ...,     0,     0,     0]]))
(tensor([ 58., 239., 296., 135., 134., 195.,