# Prediction on BERT Model

In [1]:
import torch.nn as nn
from transformers.modeling_bert import BertPreTrainedModel, BertModel
from torch.nn import CrossEntropyLoss
from torch.nn import BCEWithLogitsLoss

class BertForMultitask(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForMultitask, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        self.classifier = nn.Linear(config.hidden_size, out_features=11)
        self.s_classifier = nn.Linear(config.hidden_size, out_features=3)
        self.e_classifier = nn.Linear(config.hidden_size, out_features=4)

        self.init_weights()

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, 
                labels=None, s_labels=None, e_labels=None):
        _, pooled_output  = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        
        logits = self.classifier(pooled_output)
        s_logits = self.s_classifier(pooled_output)
        e_logits = self.e_classifier(pooled_output)
        
        outputs = logits, s_logits, e_logits

        return outputs



In [3]:
import torch
from transformers import BertTokenizer

#output_dir = './drive/My Drive/EBAC_G/NLP_Project/BERT/model_Multitask/'
output_dir = './model_save/'

# Load a trained model and vocabulary that you have fine-tuned
model = BertForMultitask.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

device = torch.device("cpu")
model.to(device)

BertForMultitask(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [4]:
import spacy
import re
import os
import pandas as pd
from tqdm import tqdm

!python -m spacy download en_core_web_sm -q
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(nlp.create_pipe('sentencizer'))

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
file_path = "Western Union Co_20170502-Text.txt"
file_name = os.path.basename(file_path)
file_name = os.path.splitext(file_name)[0]

scripts = []
with open(file_path, 'r') as file:
  mydata = file.readlines()
  for lines in mydata:
    scripts.append(lines)

# get sentence segemented review with #sentences > 2
def sentence_segment_filter_docs(doc_array):
    sentences = []
    for doc in nlp.pipe(doc_array, disable=['parser', 'tagger', 'ner'], batch_size=1000, n_threads=8):
        sentences.append([sent.text.strip() for sent in doc.sents])

    return sentences


print(f'Found {len(scripts)} transcripts')
print(f'Tokenizing Transcripts...')

sentences = sentence_segment_filter_docs(scripts)
nr_sents = sum([len(s) for s in sentences])
print(f'Segmented {nr_sents} transcript sentences')


sentences = sentence_segment_filter_docs(scripts)

# Save to file
fn_out = f'corpus_{file_name}.txt'

with open(fn_out, "w") as f:
    for sents in tqdm(sentences):
        real_sents = []
        for s in sents:
            x = s.replace(' ', '').replace('\n', '')
            if x != '':
                real_sents.append(s.replace('\n', ''))
        # filter only paragraph more than or equal to 1 sentence        
        if len(real_sents) >= 1:
            str_to_write = "\n".join(real_sents) + "|||" + "\n"
            f.write(str_to_write)

print(f'Done writing to {fn_out}')


Found 205 transcripts
Tokenizing Transcripts...
Segmented 213 transcript sentences


100%|██████████| 205/205 [00:00<00:00, 95388.54it/s]

Done writing to corpus_Western Union Co_20170502-Text.txt





### Preprocessing the Test Dataset

In [6]:
import argparse
import collections
import logging
import json
import re
import math

import torch
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertModel

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

class InputExample(object):

    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b

corpus = []
unique_id = 0
count = []
with open(fn_out, "r", encoding='utf-8') as input_file:
  for line in tqdm(input_file):
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
      text_a = re.sub(r"(\|\|\|)$", "", line)
    else:
      text_a = m.group(1)
      text_b = m.group(2)
    corpus.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    unique_id += 1
    cnt = len(line.split())
    count.append(cnt)

MAX_LEN = int(math.ceil(max(count)/10)*10)
print(' ')
print('Max sentence length: ' + str(MAX_LEN))

# Set the maximum sequence length.
# In the original paper, the authors used a length of 512.
seq_length = MAX_LEN 
# type=int
# The maximum total input sequence length after WordPiece tokenization. 
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids

features = []
for (txt_index, sent_pair) in enumerate(corpus):
    tokens_a = tokenizer.tokenize(sent_pair.text_a)

    tokens_b = None
    if sent_pair.text_b:
        tokens_b = tokenizer.tokenize(sent_pair.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > seq_length - 2:
            tokens_a = tokens_a[0:(seq_length - 2)]

    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
        input_ids.append(0)
        input_mask.append(0)
        input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    if txt_index < 5:
        logger.info("******")
        logger.info("unique_id: %s" % (sent_pair.unique_id))
        logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
    
        
    features.append(InputFeatures(
                unique_id=sent_pair.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    

213it [00:00, 26869.58it/s]
03/23/2020 19:31:06 - INFO - __main__ -   ******
03/23/2020 19:31:06 - INFO - __main__ -   unique_id: 0
03/23/2020 19:31:06 - INFO - __main__ -   tokens: [CLS] thank you , hi ##km ##et [SEP]
03/23/2020 19:31:06 - INFO - __main__ -   input_ids: 101 4067 2017 1010 7632 22287 3388 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/23/2020 19:31:06 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/23/2020 19:31:06 - INFO - __main__ -   input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/23/2020 19:31:06 - INFO - __main__ -   ******
03/23/2020 19:31:06 - INFO - __main__ -   unique_id: 1
03/23/2020 19:31:06 - INFO - __main__ -   tokens: [CLS] first quarter reported revenues of $ 1 . 3 billion were flat or increased 3 % on a constant currency basis compared to the prior - ye

 
Max sentence length: 50


## Making Predictions


In [10]:
# For Prediction, we try higher batch size of 32

batch_size = 32
local_rank = -1 
#local_rank for distributed training on gpus

# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

#if local_rank != -1:
    #model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
#elif n_gpu > 1:
    #model = torch.nn.DataParallel(model)


# Convert to tensors, need "input_ids & its index", "input_mask" and "input_label"
# For testing set
prediction_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) # Token ids for every sentences in individual list
prediction_input_ids_index = torch.arange(prediction_input_ids.size(0), dtype=torch.long) # Index for each sentences in one list
prediction_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)

prediction_data = TensorDataset(prediction_input_ids, prediction_input_mask, prediction_input_ids_index)

# Create the DataLoader for our testing set.
if local_rank == -1:
    prediction_sampler = SequentialSampler(prediction_data)
else:
    prediction_sampler = DistributedSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) 
# No of item in dataloader = Total sample / Batch_size



In [11]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = None

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_input_ids_index = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits, s_logits, e_logits = model(b_input_ids, token_type_ids=None, 
                                         attention_mask=b_input_mask)

      y_prob = logits.softmax(dim = -1) # normalizes values along axis 1
      s_y_prob = s_logits.softmax(dim = -1)
      e_y_prob = e_logits.softmax(dim = -1)

      if predictions is None:
        predictions = y_prob.detach().cpu().numpy()
        
        s_predictions = s_y_prob.detach().cpu().numpy()
        s_class = np.argmax(s_predictions, axis=1).flatten()
        
        e_predictions = e_y_prob.detach().cpu().numpy()
        e_class = np.argmax(e_predictions, axis=1).flatten()

      else:
        predictions = np.concatenate((predictions, y_prob.detach().cpu().numpy()), axis=0)

        s_predictions = np.concatenate((s_predictions, s_y_prob.detach().cpu().numpy()), axis=0)
        s_class = np.argmax(s_predictions, axis=1).flatten()
        
        e_predictions = np.concatenate((e_predictions, e_y_prob.detach().cpu().numpy()), axis=0)
        e_class = np.argmax(e_predictions, axis=1).flatten()
  
print('    DONE.')

Predicting labels for 213 test sentences...
    DONE.


In [12]:
predictions = np.round(predictions, decimals=3)
print("Sum of Aspect Mining Total Probability across " + str(len(predictions[0])) + " Aspects is " + str(sum(predictions[0])))

s_predictions = np.round(s_predictions, decimals=3)
print("Sum of Sentiment Classification Total Probability across " + str(len(s_predictions[0])) + " Sentiment is " + str(sum(s_predictions[0])))

e_predictions = np.round(e_predictions, decimals=3)
print("Sum of Emotion Classification Total Probability across " + str(len(e_predictions[0])) + " Emotion is " + str(sum(e_predictions[0])))


Sum of Aspect Mining Total Probability across 11 Aspects is 0.9990000128746033
Sum of Sentiment Classification Total Probability across 3 Sentiment is 1.0
Sum of Emotion Classification Total Probability across 4 Emotion is 0.9999999790452421


In [15]:
# Categories of Aspects
label_list = ["sales","earnings","op_costs","products_services","organic_expansion","acquisitions","competition","op_risks","debt","not_applicable","NIL"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
num_labels = len(label_list) # 11

# Categories of Sentiment
Slabel_list = ["Negative","Neutral","Positive",] # Follow order in Slabel_f
Slabel2id = {label: i for i, label in enumerate(Slabel_list)}
Sid2label = {i: label for i, label in enumerate(Slabel_list)}
s_num_labels = len(Slabel_list) # 3

# Categories of Emotion
Elabel_list = ["Confident","Dodgy","NIL","Uncertain"] # Follow order in Elabel_f
Elabel2id = {label: i for i, label in enumerate(Elabel_list)}
Eid2label = {i: label for i, label in enumerate(Elabel_list)}
e_num_labels = len(Elabel_list) # 4

In [16]:
# Extract the text used by the tokenizer
flat_txt = []
for (txt_index, sent_pair) in enumerate(corpus):
  txt = sent_pair.text_a
  flat_txt.append(txt)

# Concat the ids to the Sentiments and Emotion lables
s_txt = [str(s)+ "-" + Sid2label[int(s)] for s in np.nditer(s_class)]
e_txt = [str(e)+ "-" + Eid2label[int(e)] for e in np.nditer(e_class)]

# Concat the Predictions to a dataframe
text_df = pd.DataFrame(data=flat_txt, columns = ["text"])
s_class_df = pd.DataFrame(data=s_txt, columns = ["Sentiment"])
e_class_df = pd.DataFrame(data=e_txt, columns = ["Emotion"])

a_df = pd.DataFrame(data=predictions, columns = list(label2id))

s_df = pd.DataFrame(data=s_predictions, columns = list(Slabel2id))

e_df = pd.DataFrame(data=e_predictions, columns = list(Elabel2id))

output_df = pd.concat([text_df, s_class_df, e_class_df, a_df, s_df, e_df], axis=1)

# Saving to CSV
pred_name = f'predicted_{file_name}.csv'
output_df.to_csv(pred_name, index=True, header=True)

output_df.sample(3)

Unnamed: 0,text,Sentiment,Emotion,sales,earnings,op_costs,products_services,organic_expansion,acquisitions,competition,...,debt,not_applicable,NIL,Negative,Neutral,Positive,Confident,Dodgy,NIL.1,Uncertain
196,"But by the end of the year, we should be at ab...",1-Neutral,2-NIL,0.005,0.083,0.028,0.021,0.013,0.005,0.003,...,0.003,0.053,0.784,0.0,0.996,0.003,0.159,0.002,0.837,0.002
63,"Excluding these expenses, adjusted earnings pe...",0-Negative,2-NIL,0.064,0.694,0.155,0.01,0.009,0.023,0.006,...,0.007,0.009,0.012,0.995,0.002,0.003,0.024,0.004,0.969,0.003
156,And I know the team is doing a little – puttin...,2-Positive,2-NIL,0.034,0.073,0.063,0.395,0.062,0.014,0.014,...,0.007,0.023,0.299,0.0,0.002,0.998,0.039,0.011,0.873,0.077
