# Prediction on BERT Model

In [1]:
#import torch
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
#!pip install transformers -q

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Checkpoint directories of the three fine-tuned classifiers.
output_dir = './model_save/'
s_output_dir = './s_model_save/'
e_output_dir = './e_model_save/'

# All inference in this notebook is pinned to the CPU.
device = torch.device("cpu")

# The vocabulary is read from the first checkpoint directory; the same
# tokenizer output is later fed to all three models.
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Restore the fine-tuned sequence classifiers with 11, 3 and 4 output labels
# (the aspect, sentiment and emotion label sets defined further down).
model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=11)
s_model = BertForSequenceClassification.from_pretrained(s_output_dir, num_labels=3)
e_model = BertForSequenceClassification.from_pretrained(e_output_dir, num_labels=4)

model.to(device)
s_model.to(device)
# Deliberately the last expression of the cell: the notebook echoes its
# return value (the module summary).
e_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [5]:
import spacy
import re
import os
import pandas as pd
from tqdm import tqdm

!python -m spacy download en_core_web_sm -q
# Load the small English pipeline downloaded above.
nlp = spacy.load("en_core_web_sm")

# Add a sentencizer component so that `doc.sents` remains available when the
# parser is disabled at `nlp.pipe` time (see sentence_segment_filter_docs).
# NOTE(review): create_pipe + add_pipe(component) is the spaCy v2 calling
# convention - confirm the pinned spaCy version before upgrading.
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [6]:
# Transcript to score. Its base name without the extension is reused to name
# the intermediate corpus file.
file_path = "Western Union Co_20170502-Text.txt"
file_name = os.path.splitext(os.path.basename(file_path))[0]


### Prediction

In [8]:
import time
import datetime
def format_time(elapsed):
    """
    Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is first rounded to the nearest whole second and then formatted
    by ``str(datetime.timedelta(...))``, so durations of a day or more come
    out as e.g. ``"1 day, 1:00:00"``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Wall-clock start; read again at the end of the run to report elapsed time.
t0 = time.time()

# Load the transcript: every line of the file becomes one entry of `scripts`
# (line terminators are kept, exactly as returned by readlines()).
with open(file_path, 'r') as file:
    mydata = file.readlines()
scripts = list(mydata)

def sentence_segment_filter_docs(doc_array):
    """
    Split every text in ``doc_array`` into sentences with the global ``nlp``
    pipeline.

    The parser, tagger and NER components are disabled for the run, so the
    sentence boundaries come from whatever other component sets them (the
    sentencizer added when ``nlp`` was built). No filtering on the number of
    sentences is applied here.

    Returns a list with one entry per input text; each entry is the list of
    that text's sentences with surrounding whitespace stripped.
    """
    segmented = []
    piped_docs = nlp.pipe(doc_array, disable=['parser', 'tagger', 'ner'], batch_size=1000, n_threads=8)
    for parsed in piped_docs:
        stripped = []
        for span in parsed.sents:
            stripped.append(span.text.strip())
        segmented.append(stripped)
    return segmented


print(f'Found {len(scripts)} transcripts')
print(f'Tokenizing Transcripts...')

# Segment once and reuse the result for both the sentence count and the
# corpus file (running the segmentation a second time only repeats the work).
sentences = sentence_segment_filter_docs(scripts)
nr_sents = sum(len(sents) for sents in sentences)
print(f'Segmented {nr_sents} transcript sentences')

# Save to file
fn_out = f'corpus_{file_name}.txt'

# The corpus is read back with encoding='utf-8' later in the notebook, so it
# is written as UTF-8 explicitly rather than with the platform default.
with open(fn_out, "w", encoding='utf-8') as f:
    for sents in tqdm(sentences):
        # Keep sentences that still contain something once spaces and
        # newlines are removed; embedded newlines are dropped so that each
        # sentence occupies exactly one line of the corpus file.
        real_sents = [
            s.replace('\n', '')
            for s in sents
            if s.replace(' ', '').replace('\n', '') != ''
        ]
        # Only paragraphs with at least one sentence are written; "|||"
        # marks the end of the paragraph.
        if real_sents:
            f.write("\n".join(real_sents) + "|||" + "\n")

print(f'Done writing to {fn_out}')


import argparse
import collections
import logging
import json
import re
import math
import numpy as np

import torch
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertModel


class InputExample:
    """One raw input record: an id, a first text and an optional second text."""

    def __init__(self, unique_id, text_a, text_b):
        # unique_id: identifier of the record
        # text_a:    first (or only) text
        # text_b:    second text of a pair; callers pass None when absent
        self.unique_id, self.text_a, self.text_b = unique_id, text_a, text_b

corpus = []
unique_id = 0
# Model-input length of every corpus line, measured in WordPiece tokens and
# including the [CLS]/[SEP] special tokens.
count = []
with open(fn_out, "r", encoding='utf-8') as input_file:
    for line in tqdm(input_file):
        line = line.strip()
        text_a = None
        text_b = None
        # "left ||| right" lines form a sentence pair; any other line is a
        # single sentence whose trailing "|||" paragraph marker is removed.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = re.sub(r"(\|\|\|)$", "", line)
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        corpus.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
        # Measure the length in WordPiece tokens, not whitespace-separated
        # words: the sequence length below is applied AFTER WordPiece
        # tokenization, and a word-based limit silently truncates sentences
        # whose words split into several pieces.
        cnt = len(tokenizer.tokenize(text_a)) + 2      # [CLS] ... [SEP]
        if text_b:
            cnt += len(tokenizer.tokenize(text_b)) + 1  # ... [SEP]
        count.append(cnt)

if not count:
    # max() on an empty list would raise an opaque ValueError; fail clearly.
    raise ValueError(f'No sentences were read from {fn_out}; nothing to predict')

# Longest input rounded up to the next multiple of 10, but never longer than
# the position-embedding table of the models that will consume it.
max_positions = min(clf.config.max_position_embeddings for clf in (model, s_model, e_model))
MAX_LEN = min(int(math.ceil(max(count)/10)*10), max_positions)

print('Max sentence length: ' + str(MAX_LEN))

# Maximum total input sequence length after WordPiece tokenization (int).
# Sequences longer than this are truncated and shorter ones are padded when
# the features are built. The original BERT paper used a length of 512.
seq_length = MAX_LEN

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

class InputFeatures:
    """Container for the model-ready features of a single input record."""

    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        # Identity of the record and the token strings it was built from.
        self.unique_id, self.tokens = unique_id, tokens
        # Parallel per-position lists: token ids, attention mask, segment ids.
        self.input_ids, self.input_mask, self.input_type_ids = (
            input_ids,
            input_mask,
            input_type_ids,
        )

# Turn every corpus record into fixed-length model inputs.
features = []
for sent_pair in corpus:
    tokens_a = tokenizer.tokenize(sent_pair.text_a)
    tokens_b = tokenizer.tokenize(sent_pair.text_b) if sent_pair.text_b else None

    if tokens_b:
        # Pair input: trim both lists in place, leaving room for
        # [CLS], [SEP], [SEP] (hence "- 3").
        _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    elif len(tokens_a) > seq_length - 2:
        # Single input: leave room for [CLS] and [SEP] (hence "- 2").
        tokens_a = tokens_a[0:(seq_length - 2)]

    # First segment: [CLS] tokens_a [SEP], all with segment id 0.
    tokens = ["[CLS]", *tokens_a, "[SEP]"]
    input_type_ids = [0] * len(tokens)

    # Optional second segment: tokens_b [SEP], all with segment id 1.
    if tokens_b:
        tokens += [*tokens_b, "[SEP]"]
        input_type_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Attention mask: 1 for real tokens, 0 for the padding added below.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    pad_len = seq_length - len(input_ids)
    if pad_len > 0:
        zeros = [0] * pad_len
        input_ids.extend(zeros)
        input_mask.extend(zeros)
        input_type_ids.extend(zeros)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    features.append(
        InputFeatures(
            unique_id=sent_pair.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids,
        )
    )
    
# Inference uses a batch size of 32.
batch_size = 32
# -1 selects the plain sequential sampler below; any other value selects the
# distributed sampler.
local_rank = -1

# Lookup table from a record's unique id to its feature object.
unique_id_to_feature = {feat.unique_id: feat for feat in features}

# Multi-GPU wrapping is left disabled in this notebook:
#   torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
#   torch.nn.DataParallel(model)

# Tensors for the prediction set: token ids and attention mask per sentence,
# plus a running row index so each batch item can be traced to its sentence.
prediction_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
prediction_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
prediction_input_ids_index = torch.arange(prediction_input_ids.size(0), dtype=torch.long)

prediction_data = TensorDataset(prediction_input_ids, prediction_input_mask, prediction_input_ids_index)

# Sampler and loader; each batch holds up to `batch_size` sentences.
prediction_sampler = (
    SequentialSampler(prediction_data)
    if local_rank == -1
    else DistributedSampler(prediction_data)
)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
# No of item in dataloader = Total sample / Batch_size



# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_input_ids)))


def _predict_probabilities(classifier, input_ids, attention_mask):
    """Run one classifier on a batch and return its softmax probabilities.

    The first element of the model output is taken as the logits; softmax is
    applied over the last dimension and the result is returned as a NumPy
    array on the CPU.
    """
    outputs = classifier(input_ids, token_type_ids=None, attention_mask=attention_mask)
    return outputs[0].softmax(dim=-1).detach().cpu().numpy()


# Put the models in evaluation mode
model.eval()
s_model.eval()
e_model.eval()

# Per-batch probability arrays for the aspect, sentiment and emotion models.
# They are collected in lists and concatenated once after the loop, instead
# of re-concatenating (and re-running argmax over) everything on each batch.
batch_probs, s_batch_probs, e_batch_probs = [], [], []

for batch in prediction_dataloader:
    # Move the batch to the target device and unpack it
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_input_ids_index = batch

    # No gradients are needed for prediction; this saves memory and time
    with torch.no_grad():
        batch_probs.append(_predict_probabilities(model, b_input_ids, b_input_mask))
        s_batch_probs.append(_predict_probabilities(s_model, b_input_ids, b_input_mask))
        e_batch_probs.append(_predict_probabilities(e_model, b_input_ids, b_input_mask))

if not batch_probs:
    # Without this guard an empty loader leaves the result variables
    # undefined and the failure only surfaces later as a NameError.
    raise ValueError('No sentences to predict: the prediction dataloader is empty')

# Class probabilities, one row per sentence in dataloader order
predictions = np.concatenate(batch_probs, axis=0)
s_predictions = np.concatenate(s_batch_probs, axis=0)
e_predictions = np.concatenate(e_batch_probs, axis=0)

# Index of the most probable class for every sentence
a_class = np.argmax(predictions, axis=1).flatten()
s_class = np.argmax(s_predictions, axis=1).flatten()
e_class = np.argmax(e_predictions, axis=1).flatten()
        
        
        
# Aspect classes; the position in the list is the class id.
label_list = [
    "sales",
    "earnings",
    "op_costs",
    "products_services",
    "organic_expansion",
    "acquisitions",
    "competition",
    "op_risks",
    "debt",
    "not_applicable",
    "NIL",
]
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))
num_labels = len(label_list) # 11

# Sentiment classes (order follows Slabel_f)
Slabel_list = ["Negative", "Neutral", "Positive"]
Slabel2id = {name: idx for idx, name in enumerate(Slabel_list)}
Sid2label = dict(enumerate(Slabel_list))
s_num_labels = len(Slabel_list) # 3

# Emotion classes (order follows Elabel_f)
Elabel_list = ["Confident", "Dodgy", "NIL", "Uncertain"]
Elabel2id = {name: idx for idx, name in enumerate(Elabel_list)}
Eid2label = dict(enumerate(Elabel_list))
e_num_labels = len(Elabel_list) # 4

# Text of every corpus record, in the same order as the predictions
flat_txt = [sent_pair.text_a for sent_pair in corpus]

# Human-readable predicted class per sentence, formatted as "<id>-<label>"
a_txt = ["-".join((str(a), id2label[int(a)])) for a in np.nditer(a_class)]
s_txt = ["-".join((str(s), Sid2label[int(s)])) for s in np.nditer(s_class)]
e_txt = ["-".join((str(e), Eid2label[int(e)])) for e in np.nditer(e_class)]

# One single-column frame each for the text and the three predicted classes
text_df = pd.DataFrame({"text": flat_txt})
a_class_df = pd.DataFrame({"Aspect": a_txt})
s_class_df = pd.DataFrame({"Sentiment": s_txt})
e_class_df = pd.DataFrame({"Emotion": e_txt})

# Per-class probabilities, one column per label name.
# NOTE(review): "NIL" is both an aspect and an emotion label, so the combined
# frame carries two columns named "NIL" - confirm downstream readers cope.
a_df = pd.DataFrame(data=predictions, columns=list(label2id))
s_df = pd.DataFrame(data=s_predictions, columns=list(Slabel2id))
e_df = pd.DataFrame(data=e_predictions, columns=list(Elabel2id))

# Side-by-side: text, predicted classes, then the probability columns
output_df = pd.concat(
    [text_df, a_class_df, s_class_df, e_class_df, a_df, s_df, e_df],
    axis=1,
)

# Saving to CSV
output_df.to_csv('predicted1.csv', index=True, header=True)

# Elapsed time since t0 was taken at the start of the run
elapsed_txt = format_time(time.time() - t0)
print("Prediction took: {:}".format(elapsed_txt))
print('    DONE.')


Found 205 transcripts
Tokenizing Transcripts...


100%|██████████| 205/205 [00:00<00:00, 106612.81it/s]
213it [00:00, 25570.63it/s]

Segmented 213 transcript sentences
Done writing to corpus_Western Union Co_20170502-Text.txt
Max sentence length: 50





Predicting labels for 213 test sentences...
Prediction took: 0:01:04
    DONE.


In [9]:
# Sanity check: every row holds softmax probabilities, so after rounding to
# three decimals the first row of each model's output should sum to ~1.
# The aspect message previously read "No. of Aspect Classification", which
# mislabelled the printed probability sum; it now matches the other two.
predictions = np.round(predictions, decimals=3)
print("Sum of Aspect Classification Total Probability across " + str(len(predictions[0])) + " Aspects is " + str(sum(predictions[0])))

s_predictions = np.round(s_predictions, decimals=3)
print("Sum of Sentiment Classification Total Probability across " + str(len(s_predictions[0])) + " Sentiment is " + str(sum(s_predictions[0])))

e_predictions = np.round(e_predictions, decimals=3)
print("Sum of Emotion Classification Total Probability across " + str(len(e_predictions[0])) + " Emotion is " + str(sum(e_predictions[0])))

# Show a random sample of ten result rows as the cell output
output_df.sample(10)

No. of Aspect Classification across 11 Aspects is 0.9989999756217003
Sum of Sentiment Classification Total Probability across 3 Sentiment is 1.0000000050058588
Sum of Emotion Classification Total Probability across 4 Emotion is 1.0000000222353265


Unnamed: 0,text,Aspect,Sentiment,Emotion,sales,earnings,op_costs,products_services,organic_expansion,acquisitions,...,debt,not_applicable,NIL,Negative,Neutral,Positive,Confident,Dodgy,NIL.1,Uncertain
43,Business Solutions revenues declined 6% or 3% ...,1-earnings,0-Negative,2-NIL,0.048653,0.908751,0.01311,0.006053,0.006235,0.007603,...,0.002613,0.000572,0.001907,0.970712,0.002794,0.026494,0.041203,0.000886,0.947248,0.010663
166,"We can be even better there, having the right ...",1-earnings,2-Positive,2-NIL,0.02488,0.333908,0.287877,0.028933,0.040301,0.003152,...,0.009348,0.02187,0.200412,0.018283,0.317467,0.66425,0.370206,0.002409,0.604078,0.023307
185,Our customers really don't have to leave the m...,3-products_services,1-Neutral,2-NIL,0.048526,0.031799,0.01198,0.388563,0.026304,0.000999,...,0.00178,0.197168,0.266475,0.013992,0.931684,0.054325,0.115261,0.00534,0.833769,0.045629
54,We recorded $14 million of WU Way expenses in ...,2-op_costs,2-Positive,2-NIL,0.001914,0.056989,0.840234,0.005711,0.026417,0.00568,...,0.012039,0.002256,0.024528,0.011204,0.073355,0.915441,0.277299,0.002954,0.690457,0.02929
111,the infrastructure is right to improve there,9-not_applicable,1-Neutral,2-NIL,0.000741,0.000707,0.000727,0.001918,0.000847,9e-05,...,0.000203,0.93619,0.05697,0.000825,0.989476,0.009699,0.022568,0.000331,0.975968,0.001132
17,"In addition, we have changed the methodology o...",0-sales,1-Neutral,2-NIL,0.894372,0.08306,0.001807,0.007982,0.004843,0.001023,...,0.000488,0.0014,0.002108,0.083388,0.463083,0.453529,0.112637,0.003802,0.839491,0.044071
158,"You know our business very well, Ashwin, depen...",9-not_applicable,1-Neutral,2-NIL,0.014367,0.017743,0.012068,0.015073,0.015121,0.000321,...,0.001991,0.46511,0.437931,0.002495,0.983937,0.013568,0.211924,0.006327,0.737437,0.044312
60,The higher tax rate in the quarter was due to ...,10-NIL,0-Negative,2-NIL,0.094916,0.156458,0.072805,0.144568,0.112398,0.019536,...,0.012565,0.095582,0.233808,0.636758,0.27957,0.083672,0.087676,0.002124,0.896938,0.013262
209,That's something that we are extremely optimis...,0-sales,2-Positive,0-Confident,0.435926,0.044272,0.01169,0.022795,0.181553,0.0015,...,0.005102,0.057259,0.211716,0.008604,0.107139,0.884257,0.546111,0.003871,0.415448,0.03457
28,Revenue growth in the region is consistent wit...,0-sales,2-Positive,0-Confident,0.956939,0.024689,0.000817,0.005211,0.006709,0.00069,...,0.000339,0.000954,0.001473,0.024684,0.014574,0.960743,0.962446,0.000869,0.033721,0.002964
