In [1]:
# Install transformers module
!pip install transformers==3.1.0
!pip install ftfy

Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[?25l[K     |▍                               | 10 kB 26.4 MB/s eta 0:00:01[K     |▊                               | 20 kB 13.8 MB/s eta 0:00:01[K     |█▏                              | 30 kB 8.1 MB/s eta 0:00:01[K     |█▌                              | 40 kB 7.8 MB/s eta 0:00:01[K     |█▉                              | 51 kB 4.2 MB/s eta 0:00:01[K     |██▎                             | 61 kB 5.0 MB/s eta 0:00:01[K     |██▋                             | 71 kB 5.2 MB/s eta 0:00:01[K     |███                             | 81 kB 5.9 MB/s eta 0:00:01[K     |███▍                            | 92 kB 4.7 MB/s eta 0:00:01[K     |███▊                            | 102 kB 5.0 MB/s eta 0:00:01[K     |████                            | 112 kB 5.0 MB/s eta 0:00:01[K     |████▌                           | 122 kB 5.0 MB/s eta 0:00:01[K     |████▉                           | 133 kB 5.0 MB/s

In [2]:
import io
import os
import torch
import json
import argparse
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ftfy import fix_text
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification


In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
with open('train.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile,lines = True)

In [5]:
df.columns

Index(['sent_id', 'text', 'sources', 'targets', 'expressions'], dtype='object')

In [6]:
for cols in df:
  print(cols)

sent_id
text
sources
targets
expressions


In [7]:
print(len(df['text']))

1744


In [8]:

#preprocessing training data
import csv
max_len_sent = 0
# Converting json file to csv file
# columns of the new csv file
"""
Each sentence is tokenized into words and in turn each word is labelled.
labels are shown in the following cells
A slight modification to the dataset is done for our convinience.
"""
header = ["Sentence #", "Word", "Tag"]
with open('labeled_words.csv', 'w', encoding='UTF8') as preproc_csv:
  # writer pointer to write into csv file
    writer = csv.writer(preproc_csv)
    writer.writerow(header)
    for i in range(0, len(df)):
      # Number of words in each sentence
        n_words = len(df['text'][i])
        max_len_sent = max(max_len_sent,n_words)
        if(n_words == 0):
          # If there is null string
            continue
        for j in range(0, n_words):
            # Unlabelled token
            token = "O"
            # Assigning labels
            if(len(df['sources'][i][j]) > 1):
                token = df['sources'][i][j]
            elif(len(df['targets'][i][j]) > 1):
                token = df['targets'][i][j]
            elif(len(df['expressions'][i][j]) > 1):
                token = df['expressions'][i][j]
            sentence_no = "Sentence: " + str(i)
            # Appending each row to the csv file
            row = []
            row.append(sentence_no)
            row.append(df['text'][i][j])
            row.append(token)
            writer.writerow(row)


In [9]:
print(max_len_sent)

127


In [10]:
data = pd.read_csv('labeled_words.csv', encoding='UTF-8')
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,Experienced,O
1,Sentence: 0,staff,O
2,Sentence: 0,and,O
3,Sentence: 0,had,O
4,Sentence: 0,a,O
5,Sentence: 0,memorable,O
6,Sentence: 0,stay,O
7,Sentence: 1,India,O
8,Sentence: 1,as,O
9,Sentence: 1,a,O


In [11]:
# Number of words in the whole train.json file
data.count()

Sentence #    25677
Word          25677
Tag           25677
dtype: int64

In [12]:
# Number of labels which were discussed earlier
print("Number of tags: {}".format(len(data.Tag.unique())))
# Frequency of each tags
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 11


O                  15249
I-exp-Positive      2125
B-exp-Positive      1991
B-targ-Positive     1566
I-exp-Negative      1462
I-targ-Positive     1143
B-exp-Negative       781
B-targ-Negative      581
I-targ-Negative      576
B-holder             201
I-holder               2
Name: Tag, dtype: int64

In [13]:
i = 0
# Dictionary of labels
labels_to_ids = {}
ids_to_labels = {}
for label in data.Tag:
  if label not in labels_to_ids.keys():
    ids_to_labels[i] = label
    labels_to_ids[label] = i
    i = i + 1

In [14]:
ids_to_labels

{0: 'O',
 1: 'B-exp-Positive',
 2: 'I-exp-Positive',
 3: 'B-targ-Positive',
 4: 'I-targ-Positive',
 5: 'B-holder',
 6: 'B-exp-Negative',
 7: 'I-exp-Negative',
 8: 'B-targ-Negative',
 9: 'I-targ-Negative',
 10: 'I-holder'}

In [15]:
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head(10)

Unnamed: 0,Sentence #,Word,Tag,sentence,word_labels
0,Sentence: 0,Experienced,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
1,Sentence: 0,staff,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
2,Sentence: 0,and,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
3,Sentence: 0,had,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
4,Sentence: 0,a,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
5,Sentence: 0,memorable,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
6,Sentence: 0,stay,O,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
7,Sentence: 1,India,O,India as a country has always fascinated me an...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
8,Sentence: 1,as,O,India as a country has always fascinated me an...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
9,Sentence: 1,a,O,India as a country has always fascinated me an...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [16]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)

In [17]:
data.iloc[3].sentence

'Yes , it really was a great experience and we visited various places but the most wonderful part of the trip was our stay at the Oberoi Udaivilas Luxury Hotel .'

In [18]:
data.iloc[3].word_labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-exp-Positive,I-exp-Positive,I-exp-Positive,I-exp-Positive,I-exp-Positive,I-exp-Positive,I-exp-Positive,O,O,O,O,B-targ-Positive,I-targ-Positive,I-targ-Positive,I-targ-Positive,I-targ-Positive,O'

In [19]:
data.head(10)

Unnamed: 0,sentence,word_labels
0,Experienced staff and had a memorable stay,"O,O,O,O,O,O,O"
1,India as a country has always fascinated me an...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,One of my friends who had been there before wa...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,"Yes , it really was a great experience and we ...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-exp-Positive,I-e..."
4,I can ’t explain in words how grand this place...,"O,O,O,O,O,O,B-exp-Positive,I-exp-Positive,B-ta..."
5,It is a unique blend of the old world royal ch...,"B-targ-Positive,O,O,B-exp-Positive,I-exp-Posit..."
6,I ’m definitely going there again whenever I g...,"B-holder,O,B-exp-Positive,I-exp-Positive,B-tar..."
7,Bit pricey and but away from center,"B-exp-Negative,I-exp-Negative,O,O,B-exp-Negati..."
8,You need to count 20 · 30 minutes walking time...,"O,O,O,O,B-exp-Negative,I-exp-Negative,I-exp-Ne..."
9,There is no sauna nor swimming pool .,"B-exp-Negative,I-exp-Negative,I-exp-Negative,B..."


In [20]:
f"Length of training data = {len(data)}"

'Length of training data = 1710'

In [21]:
# MAX_LEN = 128
# TRAIN_BATCH_SIZE = 32
# VALID_BATCH_SIZE = 16
# LEARNING_RATE = 1e-05
# MAX_GRAD_NORM = 10
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [22]:
class dataset(Dataset):
  # Takes pandas dataframe,tokenizer ,max_len as input
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # Using tokenizer to encode the sentence
        # Padding of max_len is applied
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        

        labels = [labels_to_ids[label] for label in word_labels] 
        # pad with -100
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [23]:
import sklearn
from sklearn.model_selection import train_test_split

In [24]:
# Bert tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
MAX_LEN = 128

training_set = dataset(data, tokenizer, MAX_LEN)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [25]:
len(training_set)

1710

In [26]:
# sample output of one of the sentence from encoder block
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[3]["input_ids"]), training_set[3]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
yes         0
,           0
it          0
really      0
was         0
a           0
great       0
experience  0
and         0
we          0
visited     0
various     0
places      0
but         0
the         1
most        2
wonderful   2
part        2
of          2
the         2
trip        2
was         0
our         0
stay        0
at          0
the         3
obe         4
##roi       -100
ud          4
##ai        -100
##vil       -100
##as        -100
luxury      4
hotel       4
.           0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]

In [27]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [28]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


training_loader = DataLoader(training_set, **train_params)


In [29]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [30]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [31]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 10==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 10 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [32]:
EPOCHS = 8
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 10 training steps: 2.5020554065704346
Training loss per 10 training steps: 2.1852687922391025
Training loss per 10 training steps: 1.9038052331833613
Training loss per 10 training steps: 1.8061110127356745
Training loss per 10 training steps: 1.7123832702636719
Training loss per 10 training steps: 1.6440034193151138
Training loss epoch: 1.6302900005269934
Training accuracy epoch: 0.5450237363273696
Training epoch: 2
Training loss per 10 training steps: 1.4201716184616089
Training loss per 10 training steps: 1.2861196886409412
Training loss per 10 training steps: 1.316675101007734
Training loss per 10 training steps: 1.292523730185724
Training loss per 10 training steps: 1.2653509465659536
Training loss per 10 training steps: 1.2448260760774799
Training loss epoch: 1.2324174134819597
Training accuracy epoch: 0.6291421479113211
Training epoch: 3
Training loss per 10 training steps: 0.977083146572113
Training loss per 10 training steps: 1.04823589866811

In [33]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 10==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 10 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

Validation


In [34]:
with open('dev.json', encoding='utf-8') as inputfile:
    df_dev = pd.read_json(inputfile,lines = True)



with open('labeled_words_dev.csv', 'w', encoding='UTF8') as preproc_csv:
    writer = csv.writer(preproc_csv)
    writer.writerow(header)
    for i in range(0, len(df_dev)):
        n_words = len(df_dev['text'][i])
        if(n_words == 0):
            continue
        for j in range(0, n_words):
            token = "O"
            if(len(df_dev['sources'][i][j]) > 1):
                token = df_dev['sources'][i][j]
            elif(len(df_dev['targets'][i][j]) > 1):
                token = df_dev['targets'][i][j]
            elif(len(df_dev['expressions'][i][j]) > 1):
                token = df_dev['expressions'][i][j]
            sentence_no = "Sentence: " + str(i)
            
            row = []
            row.append(sentence_no)
            row.append(df_dev['text'][i][j])
            row.append(token)
            writer.writerow(row)

data_dev = pd.read_csv('labeled_words_dev.csv', encoding='UTF-8')


# let's create a new column called "sentence" which groups the words by sentence 
data_dev['sentence'] = data_dev[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
data_dev['word_labels'] = data_dev[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))



data_dev = data_dev[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data_dev.head()

Unnamed: 0,sentence,word_labels
0,"very warm welcome at the reception , very frie...","B-exp-Positive,I-exp-Positive,B-targ-Positive,..."
1,"The room is very small , about 10m2","B-targ-Negative,I-targ-Negative,O,B-exp-Negati..."
2,Hotel Premiere Classe Orly Rungis is near the ...,"B-targ-Negative,I-targ-Negative,I-targ-Negativ..."
3,Near the hotel there is a bus stop that goes t...,"O,O,O,B-exp-Positive,I-exp-Positive,B-targ-Pos..."
4,"In this area there is famous Rungis market , t...","O,O,O,B-exp-Positive,I-exp-Positive,O,B-targ-P..."


In [35]:
testing_set = dataset(data_dev, tokenizer, MAX_LEN)

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
testing_loader = DataLoader(testing_set, **test_params)

In [36]:
len(testing_set)

249

In [37]:
labels, predictions = valid(model, testing_loader)

Validation loss per 10 evaluation steps: 0.6322234272956848
Validation loss per 10 evaluation steps: 0.8369430195201527
Validation Loss: 0.7943809907883406
Validation Accuracy: 0.762302514130975


In [38]:
sentence = "Even though the price is decent for paris , I would not recommend this hotel ."

In [39]:
inputs = tokenizer(sentence.split(),
                    is_pretokenized=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=MAX_LEN,
                    return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(sentence.split())
print(prediction)

['Even', 'though', 'the', 'price', 'is', 'decent', 'for', 'paris', ',', 'I', 'would', 'not', 'recommend', 'this', 'hotel', '.']
['O', 'O', 'B-targ-Positive', 'I-targ-Positive', 'O', 'B-exp-Positive', 'O', 'O', 'O', 'B-holder', 'B-exp-Negative', 'B-exp-Negative', 'I-exp-Negative', 'B-targ-Negative', 'I-targ-Negative', 'O']


In [40]:
print(sentence[10:18])


h the pr


In [41]:
i = 0
while i < len(prediction):
  print(prediction[i])
  i+=1

O
O
B-targ-Positive
I-targ-Positive
O
B-exp-Positive
O
O
O
B-holder
B-exp-Negative
B-exp-Negative
I-exp-Negative
B-targ-Negative
I-targ-Negative
O


In [42]:

holders = []
holders_span = []
pos_targets = []
pos_targets_span = []
neg_targets = []
neg_targets_span = []
pos_expressions = []
pos_expressions_span = []
neg_expressions = []
neg_expressions_span = []

n = len(prediction)
words = sentence.split()
i = 0
curr_index = 0
start_index = 0
end_index = 0

while i < n:
  word = "" 
  if prediction[i][0] == 'O':
    curr_index += len(words[i]) +1
    i += 1
   
  elif prediction[i][0:5] == 'B-tar' and prediction[i][7] == 'P':
    word = words[i]
    start_index = curr_index
    curr_index += len(words[i]) + 1
    i += 1

    while i < n and prediction[i][0:5] == 'I-tar' and prediction[i][7] == 'P':
      word += " "
      word += words[i]
      curr_index += len(words[i]) + 1
      i += 1
    end_index = curr_index
    sp = str(start_index) + ":" + str(end_index-1)
    pos_targets_span.append(sp)
    pos_targets.append(word) 

  elif prediction[i][0:5] == 'B-tar' and prediction[i][7] == 'N':
    word = words[i]
    start_index = curr_index
    curr_index += len(words[i]) + 1
    i += 1

    while i < n and prediction[i][0:5] == 'I-tar' and prediction[i][7] == 'N':
      word += " "
      word += words[i]
      curr_index += len(words[i]) + 1
      i += 1
    end_index = curr_index
    sp = str(start_index) + ":" + str(end_index-1)
    neg_targets_span.append(sp)
    neg_targets.append(word) 

  elif prediction[i][0:5] == 'B-exp' and prediction[i][6] == 'P':
    word = words[i]
    start_index = curr_index
    curr_index += len(words[i]) + 1
    i += 1

    while i < n and prediction[i][0:5] == 'I-exp' and prediction[i][6] == 'P':
      word += " "
      word += words[i]
      curr_index += len(words[i]) + 1
      i += 1
    end_index = curr_index
    sp = str(start_index) + ":" + str(end_index-1)
    pos_expressions_span.append(sp)
    pos_expressions.append(word) 

  elif prediction[i][0:5] == 'B-exp' and prediction[i][6] == 'N':
    word = words[i]
    start_index = curr_index
    curr_index += len(words[i]) + 1
    i += 1

    while i < n and prediction[i][0:5] == 'I-exp' and prediction[i][6] == 'N':
      word += " "
      word += words[i]
      curr_index += len(words[i]) + 1
      i += 1
    end_index = curr_index
    sp = str(start_index) + ":" + str(end_index-1)
    neg_expressions_span.append(sp)
    neg_expressions.append(word) 


  elif prediction[i][0:5] == 'B-hol':
    word = words[i]
    start_index = curr_index
    curr_index += len(words[i]) + 1
    i += 1

    while i < n and prediction[i][0:5] == 'I-hol':
      word += " "
      word += words[i]
      curr_index += len(words[i]) + 1
      i += 1
    end_index = curr_index
    sp = str(start_index) + ":" + str(end_index-1)
    holders_span.append(sp)
    holders.append(word) 

  # curr_index += len(words[i]) + 1
  # i += 1
  

print(holders , holders_span)
print(pos_targets , pos_targets_span)
print(neg_targets,neg_targets_span)
print(pos_expressions,pos_expressions_span)
print(neg_expressions,neg_expressions_span)

# targets = pos_targets + neg_targets
# expressions = pos_expressions + neg_expressions


['I'] ['44:45']
['the price'] ['12:21']
['this hotel'] ['66:76']
['decent'] ['25:31']
['would', 'not recommend'] ['46:51', '52:65']


In [43]:
with open('test.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile,lines = True)


In [44]:
def convert(lst):
  return ' '.join(lst)

In [45]:
print(len(df))

499


In [46]:
dict_list = []
for w in range(0,len(df['text'])):
  print(f"Processing : {w}")
  words = df['text'][w]
  inputs = tokenizer(words,
                    is_pretokenized=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=MAX_LEN,
                    return_tensors="pt")

  # move to gpu
  ids = inputs["input_ids"].to(device)
  mask = inputs["attention_mask"].to(device)
  # forward pass
  outputs = model(ids, attention_mask=mask)
  logits = outputs[0]

  active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
  flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

  tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
  token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
  wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

  prediction = []
  for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
    #only predictions on first word pieces are important
    if mapping[0] == 0 and mapping[1] != 0:
      prediction.append(token_pred[1])
    else:
      continue


  if w == 2 : 
    print(df['text'][w])
    print(prediction)

  holders = []
  holders_span = []
  pos_targets = []
  pos_targets_span = []
  neg_targets = []
  neg_targets_span = []
  pos_expressions = []
  pos_expressions_span = []
  neg_expressions = []
  neg_expressions_span = []

  n = len(prediction)
  i = 0
  curr_index = 0
  start_index = 0
  end_index = 0

  while i < n:
    word = "" 
    if prediction[i][0] == 'O':
      curr_index += len(words[i]) +1
      i += 1
    
    elif prediction[i][0:5] == 'B-tar' and prediction[i][7] == 'P':
      word = words[i]
      start_index = curr_index
      curr_index += len(words[i]) + 1
      i += 1

      while i < n and prediction[i][0:5] == 'I-tar' and prediction[i][7] == 'P':
        word += " "
        word += words[i]
        curr_index += len(words[i]) + 1
        i += 1
      end_index = curr_index
      sp = str(start_index) + ":" + str(end_index-1)
      pos_targets_span.append(sp)
      pos_targets.append(word) 

    elif prediction[i][0:5] == 'B-tar' and prediction[i][7] == 'N':
      word = words[i]
      start_index = curr_index
      curr_index += len(words[i]) + 1
      i += 1

      while i < n and prediction[i][0:5] == 'I-tar' and prediction[i][7] == 'N':
        word += " "
        word += words[i]
        curr_index += len(words[i]) + 1
        i += 1
      end_index = curr_index
      sp = str(start_index) + ":" + str(end_index-1)
      neg_targets_span.append(sp)
      neg_targets.append(word) 

    elif prediction[i][0:5] == 'B-exp' and prediction[i][6] == 'P':
      word = words[i]
      start_index = curr_index
      curr_index += len(words[i]) + 1
      i += 1

      while i < n and prediction[i][0:5] == 'I-exp' and prediction[i][6] == 'P':
        word += " "
        word += words[i]
        curr_index += len(words[i]) + 1
        i += 1
      end_index = curr_index
      sp = str(start_index) + ":" + str(end_index-1)
      pos_expressions_span.append(sp)
      pos_expressions.append(word) 

    elif prediction[i][0:5] == 'B-exp' and prediction[i][6] == 'N':
      word = words[i]
      start_index = curr_index
      curr_index += len(words[i]) + 1
      i += 1

      while i < n and prediction[i][0:5] == 'I-exp' and prediction[i][6] == 'N':
        word += " "
        word += words[i]
        curr_index += len(words[i]) + 1
        i += 1
      end_index = curr_index
      sp = str(start_index) + ":" + str(end_index-1)
      neg_expressions_span.append(sp)
      neg_expressions.append(word) 


    elif prediction[i][0:5] == 'B-hol':
      word = words[i]
      start_index = curr_index
      curr_index += len(words[i]) + 1
      i += 1

      while i < n and prediction[i][0:5] == 'I-hol':
        word += " "
        word += words[i]
        curr_index += len(words[i]) + 1
        i += 1
      end_index = curr_index
      sp = str(start_index) + ":" + str(end_index-1)
      holders_span.append(sp)
      holders.append(word)

    else:
      curr_index += len(words[i]) + 1
      i += 1



  
  dict1 = {"sent_id":df["sent_id"][w] ,"text" : convert(df["text"][w]),"opinions" : []}
  for x in range(0,len(holders)):
    for y in range(0,len(pos_expressions)):
      for z in range(0,len(pos_targets)):
        op_dict = {"Source":[[],[]] , "Target" : [[],[]],"Polar_expression" : [[],[]],"Polarity" : "Positive","Intensity" : "Standard"}
        op_dict["Source"][0].append(holders[x])
        op_dict["Source"][1].append(holders_span[x])
        op_dict["Polar_expression"][0].append(pos_expressions[y])
        op_dict["Polar_expression"][1].append(pos_expressions_span[y])
        op_dict["Target"][0].append(pos_targets[z])
        op_dict["Target"][1].append(pos_targets_span[z])
        dict1["opinions"].append(op_dict)

  for x in range(0,len(holders)):
    for y in range(0,len(neg_expressions)):
      for z in range(0,len(neg_targets)):
        op_dict = {"Source":[[],[]] , "Target" : [[],[]],"Polar_expression" : [[],[]],"Polarity" : "Negative","Intensity" : "Standard"}
        op_dict["Source"][0].append(holders[x])
        op_dict["Source"][1].append(holders_span[x])
        op_dict["Polar_expression"][0].append(neg_expressions[y])
        op_dict["Polar_expression"][1].append(neg_expressions_span[y])
        op_dict["Target"][0].append(neg_targets[z])
        op_dict["Target"][1].append(neg_targets_span[z])
        dict1["opinions"].append(op_dict)
  
  dict_list.append(dict1)



Processing : 0
Processing : 1
Processing : 2
['A', 'wonderful', 'place', 'to', 'go', 'and', 'we', 'are', 'planning', 'to', 'return', 'as', 'soon', 'as', 'we', 'can', '.']
['O', 'B-exp-Positive', 'B-targ-Positive', 'O', 'I-exp-Positive', 'O', 'B-holder', 'O', 'B-exp-Positive', 'I-exp-Positive', 'I-exp-Positive', 'O', 'O', 'O', 'O', 'O', 'O']
Processing : 3
Processing : 4
Processing : 5
Processing : 6
Processing : 7
Processing : 8
Processing : 9
Processing : 10
Processing : 11
Processing : 12
Processing : 13
Processing : 14
Processing : 15
Processing : 16
Processing : 17
Processing : 18
Processing : 19
Processing : 20
Processing : 21
Processing : 22
Processing : 23
Processing : 24
Processing : 25
Processing : 26
Processing : 27
Processing : 28
Processing : 29
Processing : 30
Processing : 31
Processing : 32
Processing : 33
Processing : 34
Processing : 35
Processing : 36
Processing : 37
Processing : 38
Processing : 39
Processing : 40
Processing : 41
Processing : 42
Processing : 43
Processi

In [67]:
import json
json_object = json.dumps(dict_list)
with open("predictions.json", "w") as outfile:
    outfile.write(json_object)
outfile.close()

In [68]:
def check(list1,list2):
  if(len(list1) != len(list2)):
    return False
  else:
    for i in range(len(list1)):
      flag = 0
      for j in range(len(list2)):
        if list1[i] == list2[j]:
          flag = 1
      if(flag == 0):
        return False
    return True

In [74]:
def Calc_accuracy(gold_tuples,predicted_tuples):
  correct_values = 0
  for w in range(0,len(gold_tuples)):
    if gold_tuples[w]['sent_id'] == predicted_tuples[w]['sent_id']:
      if len(gold_tuples[w]['opinions']) != len(predicted_tuples[w]['opinions']):
        continue
      else:
        for i in range(0,len(gold_tuples[w]['opinions'])):
          flag = 0
          for j in range(0,len(predicted_tuples[w]['opinions'])):
            if check(gold_tuples[w]['opinions'][i]["Source"],predicted_tuples[w]['opinions'][j]["Source"]) and check(gold_tuples[w]['opinions'][i]["Target"],predicted_tuples[w]['opinions'][j]["Target"]) and check(gold_tuples[w]['opinions'][i]["Polar_expression"],predicted_tuples[w]['opinions'][j]["Polar_expression"]) :
              correct_values += 1
  return correct_values

In [92]:
with open('gold_test.json', encoding='utf-8') as inputfile:
  gold_df = json.load(inputfile)
with open('predictions.json', encoding='utf-8') as inpfile:
  predicted_df = json.load(inpfile)

# print(gold_df)
print(gold_df[0])
print(gold_df[1])
print(Calc_accuracy(gold_df,predicted_df))

{'sent_id': 'opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-5', 'text': 'So wonderful to see people go to work smiling and leave work still smiling and happy .', 'opinions': []}
{'sent_id': 'opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6', 'text': 'They were there just to keep us happy .', 'opinions': [{'Source': [['us'], ['29:31']], 'Target': [['They'], ['0:4']], 'Polar_expression': [['to keep', 'happy'], ['21:28', '32:37']], 'Polarity': 'Positive', 'Intensity': 'Standard'}]}
7
