Install packages

In [478]:
!pip install transformers
!pip install nlpaug
!pip install wget
!pip install matplotlib
!pip install requests



Import packages

In [479]:
import torch
import matplotlib.pyplot as plt
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd
from copy import deepcopy
import os
import json
import random
import nlpaug
import nlpaug.augmenter.word as naw
from transformers import pipeline
from nltk.tokenize import word_tokenize
import re
import nltk


In [480]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Mounting drive and loading the JSON data file

In [481]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [482]:
path = "/content/drive/My Drive/csds_storage/"
# Loading the saved JSON file.
with open(path + 'data.json') as json_file:
    data = json.load(json_file)


A Python method for processing JSON data in order to put the head and text of CSDS objects (which containing intensity attribute) in a dictionary for next uses. 

In [483]:
def process_csds_objects(data):
    """
    It receives the JSON data which is the JSON format of the CSDS, Target and Agent
    objects, and then extract CSDS objects (which are in JSON format) in order to
    process each of them. 
    """

    lst = {'high-extreme': ([], []), 'extreme': ([], []), 'medium': ([], []), 'medium-high': ([], []), 'low': ([], []), 'high': ([], []), 'low-medium': ([], [])}

    for item in data['csds_objects']:
        #Check if this CSDS object has 'intensity' value, because the 'intensity' value if optional.
        if item['intensity']: 
            lst[item['intensity']][0].append(item['text'])
            lst[item['intensity']][1].append(item['head'])    
    return lst


A Python method, which augments with BERT, uses **insertion** and does not change the **head span**




In [484]:
def bert_augmenter_simple(lst, model_name='bert-base-cased'):
    TOPK=20 #default=100
    ACT='insert' #"substitute"
 
    aug_bert = naw.ContextualWordEmbsAug(
        model_path=model_name,
        #device='cuda',
        action=ACT, top_k=TOPK)

    augmented_text = aug_bert.augment(lst)
    return augmented_text 

Using the defined method to receive the dictionary of intensity classes

In [485]:
# Main code
classes_lst = process_csds_objects(data)


The following dictionary describes how many times we need to augment each class of data or how many items we want to add to the original data. Thus addition of data could be done in 2 ways: <br>
A) Using 'mul' and the corresponding number (we call it **n**), that is all data is augmented by the number **n**.
<br>
B) Using 'add' and the corresponding number (we call it **n**), that is the data items are shuffled first, then the first **n** data items are collected and they will be augmented. So **n** items are added to our original set.

In [486]:
aug_rpt_cnt = {'high-extreme': ['mul', 10], 'extreme': ['mul', 50], 
               'medium': ['add', 30], 'medium-high': ['mul', 2], 
               'low': ['mul', 1], 'high': ['add', 150], 'low-medium': ['add', 200]}


The following list defines the classes which we want to augment.

In [487]:
to_do_lst = [] #['low', 'high', 'low-medium', 'high-extreme', 'extreme','medium', 'medium-high']

Creating a BERT augmenter just like what we did in the previously defined method.

In [488]:
model_name='bert-base-cased'
TOPK=20 #default=100
ACT='insert' #"substitute"

aug_bert = naw.ContextualWordEmbsAug(
    model_path=model_name,
    #device='cuda',
    action=ACT, top_k=TOPK)


The following block augments each class of data items (just the classes we want to augment, present in the ***to_do_lst*** list), Based on our previous descriptions we can add to data items or multiply the count of data items by our specified number.

In [489]:
for cl in to_do_lst:
    aug_type, aug_cnt = aug_rpt_cnt[cl][0], aug_rpt_cnt[cl][1]
    X = []
    y = []
    if aug_type == 'mul':
        cnt = aug_cnt
        lst_text = classes_lst[cl][0]
        lst_head = classes_lst[cl][1]
        while cnt > 0:
            augmented_text = aug_bert.augment(lst_text)
            for ii in range(len(augmented_text)):
                X.append((augmented_text[ii], lst_head[ii]))
            y += [cl] * len(lst_head)
            cnt -= 1
    else:
        cnt = aug_cnt
        lst_text = classes_lst[cl][0]
        lst_head = classes_lst[cl][1]
        c = list(zip(lst_text, lst_head))
        random.shuffle(c)
        lst_text, lst_head = zip(*c)
        lst_text, lst_head = list(lst_text), list(lst_head)
        augmented_text = aug_bert.augment(lst_text[: cnt])
        for ii in range(len(augmented_text)):
            X.append((augmented_text[ii], lst_head[ii]))
        y += [cl] * cnt

    #
    data_aug = {
    'aug_class': cl,
    'X': X,
    'y': y
    }
    del X, y
    # The 'path' defines where the augmented data will be written to.
    path = "/content/drive/My Drive/csds_storage/"
    # Saving JSON file.
    with open(path + cl + '_data_aug.json',  'w') as outfile:
        json.dump(data_aug, outfile, indent=4)

    print(f'Class {cl} augmentation completed!')

In [490]:
# Create a reference variable for Class WhitespaceTokenizer
unmasker = pipeline('fill-mask', model=model_name)    
     


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [491]:
stopwords = nltk.corpus.stopwords.words('english')

In [492]:
def is_valid_token(token):
    if len(token) < 2 or token in stopwords:
        return False
    return True

In [493]:
total_costum_text_augments = []
total_costum_head_augments = []

In [494]:
EXTEND_CNT = 2

In [495]:
def custom_augment_rec(text_tokens, head_tokens, i, j, head_beg, head_end, n):
    if i == n:
        total_costum_text_augments.append(str(' '.join(text_tokens)).replace('  ', ' '))
        total_costum_head_augments.append(str(' '.join(head_tokens)).replace('  ', ' '))
    else:
        if i >= head_beg and i < head_end:
            custome_augment_rec(text_tokens, head_tokens, i+1, j+1, head_beg, head_end, n)
        else:
            custom_augment_rec(text_tokens, head_tokens, i+1, j, head_beg, head_end, n)
        if is_valid_token(text_tokens[i]):
            text_tokens[i] = '[MASK]'        
            result = unmasker(str(' '.join(text_tokens)))

            if i >= head_beg and i < head_end:
                for k in range(EXTEND_CNT):
                    text_tokens[i] = result[k]['token_str']
                    head_tokens[j] = text_tokens[i]
                    custom_augment_rec(text_tokens, head_tokens, i+1, j+1, head_beg, head_end, n)
            else:
                for k in range(EXTEND_CNT):
                    text_tokens[i] = result[k]['token_str']
                    custom_augment_rec(text_tokens, head_tokens, i+1, j, head_beg, head_end, n)


In [496]:
def custom_augment(text, head):
    # Clear the answers list
    total_costum_text_augments.clear()
    total_costum_head_augments.clear()
    text_first = text[: text.find(head)]
    text_first_tokens = word_tokenize(text_first)
    text_tokens = word_tokenize(text)
    head_tokens = word_tokenize(head)
    custom_augment_rec(text_tokens, head_tokens, 0, 0, len(text_first_tokens), len(text_first_tokens)+len(head_tokens), len(text_tokens))
    return total_costum_text_augments, total_costum_head_augments


In [497]:
#text = "Indeed, the U.S Administration, which claims to advocate international human rights issues and which tries to play the role of a world policeman, is actually condoning the most atrocious human rights violations committed in the United States itself."
#head = "most atrocious human rights violations"
text = 'Trump lost the 2020 presidential election to Biden but refused to concede.'
head = 'refused'
augmented_text, augmented_head = custom_augment(text, head)
#_ = [print(n) for n in augmented_text]
for i in range(len(augmented_text)):
    print(f'{augmented_text[i]} ===>> {augmented_head[i]}')

Trump lost the 2020 presidential election to Biden but chose to withdraw . ===>> chose
Trump lost the 2020 presidential election to Biden but chose to retire . ===>> chose
Trump lost the 2020 presidential election to Biden but chose to run . ===>> chose
Trump lost the 2020 presidential election to Biden but decided to run . ===>> decided
Trump lost the 2020 presidential election to Biden but decided to retire . ===>> decided
Trump lost the 2020 presidential election to Biden but decided to withdraw . ===>> decided
Trump lost the 2020 presidential election to Trump but decided to run . ===>> decided
Trump lost the 2020 presidential election to Trump but decided to retire . ===>> decided
Trump lost the 2020 presidential election to Trump but decided to run . ===>> decided
Trump lost the 2020 presidential election to Trump but chose to run . ===>> chose
Trump lost the 2020 presidential election to Trump but chose to run . ===>> chose
Trump lost the 2020 presidential election to Trump but 

In [498]:
print(len(augmented_text))

1494
