Install packages

In [53]:
!pip install transformers
!pip install nlpaug
!pip install wget
!pip install matplotlib
!pip install requests



Import packages

In [54]:
import torch
import matplotlib.pyplot as plt
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd
from copy import deepcopy
import os
import json
import random
import nlpaug
import nlpaug.augmenter.word as naw

Mounting drive and loading the JSON data file

In [55]:
# Mount Google Drive at /content/drive so the notebook can access files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Folder on the mounted Drive that holds the CSDS storage files.
path = "/content/drive/My Drive/csds_storage/"
# Loading the saved JSON file. The encoding is stated explicitly so the result
# does not depend on the platform/locale default used by open().
with open(path + 'data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)


A Python function that processes the JSON data and collects the text and head of every CSDS object that has an intensity attribute into a dictionary for later use.

In [57]:
def process_csds_objects(data):
    """
    Group the text and head of CSDS objects by their intensity class.

    It receives the JSON data, which is the JSON form of the CSDS, Target and
    Agent objects, and processes each entry of ``data['csds_objects']``.
    Objects without an intensity (key absent, or an empty/None value) are
    skipped, because the 'intensity' value is optional.

    Args:
        data: dict with a 'csds_objects' key holding an iterable of dicts.
            Every object that has an intensity must also provide 'text' and
            'head'.

    Returns:
        dict mapping each of the seven intensity class names to a tuple
        ``(texts, heads)`` of two parallel lists: ``texts[i]`` and
        ``heads[i]`` come from the same CSDS object.

    Raises:
        KeyError: if 'csds_objects' is missing, if an intensity value is not
            one of the seven known classes, or if an object with an intensity
            lacks 'text' or 'head'.
    """

    lst = {
        'high-extreme': ([], []),
        'extreme': ([], []),
        'medium': ([], []),
        'medium-high': ([], []),
        'low': ([], []),
        'high': ([], []),
        'low-medium': ([], []),
    }

    for item in data['csds_objects']:
        # 'intensity' is optional: use .get() so a missing key is treated the
        # same as an empty value instead of raising KeyError.
        intensity = item.get('intensity')
        if not intensity:
            continue
        texts, heads = lst[intensity]
        # Read both fields before appending so the two lists cannot get out of
        # step if one of the fields is missing.
        text, head = item['text'], item['head']
        texts.append(text)
        heads.append(head)
    return lst


A Python function that augments text with BERT using word **insertion**; it does not change the **head span**.




In [58]:
def bert_augmenter_simple(lst, model_name='bert-base-cased', top_k=20, action='insert', device=None):
    """
    Augment text with a contextual word-embedding (BERT) augmenter.

    A new ``naw.ContextualWordEmbsAug`` is built on every call, so for repeated
    use it is cheaper to build the augmenter once and call ``augment`` on it.

    Args:
        lst: the text(s) to augment; passed unchanged to ``augment``.
        model_name: model path/name given to the augmenter as ``model_path``.
        top_k: ``top_k`` setting of the augmenter (20 here; the original code
            notes the library default as 100).
        action: augmentation action, 'insert' (used here) or 'substitute'.
        device: optional device name such as 'cuda'. When None, no ``device``
            argument is passed and the augmenter uses its own default.

    Returns:
        Whatever ``augment`` returns for ``lst`` (the augmented text).
    """
    aug_kwargs = {'model_path': model_name, 'action': action, 'top_k': top_k}
    if device is not None:
        # Only forward the device when the caller asked for one explicitly.
        aug_kwargs['device'] = device

    aug_bert = naw.ContextualWordEmbsAug(**aug_kwargs)

    augmented_text = aug_bert.augment(lst)
    return augmented_text

Using the defined method to receive the dictionary of intensity classes

In [59]:
# Main code
# Group the loaded CSDS objects by intensity class. Each value of the resulting
# dictionary is a (texts, heads) pair of parallel lists.
classes_lst = process_csds_objects(data)


The following dictionary describes, for each class, either how many times the whole class is augmented or how many augmented items are added to the original data. Data can therefore be added in two ways: <br>
A) Using 'mul' with a number **n**: every item of the class is augmented **n** times, so **n** augmented copies of the whole class are produced.
<br>
B) Using 'add' with a number **n**: the data items are shuffled first, then the first **n** items are taken and augmented, so **n** augmented items (fewer if the class has fewer than **n** items) are added to the original set.

In [60]:
# Augmentation plan per intensity class: class name -> [mode, n].
#   'mul' -> every item of the class is augmented n times.
#   'add' -> the class is shuffled and only the first n items are augmented.
aug_rpt_cnt = {
    'high-extreme': ['mul', 10],
    'extreme': ['mul', 50],
    'medium': ['add', 30],
    'medium-high': ['mul', 2],
    'low': ['mul', 1],
    'high': ['add', 150],
    'low-medium': ['add', 200],
}


The following list defines the classes which we want to augment.

In [61]:
# Classes to augment in this run. With an empty list the augmentation loop does
# nothing; the trailing comment lists the available class names.
to_do_lst = [] #['low', 'high', 'low-medium', 'high-extreme', 'extreme','medium', 'medium-high']

Creating a BERT augmenter with the same settings as in the previously defined function.

In [62]:
# Augmenter settings (same values as used inside bert_augmenter_simple).
model_name = 'bert-base-cased'
TOPK = 20  # default=100
ACT = 'insert'  # "substitute"

# Build the augmenter once so the same instance can be reused for every class.
# A device='cuda' argument is left disabled, as in the original cell.
aug_bert = naw.ContextualWordEmbsAug(model_path=model_name, action=ACT, top_k=TOPK)


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

The following block augments each class of data items (only the classes listed in ***to_do_lst***). As described above, we can either add a number of augmented items to a class or multiply the count of its data items by the specified number.

In [63]:
# The 'path' defines where the augmented data will be written to. It is the
# same for every class, so it is set once before the loop.
path = "/content/drive/My Drive/csds_storage/"

for cl in to_do_lst:
    # Plan entry is [mode, n]; class entry is (texts, heads) with parallel lists.
    aug_type, aug_cnt = aug_rpt_cnt[cl][0], aug_rpt_cnt[cl][1]
    lst_text, lst_head = classes_lst[cl][0], classes_lst[cl][1]

    if aug_type == 'mul':
        # 'mul': the whole class is augmented aug_cnt times (no pass when
        # aug_cnt <= 0).
        batches = [(lst_text, lst_head)] * aug_cnt
    else:
        # 'add' (any mode other than 'mul'): shuffle text/head pairs together
        # so they stay aligned, then keep only the first aug_cnt pairs.
        # Slicing the pair list also works for an empty class, where unpacking
        # zip(*pairs) would raise ValueError.
        pairs = list(zip(lst_text, lst_head))
        random.shuffle(pairs)
        pairs = pairs[:aug_cnt]
        batches = [([text for text, _ in pairs], [head for _, head in pairs])]

    X = []
    y = []
    for texts, heads in batches:
        if not texts:
            # Nothing selected for this class: skip the augmenter call.
            continue
        augmented_text = aug_bert.augment(texts)
        new_pairs = list(zip(augmented_text, heads))
        X.extend(new_pairs)
        # One label per pair actually added. Using the requested count instead
        # would leave y longer than X when the class has fewer items than
        # aug_cnt.
        y.extend([cl] * len(new_pairs))

    # X holds (augmented_text, head) tuples; json.dump writes them as arrays.
    data_aug = {
        'aug_class': cl,
        'X': X,
        'y': y
    }
    # Saving JSON file.
    with open(path + cl + '_data_aug.json', 'w', encoding='utf-8') as outfile:
        json.dump(data_aug, outfile, indent=4)

    print(f'Class {cl} augmentation completed!')