### NLPAUG - BERT Base Uncased

#### Un-augmented test set
#### Augment only the training set

#### Get Original Paper Data

In [1]:
# !pip install sklearn
# !pip install ekphrasis
# !pip install transformers
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import os

import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
import torch

# Report the PyTorch CUDA setup; the nlpaug augmenter created later in this
# notebook is configured with device='cuda'.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device.
# torch.cuda.current_device() raises when no CUDA device is usable, so it is
# only queried after the availability check; cuda_id is None otherwise.
cuda_id = torch.cuda.current_device() if cuda_available else None

if cuda_id is not None:
    print(f"ID of current CUDA device: {cuda_id}")
    print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")
else:
    print("No CUDA device available; skipping device ID and name.")

Is CUDA supported by this system? True
CUDA version: 11.0
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3090


In [5]:
# Restore a LabelEncoder by assigning its classes_ from the array stored in
# ../Data/classes.npy instead of fitting it on the labels again.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load this file if it comes from a trusted source.
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [6]:
tf.__version__

'2.6.0'

In [7]:
transformers.__version__

'4.16.2'

In [8]:
# test, train, dev examples from base notebook

# All three splits sit in the same directory and follow one file-name pattern,
# so the location is spelled out once instead of three times.
SPLIT_DIR = '../2_DA_Accuracy_Experiments/Saved_Models/EDA_base_uncased_5aug'
SPLIT_FILE_PATTERN = 'All_DA_BERT_base_uncased_{split}_examples.csv'

train_data_df, dev_data_df, test_data_df = (
    pd.read_csv(f'{SPLIT_DIR}/{SPLIT_FILE_PATTERN.format(split=split_name)}')
    for split_name in ('train', 'dev', 'test')
)

In [9]:
train_data_df.head()

Unnamed: 0.1,Unnamed: 0,post_id,text,target1,target2,target3,rationales,final_label,text_combined
0,0,22448349_gab,"['common', 'core', 'weed', 'too', 'much', 'rit...","['Men', 'Women']",['Women'],['None'],[],normal,common core weed too much ritalan chem trails ...
1,1,1178948520201637888_twitter,"['took', 'my', 'nan', 'to', 'the', 'hospital',...",['None'],['None'],['None'],[],normal,took my nan to the hospital for a x ray i turn...
2,2,1482573_gab,"['<user>', 'well', 'not', 'really', 'islam', '...",['Islam'],['Other'],['Islam'],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...",offensive,<user> well not really islam does not care for...
3,3,1097184028149587969_twitter,"['<user>', 'france', 'in', '<number>', 'after'...","['Islam', 'Other']",['Islam'],['Islam'],[],normal,<user> france in <number> after muslims take o...
4,4,1089569255111176192_twitter,"['i', 'will', 'not', 'tolerate', 'non', 'arab'...","['Arab', 'Men', 'Women']",['Arab'],"['Arab', 'Islam']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",hatespeech,i will not tolerate non arab women slandering ...


In [10]:
# Post identifiers of every split, in train / test / dev order.
X_train_id, X_test_id, X_dev_id = (
    split_df['post_id']
    for split_df in (train_data_df, test_data_df, dev_data_df)
)

In [11]:
# Class label (`final_label` column) of every split, in train / test / dev order.
y_train, y_test, y_dev = (
    split_df['final_label']
    for split_df in (train_data_df, test_data_df, dev_data_df)
)

In [12]:
# Input texts of every split, taken from the `text_combined` column of the
# split DataFrames loaded above.
X_train_text = train_data_df['text_combined'].to_list()
X_dev_text = dev_data_df['text_combined'].to_list()
X_test_text = test_data_df['text_combined'].to_list()

# Number of examples per split (train, dev, test), one per line.
print(len(X_train_text), len(X_dev_text), len(X_test_text), sep='\n')

15383
1923
1923


### Create Augmented Data

In [13]:
#!pip install numpy requests nlpaug

In [14]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action


In [15]:
# Toy sentence; the augmenter is tried on it before the training texts are augmented.
text = 'The quick brown fox jumps over the lazy dog .'
print(text)

The quick brown fox jumps over the lazy dog .


In [18]:
# Contextual word augmentation in "substitute" mode, backed by bert-base-uncased
# and run on the GPU.
# NOTE(review): aug_p is presumably the fraction of words to replace per text --
# confirm against the nlpaug documentation for the installed version.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    aug_p=0.2,
    device='cuda',
    batch_size=256,
)

# Smoke test on the toy sentence: show the input next to one augmented variant.
augmented_text = aug.augment(text, n=1)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
the quick brown eye jumps over the wild dog.


In [19]:
# Only the text and its label are needed for augmentation.
aug_columns = ['text_combined', 'final_label']
train_aug_df = train_data_df[aug_columns]

# Row count followed by a preview of the first rows.
print(len(train_aug_df), train_aug_df.head(), sep='\n')

15383
                                       text_combined final_label
0  common core weed too much ritalan chem trails ...      normal
1  took my nan to the hospital for a x ray i turn...      normal
2  <user> well not really islam does not care for...   offensive
3  <user> france in <number> after muslims take o...      normal
4  i will not tolerate non arab women slandering ...  hatespeech


In [20]:
# contextual word substitution
# Five separate augmentation passes over the full training text list; each pass
# gets a freshly built list and produces one augmented set.
(
    aug_text_set1,
    aug_text_set2,
    aug_text_set3,
    aug_text_set4,
    aug_text_set5,
) = (
    aug.augment(train_aug_df['text_combined'].to_list())
    for _ in range(5)
)

In [21]:
len(aug_text_set1)

15383

In [22]:
# Training labels as a plain list, in the same row order as the texts passed to
# the augmenter; the same list is paired with each augmented text set.
labels = train_aug_df['final_label'].to_list()

In [23]:
def aug_text_to_df(text_list, label_list):
    """Pair augmented texts with their labels in a two-column DataFrame.

    Parameters
    ----------
    text_list : iterable
        Augmented texts, one entry per example.
    label_list : iterable
        Labels, paired with ``text_list`` by position (first label with first
        text, and so on). Any iterable works, including a pandas Series whose
        index is not 0..n-1.

    Returns
    -------
    pandas.DataFrame
        Columns ``text_combined`` and ``final_label``, one row per example,
        with a default 0..n-1 index.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of items. Pairing
        them anyway would silently drop or misalign labels.
    """
    # Materialise both inputs so generators and Series are paired by position.
    texts = list(text_list)
    targets = list(label_list)

    if len(texts) != len(targets):
        raise ValueError(
            f"text_list has {len(texts)} items but label_list has {len(targets)}; "
            "each text needs exactly one label"
        )

    return pd.DataFrame({'text_combined': texts, 'final_label': targets})

In [24]:
# Turn each augmented text set into a DataFrame, reusing the original training
# labels for every set.
(
    df_aug_sub_set1,
    df_aug_sub_set2,
    df_aug_sub_set3,
    df_aug_sub_set4,
    df_aug_sub_set5,
) = (
    aug_text_to_df(aug_texts, labels)
    for aug_texts in (
        aug_text_set1,
        aug_text_set2,
        aug_text_set3,
        aug_text_set4,
        aug_text_set5,
    )
)

# Preview of the first augmented set.
df_aug_sub_set1.head()

Unnamed: 0,text_combined,final_label
0,common core weed too much hate chem trails com...,normal
1,took a nan to the hospital for a vision ray i ...,normal
2,< user > well whatever really islam does need ...,offensive
3,< user > france online < number > after they t...,normal
4,i will not confuse non jewish women for arab w...,hatespeech


In [25]:
df_aug_sub_set2.head()

Unnamed: 0,text_combined,final_label
0,i core weed too long ritalan life trails vegan...,normal
1,took my sister to the hospital for a day until...,normal
2,< user > may not but islam does not care for r...,offensive
3,< user > france 2012 < number > while muslims ...,normal
4,i will not tolerate non arab women slandering ...,hatespeech


In [26]:
df_aug_sub_set3.head()

Unnamed: 0,text_combined,final_label
0,blood core weed a cold ritalan chem trails thi...,normal
1,took my mother to the hospital with a x ray af...,normal
2,< ref > well not tonight i does not care for r...,offensive
3,< r > france in < l > after muslims came over,normal
4,this must not forget non arab women these arab...,hatespeech


In [27]:
df_aug_sub_set4.head()

Unnamed: 0,text_combined,final_label
0,one core weed too much time chem trails vegan ...,normal
1,took my eye above the hospital for a broken ch...,normal
2,< user > but not really either does this go fo...,offensive
3,< scene > france in < number > chinese muslims...,normal
4,i will not permit non arab women slandering ar...,hatespeech


In [28]:
df_aug_sub_set5.head()

Unnamed: 0,text_combined,final_label
0,common love a too much a chem trails vegan die...,normal
1,bringing even nan to texas hospital for a blac...,normal
2,< him > well not you nobody does not care for ...,offensive
3,< user > france 4 < number > there muslims tak...,normal
4,i will not discuss non arab women slandering f...,hatespeech


In [29]:
# train_aug_df.to_csv('../test_data_set/NLPAUG/original_train_df.csv')

In [30]:
# Write each augmented set to its own CSV; the set number is the only part of
# the file name that varies.
augmented_frames = (
    df_aug_sub_set1,
    df_aug_sub_set2,
    df_aug_sub_set3,
    df_aug_sub_set4,
    df_aug_sub_set5,
)
for set_number, augmented_df in enumerate(augmented_frames, start=1):
    augmented_df.to_csv(f'../test_data_set/NLPAUG/NLPAUG_sub_set{set_number}_0_2_df.csv')