### NLPAUG - BERT Base Uncased

#### Un-augmented test set
#### Augment only the training set

#### Get Original Paper Data

In [1]:
# Optional environment setup: uncomment to install into the running kernel's environment.
# `%pip` targets the kernel's interpreter; the PyPI name `sklearn` is a deprecated alias
# of `scikit-learn`.
# %pip install scikit-learn
# %pip install ekphrasis
# %pip install transformers==4.16.2
# %pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import os

import logging
# Only let TensorFlow log records of level ERROR and above through, keeping cell output quiet.
tf.get_logger().setLevel(logging.ERROR)

In [3]:
# List the GPU devices visible to TensorFlow (an empty list means none were detected).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Restore a label encoder from a saved class array by assigning `classes_` directly
# instead of calling fit().
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which
# can execute code embedded in the file; only load classes.npy from a trusted source.
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [5]:
# Record the TensorFlow version this notebook was run with.
tf.__version__

'2.6.0'

In [6]:
# Record the Hugging Face transformers version this notebook was run with.
transformers.__version__

'4.16.2'

In [7]:
# Train, dev and test examples saved by the base notebook, read in that order.
# NOTE(review): the files live under `EDA_base_uncased_5aug` with an `All_DA_` prefix;
# confirm they contain the un-augmented examples this notebook expects.
train_data_df, dev_data_df, test_data_df = map(
    pd.read_csv,
    (
        './Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_train_examples.csv',
        './Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_dev_examples.csv',
        './Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_test_examples.csv',
    ),
)

In [8]:
# Preview the first rows of the training split.
train_data_df.head()

Unnamed: 0.1,Unnamed: 0,post_id,text,target1,target2,target3,rationales,final_label,text_combined
0,0,22448349_gab,"['common', 'core', 'weed', 'too', 'much', 'rit...","['Men', 'Women']",['Women'],['None'],[],normal,common core weed too much ritalan chem trails ...
1,1,1178948520201637888_twitter,"['took', 'my', 'nan', 'to', 'the', 'hospital',...",['None'],['None'],['None'],[],normal,took my nan to the hospital for a x ray i turn...
2,2,1482573_gab,"['<user>', 'well', 'not', 'really', 'islam', '...",['Islam'],['Other'],['Islam'],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...",offensive,<user> well not really islam does not care for...
3,3,1097184028149587969_twitter,"['<user>', 'france', 'in', '<number>', 'after'...","['Islam', 'Other']",['Islam'],['Islam'],[],normal,<user> france in <number> after muslims take o...
4,4,1089569255111176192_twitter,"['i', 'will', 'not', 'tolerate', 'non', 'arab'...","['Arab', 'Men', 'Women']",['Arab'],"['Arab', 'Islam']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",hatespeech,i will not tolerate non arab women slandering ...


In [48]:
# Train data text inspection: distribution of post lengths, measured in
# whitespace-separated tokens, for the posts labelled 'normal'.

# Do not truncate wide text columns or long sequences when displaying.
# These pandas options are global and remain in effect for later cells.
pd.set_option("max_colwidth", None)
pd.set_option("max_seq_items", None)

# Rows labelled 'normal', restricted to the id, text and label columns.
normal = train_data_df[train_data_df['final_label'] == 'normal'][['post_id', 'text_combined', 'final_label']]

# Number of posts per token count; value_counts() orders by frequency, most common first.
count = normal.text_combined.str.split().apply(len).value_counts()

# Last expression of the cell: display the length distribution.
count

9     228
18    209
14    203
10    202
7     202
15    201
16    200
17    198
12    198
20    197
11    193
8     188
13    180
19    175
21    169
6     166
23    158
5     148
24    139
22    137
25    117
26    107
32    106
45    104
44    101
46    101
27     96
50     96
35     94
43     93
47     93
30     86
37     86
28     85
36     85
49     83
29     83
48     82
4      82
31     82
40     82
38     80
42     80
33     69
34     65
41     63
51     60
39     59
3      54
52     42
53     13
54     10
55      7
56      7
57      2
58      1
70      1
2       1
63      1
67      1
Name: text_combined, dtype: int64

In [9]:
# `post_id` column (a pandas Series) of each split.
X_train_id, X_test_id, X_dev_id = (
    split_df['post_id'] for split_df in (train_data_df, test_data_df, dev_data_df)
)

In [10]:
# `final_label` column (a pandas Series) of each split.
y_train, y_test, y_dev = (
    split_df['final_label'] for split_df in (train_data_df, test_data_df, dev_data_df)
)

In [11]:
# Plain Python lists holding the `text_combined` string of every example in each split.
# (A disabled earlier variant rebuilt the split frames by merging the post_ids with
# `raw_data_final`; the loaded CSVs already carry `text_combined`, so it is read directly.)
X_train_text = train_data_df['text_combined'].tolist()
X_dev_text = dev_data_df['text_combined'].tolist()
X_test_text = test_data_df['text_combined'].tolist()

# Number of examples in train / dev / test, one value per line.
print(len(X_train_text), len(X_dev_text), len(X_test_text), sep='\n')

15383
1923
1923


In [12]:
# Keep only the text and label columns of the training split; this is the frame the
# augmented sets are concatenated with later on.
original_train_data_df = train_data_df[['text_combined', 'final_label']]
# Preview the first rows.
original_train_data_df.head()

Unnamed: 0,text_combined,final_label
0,common core weed too much ritalan chem trails ...,normal
1,took my nan to the hospital for a x ray i turn...,normal
2,<user> well not really islam does not care for...,offensive
3,<user> france in <number> after muslims take o...,normal
4,i will not tolerate non arab women slandering ...,hatespeech


In [13]:
# Number of training examples per label in the original training set.
original_train_data_df['final_label'].value_counts()

normal        6251
hatespeech    4748
offensive     4384
Name: final_label, dtype: int64

### Get Augmented Data

In [14]:
def load_nlpaug_set(kind, set_number):
    """Load one NLPAUG-augmented copy of the training data from CSV.

    Parameters
    ----------
    kind : str
        Augmentation tag used in the file name: 'ins' or 'sub'
        (presumably insertion / substitution; confirm against the generating notebook).
    set_number : int
        Number of the augmented set as it appears in the file name.

    Returns
    -------
    pandas.DataFrame
        The 'text_combined' and 'final_label' columns of the file, in that order.
    """
    path = f'../test_data_set/NLPAUG/NLPAUG_{kind}_set{set_number}_0_2_df.csv'
    return pd.read_csv(path)[['text_combined', 'final_label']]


# Sets 1-10 for each augmentation tag, in set order (list index 0 holds set 1).
nlpaug_ins_dfs = [load_nlpaug_set('ins', set_number) for set_number in range(1, 11)]
nlpaug_sub_dfs = [load_nlpaug_set('sub', set_number) for set_number in range(1, 11)]

# Individual names used by the rest of the notebook.
(nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, nlpaug_ins_set4_df,
 nlpaug_ins_set5_df, nlpaug_ins_set6_df, nlpaug_ins_set7_df, nlpaug_ins_set8_df,
 nlpaug_ins_set9_df, nlpaug_ins_set10_df) = nlpaug_ins_dfs

(nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, nlpaug_sub_set4_df,
 nlpaug_sub_set5_df, nlpaug_sub_set6_df, nlpaug_sub_set7_df, nlpaug_sub_set8_df,
 nlpaug_sub_set9_df, nlpaug_sub_set10_df) = nlpaug_sub_dfs

In [15]:
# Preview the first rows of the first 'ins' augmented set.
nlpaug_ins_set1_df.head()

Unnamed: 0,text_combined,final_label
0,common core to weed too [UNK] much ritalan che...,normal
1,took my nan to the hospital for a x s ray i tu...,normal
2,< from user > well no not really islam does no...,offensive
3,< specific user > france in < series number > ...,normal
4,i will surely not tolerate non arab women slan...,hatespeech


In [16]:
# Combine sets: original training data followed by the first N augmented copies.

# Number of augmented sets (per augmentation type) stacked onto the original
# training data. Ten sets are available; this notebook's variants used 2, 5, 7 and 10.
N_AUGMENTATIONS = 7

# Every loaded augmented set, in set order, so the selection below is a simple slice.
all_ins_sets = [nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df,
                nlpaug_ins_set4_df, nlpaug_ins_set5_df, nlpaug_ins_set6_df,
                nlpaug_ins_set7_df, nlpaug_ins_set8_df, nlpaug_ins_set9_df,
                nlpaug_ins_set10_df]

all_sub_sets = [nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df,
                nlpaug_sub_set4_df, nlpaug_sub_set5_df, nlpaug_sub_set6_df,
                nlpaug_sub_set7_df, nlpaug_sub_set8_df, nlpaug_sub_set9_df,
                nlpaug_sub_set10_df]

# Fail loudly on an out-of-range setting; a slice would otherwise silently clamp it.
if not 0 <= N_AUGMENTATIONS <= len(all_ins_sets):
    raise ValueError(
        f'N_AUGMENTATIONS must be between 0 and {len(all_ins_sets)}, got {N_AUGMENTATIONS}'
    )

ins_frames = [original_train_data_df] + all_ins_sets[:N_AUGMENTATIONS]
sub_frames = [original_train_data_df] + all_sub_sets[:N_AUGMENTATIONS]

# pd.concat keeps each frame's own row index (ignore_index is not set), so index
# values repeat across the stacked copies.
ins_train_df = pd.concat(ins_frames)
sub_train_df = pd.concat(sub_frames)