In [None]:
import re
import spacy
import codecs
import unidecode
import pandas as pd
import numpy as np
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
nlp = spacy.load('en_core_web_sm')

#### Steps in tweet cleaning:
* Decoding: unicode_escape for extra “\” before unicode character, then unidecode
* Apostrophe handling: people write contractions with two different characters, the typographic apostrophe “’” and the straight single quote “'”. If both forms appear, contractions cannot be matched reliably against the expansion table, so every “’” is replaced with the straight single quote “'” before the lookup
* Contraction check: check if there’s any contracted form, and replace it with its original form
* Parsing: done with Spacy
* Filtering punctuation, white space, numbers, URL using Spacy methods while keeping the text content of hashtag intact
* Removed @mention
* Special character removal
* Single-character token removal (tokens with fewer than two letters left after special character removal are dropped)
* Spell correction: a simple correction for repeated characters such as “sooooo goooood”. If the same character is repeated more than two times, the repetition is shortened to two. For example, “sooooo goooood” is transformed to “soo good”. This is not a perfect solution, since “soo” is still not a correct spelling after the correction. But it at least reduces the feature space by mapping “sooo”, “soooo” and “sooooo” to the same word, “soo”

In [None]:
# Lookup table from a contracted form (written with the straight single quote "'")
# to its expanded form. Keys are case-sensitive; only "I..." forms are listed in
# both capitalised and lower-case spelling. Used by spacy_cleaner, which looks up
# each space-separated token of a tweet in this table.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "cant": "cannot",
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def spacy_cleaner(text):
    """Normalise one raw tweet into a space-separated string of alphabetic tokens.

    Steps, in order:
      1. Decode backslash escapes with the ``unicode_escape`` codec and
         transliterate to ASCII with ``unidecode``; if decoding fails, only
         transliterate.
      2. Replace the typographic apostrophe with the straight single quote.
      3. Expand contractions token by token using ``contraction_mapping``.
      4. Parse with the module-level spaCy pipeline ``nlp`` and drop tokens
         flagged as punctuation, whitespace, number-like or URL-like, and
         tokens starting with ``@``.
      5. Strip every non-letter character from the remaining tokens and keep
         only tokens that still have more than one character.
      6. Collapse any run of a repeated character to exactly two occurrences.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN for a missing tweet) are
        treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet; an empty string when nothing survives the filters.
    """
    # A missing tweet arrives as a float NaN when applied over a DataFrame
    # column; it cannot be decoded, so report it as "nothing left to keep".
    if not isinstance(text, str):
        return ""

    try:
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
    except Exception:
        # Best effort: text that is not valid escape syntax (for example a
        # trailing backslash) is transliterated as-is. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C still interrupts a long run.
        decoded = unidecode.unidecode(text)

    apostrophe_handled = decoded.replace("’", "'")

    def _expand(token):
        # Exact match first so the cased "I'm" -> "I am" entries win, then a
        # lower-cased lookup so "Don't" / "CAN'T" are expanded as well.
        if token in contraction_mapping:
            return contraction_mapping[token]
        return contraction_mapping.get(token.lower(), token)

    expanded = ' '.join(_expand(token) for token in apostrophe_handled.split(" "))

    final_tokens = []
    for t in nlp(expanded):
        if t.is_punct or t.is_space or t.like_num or t.like_url or str(t).startswith('@'):
            continue
        # Keeps the word of a hashtag while dropping '#', digits and symbols.
        sc_removed = re.sub("[^a-zA-Z]", '', str(t))
        if len(sc_removed) > 1:
            final_tokens.append(sc_removed)

    joined = ' '.join(final_tokens)
    # "sooooo" -> "soo": any run of the same character is cut down to two.
    return re.sub(r'(.)\1+', r'\1\1', joined)

In [None]:
# NOTE(review): absolute, machine-specific path; the notebook only runs on a
# machine that has the dataset at exactly this location.
dataset_path = 'C:/Users/User/Desktop/FINAL-THESIS/en_dataset.csv'
df = pd.read_csv(dataset_path)

# Quick sanity check of what was loaded: dimensions, then the first 15 rows.
print("Shape of dataframe: ", df.shape)
df.head(15)

In [None]:
# Separating out each of the labels to different words
def split_sentiment_outputs(output_label, sentiment_col="sentiment"):
    """Store the '_'-separated parts of ``sentiment_col`` in column ``output_label``.

    Operates on the module-level DataFrame ``df`` in place: every row of the
    new (or overwritten) column ``output_label`` holds the list produced by
    splitting that row's ``sentiment_col`` string on underscores.
    """
    label_parts = df[sentiment_col].str.split('_')
    df[output_label] = label_parts
    

# Converting the dataframe to have one column for each of the labels.
def transform_data_for_multilabel(output_label):
    """Turn the list-valued column ``output_label`` of ``df`` into a 0/1 indicator.

    Operates on the module-level DataFrame ``df`` in place. Each cell of
    ``df[output_label]`` is expected to hold the list of label names written by
    ``split_sentiment_outputs``; it is replaced by 1 when ``output_label`` is
    one of those names and by 0 otherwise.

    Compared with a cell-by-cell loop, an empty label list now yields 0 instead
    of leaving the list in place, and the column ends up with an integer dtype.

    Raises
    ------
    TypeError
        If a cell is not a container (e.g. NaN, or an int because the column
        was already converted by an earlier call).
    """
    def _has_label(labels):
        # Membership test replaces the manual scan through the list.
        return int(output_label in labels)

    # astype(int) keeps the dtype numeric even when the frame has no rows.
    df[output_label] = df[output_label].map(_has_label).astype(int)
            
# Names of the target labels; one indicator column is created per name.
output_labels = [
    "abusive",
    "hateful",
    "offensive",
    "disrespectful",
    "fearful",
    "normal",
]
n_labels = len(output_labels)

In [None]:
# Step 1: give every label its own column holding the split sentiment parts.
for label in output_labels:
    split_sentiment_outputs(label)

# Step 2: collapse each of those columns into a 0/1 indicator.
for label in output_labels:
    transform_data_for_multilabel(label)

# Drop the annotation columns that are not used further, then the 'sentiment'
# column (its information now lives in the label columns), then exact
# duplicate rows.
df = (
    df
    .drop(columns=['directness', 'annotator_sentiment', 'target', 'group', 'HITId'])
    .drop(columns=['sentiment'])
    .drop_duplicates()
)

# Cleaned text of every tweet.
df['tweet_cleaned'] = df['tweet'].apply(spacy_cleaner)

# Number of space-separated words in the cleaned tweet.
df['tweet_length'] = df['tweet_cleaned'].map(lambda cleaned: len(cleaned.split(' ')))

# Discard tweets that are empty after cleaning, then rows with missing values.
empty_tweet_filter = df['tweet_cleaned'] != ""
df = df[empty_tweet_filter].dropna()

# The raw 'tweet' column is intentionally kept:
#df = df.drop(columns=['tweet'])

print("Shape of dataframe: ", df.shape)
df.head(15)

In [None]:
print("Value Counts of Categories of the whole dataset:\n")
# Select the label columns by name (output_labels) rather than by the
# positional slice df.columns[1:7], which only works while exactly one column
# precedes the six label columns.
for category in output_labels:
    print(f"{category}: {df[category].sum()}")


# Plot the distribution of target labels in the dataset
df2 = df[output_labels]
fig_size = [10, 8]
plt.rcParams["figure.figsize"] = fig_size
ax = df2.sum(axis=0).plot.bar()
ax.set(title="Label distribution (whole dataset)", xlabel="label", ylabel="number of tweets")
plt.show()



In [None]:
#Data splitting - TRAIN-TEST-DEV-> 8:1:1
def split_df(df):
    """Split ``df`` into two consecutive halves of equal length.

    When ``df`` has an odd number of rows the last row is discarded so that
    both halves have the same size. Row order and index labels are preserved.

    The halves are taken with positional slicing instead of
    ``np.array_split``, which on a DataFrame goes through the deprecated
    ``DataFrame.swapaxes`` code path in recent pandas releases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First half and second half, each with ``len(df) // 2`` rows.
    """
    half = len(df) // 2
    # The upper bound 2 * half drops the trailing row of an odd-length frame.
    return df.iloc[:half, :], df.iloc[half:2 * half, :]

In [None]:
# 80 % of the rows go to training; the remaining 20 % are halved into test and
# validation by split_df.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=2021)
df_test, df_val = split_df(df_test)

for split_frame in (df_train, df_test, df_val):
    print(split_frame.shape)

# Per-label positive counts for each split. Label columns are selected by name
# (output_labels) rather than by the positional slice columns[1:7], which
# depends on the column order left behind by earlier cells.
for split_name, split_frame in (
    ("Training set", df_train),
    ("Val set", df_val),
    ("Test set", df_test),
):
    print(f"\n Value Counts of {split_name} Categories before augmentation and after split:\n")
    for category in output_labels:
        print(f"{category}: {split_frame[category].sum()}")

Since the data is imbalanced, with fewer text examples for some of the labels (e.g. fearful, abusive, disrespectful), we will try to increase the number of examples for each of these labels with text augmentation. The main technique is back-translation; the code below additionally substitutes words using BERT contextual embeddings.

The back-translation process works in the following way:
* Take some sentence and translate to another language
* Translate the output sentence back to original language
* Check if the new sentence is different from the original sentence. If it is, then we use this new sentence as an augmented version of the original text.

In [None]:
# Back-translation augmenter: English -> German -> English.
back_translation_aug = naw.BackTranslationAug(from_model_name='facebook/wmt19-en-de', to_model_name='facebook/wmt19-de-en')

# Contextual augmenter that substitutes words using bert-base-uncased.
sub_word_with_bert_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

def generate_augmented_text_back_translate(text):
    """Return a back-translated variant of ``text``, or ``text`` itself on failure.

    Uses the module-level ``back_translation_aug`` augmenter. Augmentation is
    best effort: any error raised by the augmenter results in the original
    text being returned unchanged.

    Parameters
    ----------
    text : str
        Sentence to augment.

    Returns
    -------
    str
        The augmented sentence, or ``text`` when augmentation failed or
        produced nothing.
    """
    try:
        augmented_text = back_translation_aug.augment(text)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C can still stop a
        # long augmentation run.
        return text
    # Some nlpaug releases return a list of results instead of a single
    # string; unwrap it so success and failure paths both return a string.
    if isinstance(augmented_text, (list, tuple)):
        return augmented_text[0] if augmented_text else text
    return augmented_text


def generate_augmented_text_sub_words_with_bert(text):
    """Return ``text`` with words substituted by the BERT augmenter, or ``text`` on failure.

    Uses the module-level ``sub_word_with_bert_aug`` augmenter. Augmentation is
    best effort: any error raised by the augmenter results in the original
    text being returned unchanged.

    Parameters
    ----------
    text : str
        Sentence to augment.

    Returns
    -------
    str
        The augmented sentence, or ``text`` when augmentation failed or
        produced nothing.
    """
    try:
        augmented_text = sub_word_with_bert_aug.augment(text)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C can still stop a
        # long augmentation run.
        return text
    # Some nlpaug releases return a list of results instead of a single
    # string; unwrap it so success and failure paths both return a string.
    if isinstance(augmented_text, (list, tuple)):
        return augmented_text[0] if augmented_text else text
    return augmented_text

In [None]:
# Show one before/after pair per augmentation technique on the same sentence.
for example_title, augment_fn in (
    ('Back-Translation Augmentation Example:', generate_augmented_text_back_translate),
    ('Subtitute words via BERT embeddings Augmentation Example:', generate_augmented_text_sub_words_with_bert),
):
    print(example_title)
    text = "Its a pretty horrible time to be a sportsperson at the moment."
    augmented_text = augment_fn(text)
    print("Original:")
    print(text)
    print()
    print("Augmented Text:")
    print(augmented_text)

In [None]:
# Functions to augment the examples of one label, one per augmentation technique.
# Only tweets with more than 5 words are augmented: augmenting tweets with
# 2 or 3 words doesn't produce good augmentation results.
def _augment_category(category_label, df, augment_fn, min_words=5):
    """Build augmented copies of the rows of ``df`` that carry ``category_label``.

    Rows are selected when ``category_label`` is 1, ``offensive`` is 0 and
    ``tweet_length`` is greater than ``min_words``. Each selected row is copied
    unchanged except for ``tweet_cleaned``, which is replaced by
    ``augment_fn(tweet_cleaned)``.

    The result is assembled in one step rather than with one
    ``DataFrame.append`` per row: ``append`` no longer exists in pandas 2.x and
    growing a frame row by row is quadratic.

    Returns a DataFrame with the columns of ``df`` and a fresh 0..n-1 index;
    it is empty when no row qualifies.
    """
    selected = df[
        (df[category_label] == 1)
        & (df['offensive'] == 0)
        & (df['tweet_length'] > min_words)
    ]

    augmented_texts = []
    for text in tqdm(selected['tweet_cleaned']):
        augmented = augment_fn(text)
        # Some nlpaug releases hand back a list of results; store a plain
        # string so later drop_duplicates() never meets an unhashable list.
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else text
        augmented_texts.append(augmented)

    return selected.assign(tweet_cleaned=augmented_texts).reset_index(drop=True)


def augment_category_with_back_translate(category_label: str, df):
    """Return back-translated copies of the rows of ``df`` labelled ``category_label``.

    Only rows with ``category_label == 1``, ``offensive == 0`` and more than 5
    words are augmented. The input frame is not modified.
    """
    return _augment_category(category_label, df, generate_augmented_text_back_translate)


def augment_category_with_bert_embeddings_subs(category_label: str, df):
    """Return BERT word-substituted copies of the rows of ``df`` labelled ``category_label``.

    Only rows with ``category_label == 1``, ``offensive == 0`` and more than 5
    words are augmented. The input frame is not modified.
    """
    return _augment_category(category_label, df, generate_augmented_text_sub_words_with_bert)


In [None]:
# Grow the training set with augmented copies of the under-represented labels.
# NOTE(review): each iteration augments from the already enlarged df_train, so
# an augmented row that also carries a later label is augmented again, and
# re-running this cell augments once more. Confirm this is intended.
for label in ['fearful', 'abusive', 'disrespectful']:
    temp_df1 = augment_category_with_back_translate(label, df_train)
    temp_df2 = augment_category_with_bert_embeddings_subs(label, df_train)
    # Rows whose augmentation returned the original text are exact copies of
    # existing rows and disappear again in drop_duplicates().
    df_train = pd.concat([df_train, temp_df1, temp_df2], ignore_index=True).drop_duplicates()

In [None]:
# Row/column count of the training frame after the augmented rows were added.
print("Shape of Training Set After Augmentation: ", df_train.shape)


In [1]:
for split_frame in (df_train, df_test, df_val):
    print(split_frame.shape)

# Per-label positive counts for each split. Label columns are selected by name
# (output_labels) rather than by the positional slice columns[1:7], which
# depends on the column order left behind by earlier cells.
for split_name, split_frame in (
    ("Training set", df_train),
    ("Val set", df_val),
    ("Test set", df_test),
):
    print(f"\n Value Counts of {split_name} Categories AFTER augmentation and after split:\n")
    for category in output_labels:
        print(f"{category}: {split_frame[category].sum()}")


# Plot the distribution of target labels in the augmented training set
df3 = df_train[output_labels]
fig_size = [10, 8]
plt.rcParams["figure.figsize"] = fig_size
ax = df3.sum(axis=0).plot.bar()
ax.set(title="Label distribution (training set after augmentation)", xlabel="label", ylabel="number of tweets")
plt.show()

NameError: name 'df_train' is not defined

We can see that the number of examples for the labels 'fearful', 'abusive' and 'disrespectful' has increased, and the data as a whole is now somewhat more balanced for training.

In [None]:
# 'tweet_length' was only needed to pick tweets for augmentation; remove it
# before the splits are written out.
df_train = df_train.drop('tweet_length', axis=1)
df_test = df_test.drop('tweet_length', axis=1)
df_val = df_val.drop('tweet_length', axis=1)

# NOTE(review): absolute, machine-specific output paths.
for split_frame, csv_path in (
    (df_train, 'C:/Users/User/Desktop/FINAL-THESIS/training_set_en_dataset_aug.csv'),
    (df_test, 'C:/Users/User/Desktop/FINAL-THESIS/test_set_en_dataset.csv'),
    (df_val, 'C:/Users/User/Desktop/FINAL-THESIS/val_set_en_dataset.csv'),
):
    split_frame.to_csv(csv_path, index=False)