# Data Augmentation

In [None]:
# reading data
import pandas as pd
import os
# NOTE(review): nlpaug is imported here, but the `!pip install nlpaug` cell
# comes later in the notebook; on a fresh kernel where nlpaug is not already
# installed this import fails until that install cell has been run.
from nlpaug.util.file.download import DownloadUtil
# load the messages that the augmentation loop below iterates over
df = pd.read_csv("new_data.csv")
# show the loaded frame as the cell output
df

In [None]:
# library for augmentation
!pip install nlpaug

In [None]:
# installing transformers
!pip install transformers

In [None]:
# installing specific version to avoid error
!pip install gensim==4.2

In [None]:
# downloading the models
# Each download is skipped when the file the augmentation cell loads is
# already present in the working directory, so re-running the notebook does
# not fetch the archives again.
if not os.path.exists('GoogleNews-vectors-negative300.bin'):
    DownloadUtil.download_word2vec(dest_dir='.') # word2vec model
if not os.path.exists('glove.6B.300d.txt'):
    DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.') # GloVe model
if not os.path.exists('wiki-news-300d-1M.vec'):
    DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.') # fasttext model

In [None]:
# importing for augmentation
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [None]:
from google.colab import drive
# mount Google Drive at /content/drive; the augmentation cells below write to
# and read from /content/drive/MyDrive/aug
drive.mount('/content/drive')

In [None]:
# word-embedding model type -> file (in the working directory) holding its vectors
model_type_to_path_map = dict(
    word2vec = "GoogleNews-vectors-negative300.bin",
    glove = "glove.6B.300d.txt",
    fasttext = "wiki-news-300d-1M.vec",
)

# empty data holder
data_holder_list = list()

# inclusion of augmented data in corresponding row of the dataframe
def include_in_row(row, augmented_texts, augmenter_name):
  """
  Store every augmented text in `row` under the key `augmenter_name` followed
  by the position of the text (0, 1, ...).

  row: mapping-like object that supports item assignment; it is modified in place
  augmented_texts: iterable of augmented texts
  augmenter_name: string prefix for the new keys

  return: the same `row` object that was passed in
  """
  position = 0
  for augmented_text in augmented_texts:
    row[augmenter_name + str(position)] = augmented_text
    position += 1

  return row

# each augmenter will produce 2 different augmentation for each data point (n = 2)
n_augmentations = 2

# folder on the mounted drive that receives one CSV per augmented row;
# created up front so the first save does not fail on a missing directory
aug_output_dir = "/content/drive/MyDrive/aug"
os.makedirs(aug_output_dir, exist_ok = True)

# The augmenters do not depend on the row being processed, so they are built
# once here (in the order their columns are added to the row) instead of being
# re-created for every row.
# NOTE(review): all augmenters are now alive at the same time — confirm the
# runtime has enough memory for the embedding and transformer models.
augmenters = []

# Substitute word by spelling mistake words dictionary
augmenters.append(("SpellingAug", naw.SpellingAug()))

# Insert / substitute word randomly by word embeddings similarity
# model_type: word2vec, glove or fasttext
for key, value in model_type_to_path_map.items():
    for action in ("insert", "substitute"):
        augmenters.append((
            key + " " + value + " " + action,
            naw.WordEmbsAug(model_type = key, model_path = value, action = action),
        ))

# contextual augmenters as (pretrained model, action) pairs
contextual_settings = [
    ("bert-base-uncased", "insert"),
    ("bert-base-uncased", "substitute"),
    ("distilbert-base-uncased", "substitute"),
    ("roberta-base", "substitute"),
]
for model_path, action in contextual_settings:
    augmenters.append((
        "ContextualWordEmbsAug " + model_path + " " + action,
        naw.ContextualWordEmbsAug(model_path = model_path, action = action),
    ))

# synonym augmentation
augmenters.append(("SynonymAug wordnet", naw.SynonymAug(aug_src = 'wordnet')))

for index, row in df.iterrows():
    print(f"Augmenting entry: {index}")
    text = row['message']

    # every augmenter adds its augmented texts to the row under its own name
    for augmenter_name, aug in augmenters:
        augmented_texts = aug.augment(text, n = n_augmentations)
        row = include_in_row(row, augmented_texts, augmenter_name)

    # saving directly to google drive for later use
    row.to_csv(f"{aug_output_dir}/{index}.csv")

In [None]:
# folder that holds the per-row CSV files written by the augmentation loop
aug_dir = "/content/drive/MyDrive/aug/"

# reading all rows saved in drive; every file is transposed after loading
# (the same transpose the rows always went through) before being stacked
row_frames = []
for file_ in sorted(os.listdir(aug_dir)):  # sorted: same row order on every run
  if not file_.endswith(".csv"):
    # ignore anything in the folder that is not one of the saved rows
    continue
  row_frames.append(pd.read_csv(aug_dir + file_).T)

# creating a full dataframe with a single concat instead of growing it per file;
# an empty folder yields an empty dataframe
df_m = pd.concat(row_frames) if row_frames else pd.DataFrame()

In [None]:
# drop repeated rows first, then save the dataframe with the augmented text
df_m_unique = df_m.drop_duplicates()
df_m_unique.to_csv("new_aug.csv")

# Model Training

In [None]:
import tensorflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
from bs4 import BeautifulSoup
import plotly.graph_objs as go
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# load the training data and cast the message column to str
df = pd.read_csv("updated_final_df.csv")
df = df.astype({'message': str})
df

## Data Pre-processing


In [None]:
# replace symbols by space in text
# (raw string: the pattern is unchanged, but no invalid escape sequences)
replace_symbols = re.compile(r'[/(){}\[\]\|@,;]')

# remove symbols from text
bad_symbols = re.compile('[^0-9a-z #+_]')

# stopwords to remove from text; stored under its own name so the imported
# nltk `stopwords` object is not overwritten and this cell can be run again
english_stopwords = set(stopwords.words('english'))

def clean_text(text):
    """
    Lower-case the text, turn the characters matched by `replace_symbols` into
    spaces, delete the characters matched by `bad_symbols` and drop English
    stopwords.

    text: a string

    return: cleaned text, words joined by single spaces
    """
    text = text.lower()
    text = replace_symbols.sub(' ', text)
    text = bad_symbols.sub('', text)
    return ' '.join(word for word in text.split() if word not in english_stopwords)

# cleaning the data and adding to the same column
df['message'] = df['message'].apply(clean_text)
df

In [None]:
# The maximum number of words to be used
max_words = 50000

# Length every message sequence is padded to (used as maxlen for pad_sequences)
max_seq_len = 100

# Size of the embedding vectors (the misspelt name is kept because the model
# cell refers to it)
embeddign_dim = 100

# tokenizing texts
# raw string: same filter characters, without the invalid `\]` escape sequence
tokenizer = Tokenizer(num_words = max_words, filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower = True)
tokenizer.fit_on_texts(df['message'].values)
word_index = tokenizer.word_index
print(f'Unique tokens: {len(word_index)}')

In [None]:
# defining X: token-id sequences of the messages, padded with maxlen = max_seq_len
message_sequences = tokenizer.texts_to_sequences(df['message'].values)
X = pad_sequences(message_sequences, maxlen = max_seq_len)
print(f'Shape X: {X.shape}')

In [None]:
# getting Y: one indicator column per distinct value of topic_field
Y = pd.get_dummies(df['topic_field']).values
# f-string prefix added so the actual shape is printed instead of the literal text
print(f'Shape Y: {Y.shape}')

In [None]:
# newly added data were from room 0
is_newly_added = df['room_number'] == 0
newly_added_df = df[is_newly_added]
display(newly_added_df)

# getting the original data to df DataFrame
# NOTE(review): df is overwritten here, so running this cell a second time
# leaves newly_added_df empty.
df = df[~is_newly_added]
display(df)

In [None]:
# also, reading the base dataframe so that the augmented versions of a text
# can be placed in the same split as the text itself

# This is necessary: if one augmented version of an original text is used
# for training and another version of the same text ends up in the test
# set, the model evaluation is biased

# to fix this issue, the split is done on the base (non-augmented) data
df_ = pd.read_csv("base_df.csv")
df_x, df_y = df_["message"], df_["field"]

# train test split on the base data
X_train, X_test, Y_train, Y_test = train_test_split(
    df_x,
    df_y,
    test_size = 0.10,
    random_state = 42,
)

In [None]:
# number of base training texts per class (Y_train holds the "field" column
# of the base dataframe for the training split)
Y_train.value_counts()

In [None]:
# number of base test texts per class (Y_test holds the "field" column
# of the base dataframe for the test split)
Y_test.value_counts()

In [None]:
# X contains all data, including the augmented texts, after tokenization

# this function collects, for the original texts of one split, the
# transformed rows of those texts and of their augmented versions, so that
# all versions of a text end up in the same split (training or test)

def get_tranformed(transformed, original_df, augmented_df = None):
  """
  Collect the entries of `transformed` that belong to the texts of one split.

  transformed: array-like indexed by the row labels of the augmented dataframe
  original_df: one split of the base data; only its index is used
  augmented_df: dataframe with an 'original_df_index' column that links each
    row to the index of its base text; defaults to the global `df`

  return: list with `transformed[label]` for every row of the augmented
    dataframe whose 'original_df_index' is in the index of `original_df`,
    ordered by `original_df` first and by row order in the augmented
    dataframe second
  """
  if augmented_df is None:
    augmented_df = df

  # one pass over the augmented dataframe: original index -> labels of its
  # rows (instead of filtering the whole dataframe for every original index)
  rows_by_original = augmented_df.groupby('original_df_index', sort = False).groups

  # placeholder for data
  return_arr = []

  for original_index in original_df.index:
    # original indices without any row in the augmented dataframe add nothing
    for idx in rows_by_original.get(original_index, []):
      return_arr.append(transformed[idx])

  return return_arr

In [None]:
# getting all augmented version of the data into same split
# (sequences from X first, then the matching label rows from Y)
X_train_n, X_test_n = (get_tranformed(X, split) for split in (X_train, X_test))
Y_train_n, Y_test_n = (get_tranformed(Y, split) for split in (Y_train, Y_test))

print(f"# of Train data: {len(X_train_n)}")
print(f"# of Test data: {len(X_test_n)}")

In [None]:
# Now, adding the newly added data to the train set, only
for index in newly_added_df.index:
  X_train_n.append(X[index])
  Y_train_n.append(Y[index])

# transforming to numpy array
X_train_n, X_test_n, Y_train_n, Y_test_n = map(
    np.array, (X_train_n, X_test_n, Y_train_n, Y_test_n)
)

print(f"Train data shape: {X_train_n.shape}")
print(f"Test data shape: {X_test_n.shape}")

In [None]:
# checking if both set got the actual amount of data:
# every row of df ends up in the train or the test set, and the rows of
# newly_added_df (no longer part of df) were appended to the train set
expected_total = df.shape[0] + newly_added_df.shape[0]
actual_total = X_train_n.shape[0] + X_test_n.shape[0]
assert actual_total == expected_total, (
    f"train + test holds {actual_total} rows, expected {expected_total}"
)

## Simple LSTM Model

In [None]:
# defining model

model = Sequential()
# token ids -> embedding vectors; input length is the padded sequence length of X
model.add(Embedding(max_words, embeddign_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(64, dropout = 0.3, recurrent_dropout = 0.2)) # .2, .3, .4, .5
# one output unit per label column of Y instead of a hard-coded class count
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# summary() prints by itself; wrapping it in print() also printed "None"
model.summary()

In [None]:
# train hyperparameters
epochs, batch_size = 3, 32

# early stopping callback that monitors the validation loss
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 3, min_delta = 0.0001)

# 20% of the training data is used for validation
history = model.fit(
    X_train_n,
    Y_train_n,
    batch_size = batch_size,
    epochs = epochs,
    validation_split = 0.2,
    callbacks = [early_stopping],
)

In [None]:
# loss and accuracy of the model on the test split
accr = model.evaluate(X_test_n, Y_test_n)
test_loss, test_accuracy = accr[0], accr[1]
print(f'Test set >  Loss: {test_loss : 0.3f},  Accuracy: {test_accuracy : 0.3f}')

In [None]:
# function to check generated data
def test_model(model, df_name):
    """
    Predict a label for every message in the CSV file `df_name` with `model`
    and compare it with the label in the 'category' column.

    model: trained model with a `predict` method
    df_name: path of a CSV file with 'message' and 'category' columns

    Prints the number of right and wrong predictions.

    return: dictionary mapping each row index of the file to a dictionary
      with "actual_label" and "predicted_label"

    raises: ValueError when the number of labels found in `df` differs from
      the number of outputs of the model
    """
    data = pd.read_csv(df_name)

    # initial correct and wrong
    correct = 0
    wrong = 0

    result = {}

    if len(data) > 0:
        # same preprocessing as the training messages: cast to str and clean
        messages = [clean_text(str(message)) for message in data['message']]

        # transforming to get prediction; sequences are padded to max_seq_len,
        # the length the training sequences were padded to
        seq = tokenizer.texts_to_sequences(messages)
        padded = pad_sequences(seq, maxlen = max_seq_len)

        # getting prediction for all rows in one call
        pred = model.predict(padded)

        # Y was built with pd.get_dummies, so the label of output position i
        # is the i-th dummy column, not the i-th value in order of appearance
        # NOTE(review): `df` has been filtered since Y was built; the check
        # below catches the case where that removed a class
        labels = pd.get_dummies(df['topic_field']).columns.tolist()
        if len(labels) != pred.shape[-1]:
            raise ValueError(
                f"model has {pred.shape[-1]} outputs but df contains {len(labels)} labels"
            )

        predicted_positions = np.argmax(pred, axis = 1)

        # for each data point
        for (index, row), position in zip(data.iterrows(), predicted_positions):
            predict_label = labels[position]

            # adding actual labels and predicted labels
            result[index] = {
                "actual_label" : row['category'],
                "predicted_label" : predict_label
            }

            # correct if actual label is same as the predicted label
            if predict_label == row['category']:
                correct += 1
            else:
                wrong += 1

    print(f"Right: {correct}\tWrong: {wrong}")

    return result

# testing the model on holdout data
# (the returned dictionary of actual/predicted labels is the last expression
# of the cell)
test_model(model, "new_data.csv")

## Bidirectional LSTM model

In [None]:
# model architecture
model = Sequential()
# input_length is the padded sequence length (max_seq_len), not the
# vocabulary limit max_words: the training sequences are padded to max_seq_len
model.add(Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim = 100, input_length = max_seq_len))
model.add(Bidirectional(LSTM(64, dropout = 0.2, recurrent_dropout = 0.2, return_sequences = True)))
model.add(Bidirectional(LSTM(32, dropout = 0.2, recurrent_dropout = 0.2)))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
# one output unit per label column of Y instead of a hard-coded class count
model.add(Dense(Y.shape[1], activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# summary() prints by itself; wrapping it in print() also printed "None"
model.summary()

In [None]:
# hyperparameters
epochs, batch_size = 3, 32

# early stopping callback that monitors the validation loss
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 3, min_delta = 0.0001)

# 20% of the training data is used for validation
history = model.fit(
    X_train_n,
    Y_train_n,
    batch_size = batch_size,
    epochs = epochs,
    validation_split = 0.2,
    callbacks = [early_stopping],
)

In [None]:
# model performance: loss and accuracy on the test split
accr = model.evaluate(X_test_n, Y_test_n)
test_loss, test_accuracy = accr[0], accr[1]
print(f'Test set >  Loss: {test_loss : 0.3f},  Accuracy: {test_accuracy : 0.3f}')

In [None]:
# testing the model on the holdout data
# (`model` is the bidirectional LSTM defined and trained in the cells above)
test_model(model, "new_data.csv")

# Wordcloud visualization

In [None]:
!pip install tueplots

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from tueplots import bundles
# apply the settings of the tueplots icml2022 bundle to matplotlib
plt.rcParams.update(bundles.icml2022())
import tueplots.constants.color.palettes as tue_palettes

import matplotlib as mpl
# NOTE(review): this writes matplotlib's default settings back into rcParams,
# which looks like it undoes the bundle applied above — confirm this is intended
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# augmented dataframe (at this point `df` no longer contains the room 0 rows,
# which were moved to newly_added_df earlier)
df

In [None]:
# get raw train data after split: rows whose original text is in the train split
# (vectorised isin instead of a per-row membership test)
train_raw = df[df['original_df_index'].isin(X_train.index)]
display(train_raw)

# get raw test data after split: rows whose original text is in the test split
test_raw = df[df['original_df_index'].isin(X_test.index)]
display(test_raw)

In [None]:
# creating a long string out of texts, on the whole dataset
long_string = ','.join(df['message'].tolist())

# wordcloud object
wordcloud_options = dict(
    background_color = "white",
    max_words = 5000,
    contour_width = 3,
    contour_color = 'steelblue',
)
wordcloud = WordCloud(**wordcloud_options)

# generating a word cloud
wordcloud.generate(long_string)

# visualizing it
wordcloud.to_image()

In [None]:
# on training dataset: all training messages joined into one long string
long_string = ','.join(train_raw['message'].tolist())

# wordcloud object
wordcloud_options = dict(
    background_color = "white",
    max_words = 5000,
    contour_width = 3,
    contour_color = 'steelblue',
)
wordcloud = WordCloud(**wordcloud_options)

# generating a word cloud
wordcloud.generate(long_string)

# visualizing it
wordcloud.to_image()

In [None]:
# on test dataset: all test messages joined into one long string
long_string = ','.join(test_raw['message'].tolist())

# wordcloud object
wordcloud_options = dict(
    background_color = "white",
    max_words = 5000,
    contour_width = 3,
    contour_color = 'steelblue',
)
wordcloud = WordCloud(**wordcloud_options)

# generating a word cloud
wordcloud.generate(long_string)

# visualizing it
wordcloud.to_image()