In [None]:
import re
import swifter
import numpy as np
import pandas as pd
from typing import List, Union
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling1D, GRU, Bidirectional

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds the three dataset CSVs; edit this one line to relocate the data.
DATA_DIR = 'C:/Users/User/Desktop/FINAL-THESIS'

# Each split is read from the file whose name matches it, so the validation
# frame comes from the val_set file and the test frame from the test_set file.
df_train = pd.read_csv(f'{DATA_DIR}/training_set_en_dataset_without_aug.csv')
df_val = pd.read_csv(f'{DATA_DIR}/val_set_en_dataset_without_aug.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test_set_en_dataset_without_aug.csv')

In [None]:
# Names of the target label columns; their order fixes the column order of
# every label matrix and the width of the model's output layer.
output_labels = [
    "abusive",
    "hateful",
    "offensive",
    "disrespectful",
    "fearful",
    "normal",
]
n_labels = len(output_labels)

def discard_empty_tweets(df):
    """Remove rows with an empty tweet or any missing value from ``df``.

    The frame is filtered in place because callers invoke this function
    without using its return value; rebinding a local name would leave the
    caller's frame untouched.

    Args:
        df: DataFrame with a ``"tweet"`` column.

    Returns:
        The same ``df`` object, after the unusable rows have been removed.
    """
    # A row is unusable when its tweet is the empty string or when any of its
    # columns is missing (the same rows `dropna()` would discard).
    unusable = (df["tweet"] == "") | df.isna().any(axis=1)
    # The drop is label-based, so it assumes unique index labels
    # (true for a frame freshly loaded with `pd.read_csv`).
    df.drop(index=df.index[unusable], inplace=True)
    return df


# Run the empty-tweet cleanup over every split.
for split_frame in (df_train, df_val, df_test):
    discard_empty_tweets(split_frame)

# Report the split sizes in train, test, validation order.
for split_frame in (df_train, df_test, df_val):
    print(split_frame.shape)

In [None]:
def transform_to_int(df, columns=("abusive", "hateful", "offensive",
                                  "disrespectful", "fearful", "normal")):
    """Cast the label columns of ``df`` to integers, in place.

    Uses the builtin ``int`` rather than ``np.int``: the latter was only an
    alias for ``int`` and has been removed from NumPy.

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Names of the columns to cast. Defaults to the six label
            columns used in this notebook.

    Returns:
        The same ``df`` object, with the listed columns cast to an integer dtype.
    """
    for column in columns:
        df[column] = df[column].astype(int)
    return df

# Cast the label columns of every split to integers.
for split_frame in (df_train, df_val, df_test):
    transform_to_int(split_frame)

# Store each training row's label vector (in `output_labels` order) as one
# array-valued cell.
label_rows = df_train[output_labels].values
df_train['one_hot_labels'] = [label_row for label_row in label_rows]

In [None]:
# Plot the distribution of target labels in the training set.
plt.rcParams["figure.figsize"] = (10, 8)

# Column-wise sum of the label columns = number of training rows carrying
# each label. The label columns are taken from `df_train`, the frame these
# columns are defined on in this notebook.
ax = df_train[output_labels].sum(axis=0).plot.bar()
ax.set_title('label distribution (training set)')
ax.set_xlabel('label')
ax.set_ylabel('count')
plt.show()

In [None]:
# Identify the row indices whose 'one_hot_labels' combination occurs only once.
# `drop=True` renumbers the rows without inserting the old index as a column,
# which keeps this cell safe to re-run.
df_train = df_train.reset_index(drop=True)

# The label vectors are arrays (unhashable), so their string form is used as
# the grouping key; it is computed once and reused below.
label_keys = df_train.one_hot_labels.astype(str)
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts == 1].keys()
one_freq_idxs = sorted(df_train[label_keys.isin(one_freq)].index, reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

In [None]:
# Model inputs: the cleaned tweet text of each split as an array.
train_sentences, test_sentences, val_sentences = (
    split_frame["tweet_cleaned"].values
    for split_frame in (df_train, df_test, df_val)
)

# Model targets: the label columns of each split, in `output_labels` order.
train_y, test_y, val_y = (
    split_frame[output_labels].values
    for split_frame in (df_train, df_test, df_val)
)

In [None]:
# Raw strings go in; the TF-Hub preprocessing layer turns them into the
# inputs the BERT encoder expects.
text_input = Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=True)  # the encoder weights are fine-tuned, not frozen
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768] (H-768 encoder).
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

# Classification head: average the token vectors, regularise, then one
# independent sigmoid per label (multi-label output).
x = GlobalAveragePooling1D()(sequence_output)
x = Dropout(0.3)(x)
output = Dense(n_labels, activation='sigmoid', name='outputs')(x)

# Same schedule the legacy `decay=1e-6` argument produced:
# lr = 1e-4 / (1 + 1e-6 * step). Expressed as a schedule because the `lr` and
# `decay` optimizer arguments are deprecated/removed in current Keras.
learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-4, decay_steps=1, decay_rate=1e-6)

model = Model(inputs=text_input, outputs=output)
# The order of `metrics` matters: later cells read `model.evaluate` results
# by position (loss first, then these metrics in this order).
# NOTE(review): AUC is evaluated at only two thresholds (0 and 0.5), which
# gives a very coarse curve — confirm this is intended.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall'),
                       tf.keras.metrics.AUC(name='auc',multi_label=True, thresholds=[0, 0.5]),
                       tf.keras.metrics.TruePositives(name='TP'),
                       tf.keras.metrics.TrueNegatives(name='TN'),
                       tf.keras.metrics.FalsePositives(name='FP'),
                       tf.keras.metrics.FalseNegatives(name='FN')])
# `summary()` prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Save a checkpoint only when the monitored quantity improves
# (ModelCheckpoint's default monitor is the validation loss).
checkpointer = ModelCheckpoint(filepath="../../FINAL-THESIS/BERT_weights_model_final_without_aug.hdf5", 
                               verbose=1, save_best_only=True)

# `Model.fit` takes the inputs and targets as `x` and `y`.
history = model.fit(x=train_sentences, y=train_y, epochs=4,
                    batch_size=32, callbacks=[checkpointer],
                    validation_data=(val_sentences, val_y))

In [None]:
# Metric plots: one figure per metric, training curve against validation curve.
# The second curve comes from the `val_*` history entries (the data passed as
# `validation_data` to `fit`), so it is labelled 'validation'.
for metric in ('accuracy', 'loss', 'precision', 'auc'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
#model.load_weights(filepath='../../NN_weights.hdf5')

In [None]:
# Final metrics are computed on the held-out test split; scoring the data the
# model was fitted on would not measure generalisation.
results = model.evaluate(test_sentences, test_y, batch_size=64)

In [None]:
# Initializing metrics.
# `results` is positional: index 0 is the loss, followed by the metrics in the
# order given to `model.compile`
# (accuracy, precision, recall, auc, TP, TN, FP, FN).
precision, recall = results[2], results[3]
precision_recall_sum = precision + recall
# F1 is the harmonic mean of precision and recall; it is reported as 0.0 when
# both are zero instead of raising ZeroDivisionError.
F1_score = 2 * (precision * recall) / precision_recall_sum if precision_recall_sum else 0.0
AUC = results[4]
TP = results[5]
TN = results[6]
FP = results[7]
FN = results[8]


# Printing metrics results.
print("F1 SCORE: ", F1_score)
print("AUC SCORE: ", AUC)
print("True Positives: ", TP)
print("True Negatives: ", TN)
print("False Positives: ", FP)
print("False Negatives: ", FN)

In [None]:
# Predict on the held-out test split and binarise each sigmoid output at 0.5.
y_pred = model.predict(test_sentences)
y_pred = np.where(y_pred > 0.5, 1, 0)

# Ground-truth labels for the same split, row-aligned with `y_pred`.
y_true = test_y

In [None]:
# Per-label precision / recall / F1. `classification_report` is already
# imported in the notebook's import cell, so it is not re-imported here.
print(classification_report(y_true, y_pred, target_names=output_labels))