# Machine Learning and Content Analytics – Mini Project

# Offensive language and hate speech detection 
---
> Students: `Arkoumani Georgia - p2822104` `Poulou Myrto - p2822129` `Koutsodimitropoulou Anastasia - p2822119` `Zaragka Eftychia - p2822112` <br />
> Professor: Haris Papageorgiou (xaris@ilsp.gr) <br />
> Assistant responsible for this assignment: George Perakis (gperakis@aueb.gr) <br />
> Department of Management Science and Technology <br />
> Athens University of Economics and Business <br />
> Date: 28/08/2022

In [48]:
import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tokenization
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [50]:
zf = zipfile.ZipFile('more_cleaned_df.zip') 
df = pd.read_csv('more_cleaned_df.csv')
df = df.dropna()
df

Unnamed: 0,category,lemmatized
0,0,yes sort remind eld lady play movie titanic te...
1,0,lady buy gun learn use effectively kill mother...
2,0,amazing dad not forget girl crushs girl mom as...
3,0,delon love turkey brave turk indian muslim sha...
4,0,here thing person earth decide not matter feel...
...,...,...
1381257,1,know woman single
1381258,1,woman want mother say want patriarchal society
1381259,1,woman submissive man
1381260,1,woman essentially childlike unable understand ...


In [51]:
from sklearn.utils import shuffle

# Extra shuffle (can be skipped because it is performed in StratifiedKFold)
df = shuffle(df)

In [52]:
# Initial DataFrame is separated into two sub-DataFrames. One containing all positive records and one containing
# all negative records
df_positive = df[df['category']==1]
df_negative = df[df['category']==0]

In [53]:
# The chunk size of each DataFrame is initialized. If chunk size exceeds the available positive records
# system retrieves as many positive records as can be found.
data_length = 100000

chunk_size = int(data_length / 2)

df_positive_no = chunk_size if len(df_positive) > chunk_size else len(df_positive)

In [54]:
# Here both sub-DataFrames repeat the process of shuffling.
df_negative = shuffle(df_negative)
df_negative = df_negative[:chunk_size]

df_positive = shuffle(df_positive)
df_positive = df_positive[:df_positive_no] 

In [55]:
frames = [df_positive, df_negative]

df = pd.concat(frames, ignore_index=True)

df

Unnamed: 0,category,lemmatized
0,1,farleftist dickbag like jon stewart comedian f...
1,1,hoe house tony spot mf
2,1,prefer jameis winston speak tv damn coon
3,1,bitch smell like fart holdin th grade
4,1,horse shit not vandalize page idiot not vandal...
...,...,...
99995,0,get rid land line month miss get call day tele...
99996,0,not aca add customer national insurance compan...
99997,0,news globe will not report british parliament ...
99998,0,not agree mean prejudice agree mean open minde...


In [56]:
df[df['category']==0]

Unnamed: 0,category,lemmatized
50000,0,question obama inform august
50001,0,old remember large panic teach school ozone la...
50002,0,try tell people not judgment good luck
50003,0,welfare bum wait pay cheque not respond hard w...
50004,0,sadly probably right thing africa go bad gay p...
...,...,...
99995,0,get rid land line month miss get call day tele...
99996,0,not aca add customer national insurance compan...
99997,0,news globe will not report british parliament ...
99998,0,not agree mean prejudice agree mean open minde...


In [57]:
df[df['category']==1]

Unnamed: 0,category,lemmatized
0,1,farleftist dickbag like jon stewart comedian f...
1,1,hoe house tony spot mf
2,1,prefer jameis winston speak tv damn coon
3,1,bitch smell like fart holdin th grade
4,1,horse shit not vandalize page idiot not vandal...
...,...,...
49995,1,screw want
49996,1,sucka hoesucka hoe succsess
49997,1,bitch suck ball
49998,1,hear cunt smell like alley crab factory talk


In [58]:
df_positive = df[df['category']==1]
df_negative = df[df['category']==0]

In [59]:
len(df_positive)

50000

In [60]:
len(df_negative)

50000

In [61]:
df_positive = df[df['category']==1]
df_negative = df[df['category']==0]

# It is calculated the number of positive records to be created through augmentation process.
aug_sentences_to_build_no = len(df_negative) - len(df_positive)

In [62]:
# Augmentation Process
# If there is no record to get created a Exception is raised in order to stop the execution.
# If there are records to be created system proceeds to their creation.
if aug_sentences_to_build_no <= 0:
    raise Exception('No need for data augmentation (oversampling). Downsampling to [Class 1] is applied.')
else:
    print(f'{aug_sentences_to_build_no} records of [Class 1] are going to be created.')
    import nlpaug.augmenter.word as naw

# SMOTE might lose some of the contextual meanings, which is possibly important for BERT and less so for other simpler models.
# Another way of doing class balancing is increasing the number of samples by random word dropout, or synonym replacement. 
# nlpaug or textaugment python packages are ideal for such a work. In this case scenario sysnonym replacement is applied using
# nlpaug python package.
aug = naw.SynonymAug(aug_src='wordnet', lang='eng')

dict_positive_build = []

for _ in range(0, aug_sentences_to_build_no):
    df_positive_random_row = df_positive.sample(n=1)
    dict_positive_build.append({'category': 1, 'lemmatized': aug.augment(df_positive_random_row.iloc[0]['lemmatized'])[0]})
    
df_produced = pd.DataFrame(dict_positive_build)

frames = [df_positive, df_produced]

df_positive_new = pd.concat(frames, ignore_index=True)

frames = [df_positive_new, df_negative]

df = pd.concat(frames, ignore_index=True)

Exception: No need for data augmentation (oversampling). Downsampling to [Class 1] is applied.

In [63]:
df

Unnamed: 0,category,lemmatized
0,1,farleftist dickbag like jon stewart comedian f...
1,1,hoe house tony spot mf
2,1,prefer jameis winston speak tv damn coon
3,1,bitch smell like fart holdin th grade
4,1,horse shit not vandalize page idiot not vandal...
...,...,...
99995,0,get rid land line month miss get call day tele...
99996,0,not aca add customer national insurance compan...
99997,0,news globe will not report british parliament ...
99998,0,not agree mean prejudice agree mean open minde...


In [64]:
df[df['category']==1]

Unnamed: 0,category,lemmatized
0,1,farleftist dickbag like jon stewart comedian f...
1,1,hoe house tony spot mf
2,1,prefer jameis winston speak tv damn coon
3,1,bitch smell like fart holdin th grade
4,1,horse shit not vandalize page idiot not vandal...
...,...,...
49995,1,screw want
49996,1,sucka hoesucka hoe succsess
49997,1,bitch suck ball
49998,1,hear cunt smell like alley crab factory talk


In [65]:
df[df['category']==0]

Unnamed: 0,category,lemmatized
50000,0,question obama inform august
50001,0,old remember large panic teach school ozone la...
50002,0,try tell people not judgment good luck
50003,0,welfare bum wait pay cheque not respond hard w...
50004,0,sadly probably right thing africa go bad gay p...
...,...,...
99995,0,get rid land line month miss get call day tele...
99996,0,not aca add customer national insurance compan...
99997,0,news globe will not report british parliament ...
99998,0,not agree mean prejudice agree mean open minde...


In [66]:
# Split dataset to traing and test data
xtrain, xtest, ytrain, ytest = train_test_split(df['lemmatized'], df['category'], random_state=42, test_size=0.3)
df_train = pd.DataFrame(xtrain)
label = preprocessing.LabelEncoder()
y = label.fit_transform(ytrain)
y = to_categorical(y)

# we create a BERT embedding layer by importing the BERT model from hub.KerasLayer
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# bert_en_uncased_L-12_H-768_A-12_4.tar.gz
bert_layer = hub.KerasLayer(m_url, trainable=True)

In [67]:
# we create a BERT vocab_file in the form a numpy array. We then set the text to lowercase
# and finally we pass our vocab_file and do_lower_case variables to the Tokenizer object.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [68]:
# Encoding the text
# we create a BERT vocab_file in the form a numpy array. We then set the text to lowercase and 
# finally we pass our vocab_file and do_lower_case variables to the Tokenizer object.

def bert_encode(texts, tokenizer, max_len=512) -> tuple:
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        text = tokenizer.tokenize(text)
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [69]:
# Build The Model

# Now we are all set to create our model. To do so, we will create a function named build_model that having tf.keras.models.Model class. 
# Inside the function we will define our model layers. Our model will consist of three Dense neural network layers and also dropout layer. 
# We have chosen a learning rate to 2e-5.

# RELU function :- With default values, this returns max(x, 0), the element-wise maximum of 0 and the input tensor. 
# Modifying default parameters allows you to use non-zero thresholds, change the max value of the activation, 
# and to use a non-zero multiple of the input for values below the threshold.

# Softmax function :- Softmax converts a real vector to a vector of categorical probabilities. 
# The elements of the output vector are in range (0, 1) and sum to 1. Each vector is handled independently. 
# The axis argument sets which axis of the input the function is applied along. 
# Softmax is often used as the activation for the last layer of a classification network because 
# the result could be interpreted as a probability distribution. 
# The softmax of each vector x is computed as exp(x) / tf.reduce_sum(exp(x)).

# Binary corssentropy:- Computes the cross-entropy loss between true labels and predicted labels. 
# We can use this cross-entropy loss when there are only two label classes (assumed to be 0 and 1). 
# For each example, there should be a single floating-point value per prediction.

def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(2, activation='softmax')(lay)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [70]:
# Finding max length of documents
# len(max(df['lemmatized'].tolist(), key=len))

In [71]:
# Finding max length of documents
# Here we check only the first 128 characters of each text, and also we set train-test input and train labels
max_len = 128
train_input = bert_encode(xtrain, tokenizer, max_len=max_len)
test_input = bert_encode(xtest, tokenizer, max_len=max_len)
train_labels = y

labels = label.classes_
labels

model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',       

In [72]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_sh = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=2,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1
)

Epoch 1/2
Epoch 1: val_accuracy improved from -inf to 0.90850, saving model to model.h5
Epoch 2/2
Epoch 2: val_accuracy improved from 0.90850 to 0.91629, saving model to model.h5


In [73]:
path = os.path.abspath('')
# save the model to disk
filename = os.path.join(path, 'bert_finalized_model')

model.save(filename)



INFO:tensorflow:Assets written to: C:\Users\Faey\Desktop\FINAL NOTEBOOKS\bert_finalized_model\assets


INFO:tensorflow:Assets written to: C:\Users\Faey\Desktop\FINAL NOTEBOOKS\bert_finalized_model\assets


In [74]:
# Load the model
# loaded_model = pickle.load(open(filename, 'rb'))
loaded_model = tf.keras.models.load_model(filename)

In [75]:
label = preprocessing.LabelEncoder()
y_t = label.fit_transform(ytest)
y_t = to_categorical(y_t)

test_labels = y_t

In [76]:
predictions = loaded_model.predict(test_input)

In [77]:
y_pred=np.argmax(predictions, axis=1)
y_test=np.argmax(test_labels, axis=1)

In [78]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred)) # Value between 0 and 1

print("Accuracy Percentage {} %:".format(100*accuracy_score(y_test, y_pred))) # Value between 0 and 100

Accuracy: 0.9109666666666667
Accuracy Percentage 91.09666666666666 %:
