This Jupyter notebook aims to classify bullying tweets using a variety of traditional machine learning and deep learning models, and then compares their performances with each other.

# Table of Contents

- [1 - Data Import & Data Cleaning](#1)
    - [1.1 - Installing & Importing Libraries](#1.1)
    - [1.2 - Importing Datasets](#1.2)
    - [1.3 - Data Cleaning](#1.3)
- [2 - Analyzing the dataset](#2)
    - [2.1 - Category-wise Analysis](#2.1)
    - [2.2 - Tweets length Analysis](#2.2)
- [3 - Traditional Models](#3)
    - [3.1 - Random Forest](#3.1)
    - [3.2 - Gradient Boosting](#3.2)
    - [3.3 - Naive Bayes](#3.3)
    - [3.4 - Logistic Regression](#3.4)
    - [3.5 - SVC](#3.5)
    - [3.6 - Stacking](#3.6)
    - [3.7 - Comparing the evaluations of traditional models](#3.7)
- [4 - Deep Learning Models](#4)
    - [4.1 - Simple LSTM](#4.1)
    - [4.2 - Fine Tuned LSTM](#4.2)
    - [4.3 - GRU](#4.3)
    - [4.4 - GRU (GloVe Embedding)](#4.4)
    - [4.5 - GRU (Word2Vec & Attention)](#4.5)
    - [4.6 - Bert](#4.6)
- [5 - Evaluation of all Models](#5)

<a name='1'></a>

# Data Import & Data Cleaning

<a name='1.1'></a>
## Installing & Import libraries

In [None]:
! pip install langdetect
! pip install contractions
! pip install emoji
! pip install imblearn
! pip install torch
! pip install transformers
! pip install demoji
! pip install nltk
! pip install gensim
! pip install plotly

In [None]:
!pip install keras

In [None]:
# General purpose
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")
import os

# Text cleaning
import re
import string
import emoji
import demoji
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
# Stop words for text cleaning
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')  # Download the punkt tokenizer
nltk.download('wordnet')  # Download WordNet for lemmatization

# Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from langdetect import detect, LangDetectException
from sklearn.preprocessing import OneHotEncoder
import contractions
from nltk.tokenize import word_tokenize

# Balancing
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import precision_recall_curve, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, auc

# Traditional Machine Learning Models
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC

# PyTorch for Deep Learning models
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Tensorflow for Deep Learning models
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU, Dropout, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization

# Tokenization for GRU (word2Vec Embedding)
from collections import Counter
from gensim.models import Word2Vec

# Transformers for BERT
import transformers
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

# Set seed for reproducibility
# Every library that draws random numbers in this notebook gets the same seed,
# so that re-running the notebook reproduces the same results.
import random
seed_value = 2042
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
# TensorFlow/Keras keeps its own global generator (weight initialisation,
# dropout, shuffling); it is not affected by the NumPy or PyTorch seeds.
tf.random.set_seed(seed_value)
# langdetect is non-deterministic unless its detector factory is seeded
# before the first call to detect().
from langdetect import DetectorFactory
DetectorFactory.seed = seed_value

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.despine()
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)
from plotly.offline import iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud


In [None]:
import nltk

# Writable location (on Kaggle) that receives the NLTK resources
nltk_data_dir = "/kaggle/working/nltk_data"

# Fetch the WordNet corpora used by the lemmatizer and the punkt tokenizer
# models into that location
for resource in ('wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource, download_dir=nltk_data_dir)

# Let NLTK search the download location when it resolves resources
nltk.data.path.append(nltk_data_dir)

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
import os
import zipfile

# Location of the downloaded NLTK corpora archives
nltk_data_dir = "/kaggle/working/nltk_data"
corpora_dir = os.path.join(nltk_data_dir, "corpora")

# Ensure the corpora directory exists (no error when it is already there)
os.makedirs(corpora_dir, exist_ok=True)

# Unzip the wordnet.zip and omw-1.4.zip files into the corpora directory.
# An archive that is not present is reported and skipped instead of aborting
# the cell with FileNotFoundError.
for zip_file in ["wordnet.zip", "omw-1.4.zip"]:
    zip_path = os.path.join(corpora_dir, zip_file)
    if not os.path.isfile(zip_path):
        print(f"Skipping {zip_file}: no archive found at {zip_path}")
        continue
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(corpora_dir)

# Check if the data has been unzipped correctly
print("Contents of corpora directory after unzipping:", os.listdir(corpora_dir))

# Verify that NLTK can find the WordNet corpus
import nltk
# Avoid stacking duplicate entries when the cell is executed more than once
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

try:
    from nltk.corpus import wordnet
    # Accessing the corpus forces it to load; a missing corpus raises LookupError
    print("WordNet path:", wordnet.root)
    print("Sample Synset:", wordnet.synsets('computer'))
except LookupError as e:
    print("Error accessing WordNet:", e)

<a name='1.2'></a>
## Importing Datasets

### First (Main) Dataset
In this study, the Fine-Grained Cyberbullying Dataset (FGCD) from [kaggle](https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification) was used. It consists of 47,692 labelled tweets distributed across six categories: Age, Ethnicity, Gender, Religion, Other-cyberbullying, and Not-cyberbullying, with approximately 8,000 rows for each class. Thus, it appears to be a balanced dataset.

In [None]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
input_dir = '/kaggle/input'

# Walk the input directory, list every file found and load the main dataset.
# df starts as None so that a missing file is detected right here instead of
# surfacing later as a NameError.
df = None
for dirname, _, filenames in os.walk(input_dir):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

        # Check if the file is the one you need
        if filename == 'cyberbullying_tweets.csv':
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            print("File loaded successfully!")

if df is None:
    raise FileNotFoundError(f"'cyberbullying_tweets.csv' was not found under {input_dir}")

In [None]:
df

### Second Dataset
After data cleaning, significant class distribution imbalances were detected, particularly in the 'Other-cyberbullying' and 'Not-cyberbullying' classes. To address this, the RandomOverSampler was initially implemented. However, after testing different classification models, the accuracy and F1 scores for these two classes remained notably lower than for the others. Therefore, a supplementary dataset with binary labels was introduced at this stage ([kaggle](https://www.kaggle.com/datasets/saurabhshahane/cyberbullying-dataset)). 1700 instances from its 'non-bullying' class were added to augment the 'Not-cyberbullying' class. Because the 'bully' class of the second dataset is very diverse, using it to fill the 'Other-cyberbullying' class would not be meaningful; the 'Other-cyberbullying' class is therefore removed from the main dataset to maintain consistency.

In [None]:
input_dir = '/kaggle/input'

# Walk the input directory, list every file found and load the second dataset.
# df2 starts as None so that a missing file is detected right here instead of
# surfacing later as a NameError.
df2 = None
for dirname, _, filenames in os.walk(input_dir):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

        # Check if the file is the one you need
        if filename == 'aggression_parsed_dataset.csv':
            # Read the CSV file into a DataFrame
            df2 = pd.read_csv(file_path)
            print("File loaded successfully!")

if df2 is None:
    raise FileNotFoundError(f"'aggression_parsed_dataset.csv' was not found under {input_dir}")

In [None]:
df2

In [None]:
useful_columns = ['Text', 'oh_label']
df2 = df2[useful_columns]

In [None]:
df2['oh_label'].value_counts()

In [None]:
df2 = df2.rename(columns={'Text':'tweet_text','oh_label': 'cyberbullying_type'})

In [None]:
df2

In [None]:
# Show the text (first column) of one row, selected by position.
# Row and column are both given to iloc: indexing the row Series with [0]
# relies on the deprecated positional fallback of Series.__getitem__.
df2.iloc[115840, 0]

In [None]:
# Tweets belonging to the category "not bully" (label 0) are selected.
# An explicit copy is taken because new columns are added to this frame
# afterwards; writing to a filtered view of df2 triggers pandas'
# SettingWithCopyWarning.

not_bully_df = df2[df2['cyberbullying_type'] == 0].copy()

In [None]:
# The length of each tweet is measured in whitespace-separated words.
# Missing texts are counted as empty (0 words) instead of raising
# AttributeError on a non-string value.
not_bully_df['text_len'] = not_bully_df['tweet_text'].fillna('').astype(str).str.split().str.len()
not_bully_df

The length analysis in a later stage shows that 99% of the tweets in the main dataset are shorter than 32 words. For this reason, only tweets from the second dataset that are shorter than 50 words are considered, and a random sample of them is added to the main dataset.

It is noted that the size of tweets will be reduced after cleaning and removing certain characters or stop words.

In [None]:
# Keep only tweets shorter than 50 words
not_bully_df = not_bully_df[not_bully_df['text_len']<50]
# Draw the 1700 tweets that are added to the main dataset; the draw is seeded
# explicitly (like the train/test split) so the same rows are selected on
# every run of this cell
random_rows = not_bully_df.sample(n=1700, random_state=seed_value)
random_rows

In [None]:
 # changing the labels from 0 to not_cyberbulling
# Map the numeric label 0 to the class name used by the main dataset
relabelled = random_rows['cyberbullying_type'].replace(0, 'not_cyberbullying')
# Store the new labels and discard the helper column that was only needed for
# the length filter
random_rows = random_rows.assign(cyberbullying_type=relabelled).drop(columns=['text_len'])


In [None]:
# Append the sampled rows to the main dataset. A fresh 0..n-1 index is built
# because the sampled rows keep their labels from the second dataset, which
# would otherwise collide with labels already present in df.
df = pd.concat([df, random_rows], ignore_index=True)
df

In [None]:
df.info()

In [None]:
# Renaming the columns name
df = df.rename(columns={'tweet_text': 'text', 'cyberbullying_type': 'sentiment'})

In [None]:
# Flag rows that exactly repeat an earlier row, report how many there are,
# and keep only the first occurrence of each
duplicate_mask = df.duplicated()
print('Number of duplicated tweets', duplicate_mask.sum())
df = df[~duplicate_mask]

In [None]:
df['sentiment'].value_counts()

At this stage, it is observed that the number of instances belonging to the not_cyberbullying class is significantly higher than the other classes. However, after cleaning, its count will decrease, and it can then be considered in a balanced situation along with the other classes.

<a name='1.3'></a>

## Data Cleaning

### Define Cleaning Functions

In [None]:
# Strip emojis out of a text
def remove_emoji(text):
    """Return `text` with every emoji found by demoji replaced by an empty string."""
    without_emoji = demoji.replace(text, '')
    return without_emoji

#demoji.replace_with_desc()

In [None]:
# Lowercase the text and drop line breaks, links, mentions, non-ASCII
# characters, punctuation and stop words
def remove_all_entities(text):
    """Return a lowercased, space-separated version of `text` without noise.

    Relies on the module-level `stop_words` set for the stop-word filter.
    """
    # Lowercase first, then turn carriage returns / newlines into spaces
    flattened = re.sub(r'\r|\n', ' ', text.lower())
    # Drop @mentions and http(s) links up to the next whitespace
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    # Keep ASCII characters only
    ascii_only = re.sub(r'[^\x00-\x7f]', '', without_links)

    # Delete every character listed in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = ascii_only.translate(punctuation_table)

    # Split on whitespace and keep only the tokens that are not stop words
    kept_tokens = [token for token in without_punctuation.split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Drop the run of hashtags that closes a tweet; hashtags elsewhere keep their
# word and only lose the '#' symbol
def clean_hashtags(tweet):
    """Return `tweet` without trailing hashtags and without '#' on the remaining ones."""
    trailing_hashtags = r'(\s+#[\w-]+)+\s*$'
    any_hashtag = r'#([\w-]+)'

    # One or more whitespace-separated hashtags at the very end are removed entirely
    without_trailing = re.sub(trailing_hashtags, '', tweet).strip()

    # Every hashtag that is left is replaced by its bare word
    return re.sub(any_hashtag, r'\1', without_trailing).strip()

In [None]:
# Blank out every word that contains '$' or '&'
def remove_chars(text):
    """Replace each whitespace-separated word containing '$' or '&' by an empty string.

    The blanked word still takes part in the join, so it leaves a doubled
    space behind.
    """
    cleaned_tokens = []
    for token in text.split():
        has_special_char = '$' in token or '&' in token
        cleaned_tokens.append('' if has_special_char else token)
    return ' '.join(cleaned_tokens)

In [None]:
# Collapse runs of whitespace
def remove_mult_spaces(text):
    """Replace every run of two or more whitespace characters by a single space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

In [None]:
# Keep a text only when it is detected as English
def remove_non_english(text):
    """Return `text` if langdetect classifies it as English, otherwise ""."""
    try:
        detected_language = detect(text)
    except LangDetectException:
        # Detection failed, so the text is treated as not English
        return ""
    if detected_language != "en":
        return ""
    return text

In [None]:
# Rewrite contractions in their expanded form
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [None]:
# Delete digits
def remove_numbers(text):
    """Return `text` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
# Shared WordNet lemmatizer used by the cleaning pipeline
lemmatizer = WordNetLemmatizer()
# Lemmatize every token of a text
def lemmatize(text):
    """Tokenize `text` with `word_tokenize` and join the lemmatized tokens with spaces."""
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [None]:
# Drop words that are too short
def remove_short_words(text, min_len=2):
    """Keep the whitespace-separated words of `text` that have at least `min_len` characters."""
    def is_long_enough(token):
        return len(token) >= min_len

    return ' '.join(filter(is_long_enough, text.split()))

In [None]:
# Replace elongated words with a shortened form
# Elongated words contain a character repeated three or more times in a row,
# such as "loooove" for "love" or "sooooo" for "so".
def correct_elongated_words(text):
    """Collapse every run of three or more identical word characters to one.

    Every run in the text is handled, wherever it sits in a word
    ("yeeesss" -> "yes", "aaah" -> "ah"). Doubled letters are untouched
    ("hello" stays "hello"). The rule is purely mechanical: a word whose
    correct spelling has a double letter is shortened too
    ("cooool" -> "col").

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        `text` with each elongated run replaced by a single character.
    """
    # (\w)\1{2,} matches the whole run starting at its first character, so
    # the result does not depend on the run's length or position in the word.
    return re.sub(r'(\w)\1{2,}', r'\1', text)

In [None]:
# Reduce runs of '?', '.' and '!' to their last character
def remove_repeated_punctuation(text):
    """Delete each '?', '.' or '!' that is directly followed by another one of them."""
    followed_by_punctuation = re.compile(r'[\?\.\!]+(?=[\?\.\!])')
    return followed_by_punctuation.sub('', text)

In [None]:
# Normalise whitespace
def remove_extra_whitespace(text):
    """Return the whitespace-separated words of `text` joined by single spaces."""
    tokens = text.split()
    return ' '.join(tokens)

In [None]:
def remove_url_shorteners(text):
    """Remove links that point to well-known URL-shortening services.

    A link is the shortener's domain (optionally preceded by http://,
    https:// and/or www.) followed by a '/' and its path. The domain must
    start where a host name can start, so text that merely contains a
    shortener domain inside another word is left alone, e.g. the "t.co"
    inside "internet.com".

    Parameters
    ----------
    text : str
        Text that may contain shortened links.

    Returns
    -------
    str
        `text` with the shortened links deleted.
    """
    shortener_link = (
        # Not preceded by a character that would make the domain the tail of
        # a longer host name
        r'(?<![\w.-])'
        r'(?:https?://)?(?:www\.)?'
        r'(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie'
        r'|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)'
        # The path separator right after the domain rules out longer domains
        # such as "t.com"
        r'/\S+'
    )
    return re.sub(shortener_link, '', text)

In [None]:
# Trim the tweet
def remove_spaces_tweets(tweet):
    """Return `tweet` without leading and trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

In [None]:
# Discard tweets that have too few words
def remove_short_tweets(tweet, min_words=3):
    """Return `tweet` unchanged if it has at least `min_words` words, otherwise ""."""
    word_count = len(tweet.split())
    if word_count < min_words:
        return ""
    return tweet

In [None]:
# Function to call all the cleaning functions in the correct order
def clean_tweet(tweet):
    """Run the full cleaning pipeline on one raw tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. It is "" when the tweet is not detected as
        English or has fewer than three words left after cleaning.
    """
    tweet = remove_emoji(tweet)
    tweet = expand_contractions(tweet)
    tweet = remove_non_english(tweet)
    # clean_hashtags() and remove_chars() look for '#', '$' and '&'. They
    # must run before remove_all_entities(), which deletes every punctuation
    # character; after it they could never match anything.
    tweet = clean_hashtags(tweet)
    tweet = remove_chars(tweet)
    tweet = remove_all_entities(tweet)
    tweet = remove_mult_spaces(tweet)
    tweet = remove_numbers(tweet)
    tweet = lemmatize(tweet)
    tweet = remove_short_words(tweet)
    tweet = correct_elongated_words(tweet)
    # Punctuation and http(s) links were already removed by
    # remove_all_entities(), so the two calls below only see plain words.
    tweet = remove_repeated_punctuation(tweet)
    tweet = remove_extra_whitespace(tweet)
    tweet = remove_url_shorteners(tweet)
    tweet = remove_spaces_tweets(tweet)
    tweet = remove_short_tweets(tweet)
    tweet = ' '.join(tweet.split())  # Remove multiple spaces between words
    return tweet

### Applying the cleaning functions on dataset

In [None]:
# Clean every raw tweet; the results are stored positionally in a new column
df['text_clean'] = list(map(clean_tweet, df['text']))

In [None]:
df

### Duplicated tweets after cleaning

In [None]:
print(f'{int(df["text_clean"].duplicated().sum())} duplicated tweets is removed.')
df.drop_duplicates("text_clean", inplace=True)

In [None]:
df['sentiment'].value_counts()

As shown, except for the other_cyberbullying class, all classes have approximately a balanced number of rows.

<a name='2'></a>

# Analyzing the dataset

### Define n-gram and wordcloud visualization functions

In [None]:
def get_top_n_gram(corpus, ngram_range, n=None):
    """Return the `n` most frequent n-grams of `corpus` as (n-gram, count) pairs.

    `ngram_range` is passed to CountVectorizer. The pairs are ordered by
    decreasing count; with `n=None` all of them are returned.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    # Document-term counts, summed over all documents to get one total per n-gram
    term_counts = vectorizer.fit(corpus).transform(corpus)
    totals = term_counts.sum(axis=0)
    # Pair each vocabulary entry with its total, most frequent first
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
def generate_wordcloud(sentiment, df):
    """Draw, save and show a word cloud of the cleaned tweets of one class.

    The figure is written to "<sentiment>_wordcloud.svg" in the working
    directory before it is displayed.
    """
    plt.figure(figsize=(20, 10))
    # All cleaned tweets of the requested class, merged into one text
    class_tweets = df[df['sentiment'] == sentiment]['text_clean'].values
    merged_text = " ".join(class_tweets)
    word_cloud = WordCloud(background_color='black', colormap="Dark2", collocations=False, width=2000, height=1000)
    word_cloud = word_cloud.generate(merged_text)

    plt.axis('off')
    plt.title(sentiment.capitalize(), fontsize=40)
    plt.imshow(word_cloud)

    output_file = f"{sentiment}_wordcloud.svg"
    plt.savefig(output_file, format='svg', bbox_inches='tight')
    plt.show()

In [None]:
def plot_ngrams(sentiment, df):
    """Plot the ten most frequent unigrams and bigrams of one class side by side.

    Parameters
    ----------
    sentiment : str
        Class label; rows whose 'sentiment' column equals it are used.
    df : pandas.DataFrame
        Frame with the columns 'sentiment' and 'text_clean'.
    """
    subset = df[df['sentiment'] == sentiment]
    text_sentiment = subset.text_clean.values
    
    # Calculating the top unigrams and bigrams
    unigrams = get_top_n_gram(text_sentiment, (1, 1), 10)
    bigrams = get_top_n_gram(text_sentiment, (2, 2), 10)

    # Creating DataFrames from the top unigrams and bigrams
    # (the n-gram text is stored in a column that is also called 'text_clean')
    unigrams_df = pd.DataFrame(unigrams, columns=['text_clean', 'count'])
    bigrams_df = pd.DataFrame(bigrams, columns=['text_clean', 'count'])

    # Grouping by 'text_clean', summing the counts, and sorting the values in ascending order
    unigrams_grouped = unigrams_df.groupby('text_clean').sum()['count'].sort_values(ascending=True)
    bigrams_grouped = bigrams_df.groupby('text_clean').sum()['count'].sort_values(ascending=True)

    # Converting the Series objects to DataFrames
    unigrams_df = unigrams_grouped.reset_index()
    bigrams_df = bigrams_grouped.reset_index()


    # Creating subplots with two bar plots
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # top 10 unigrams
    sns.barplot(x="count", y="text_clean", data=unigrams_df, ax=axes[0], palette="viridis", edgecolor = 'black')
    axes[0].set_xlabel("Count")
    axes[0].set_ylabel("Unigrams")
    axes[0].set_title("Top 10 Unigrams")
    axes[0].xaxis.grid(True, alpha=0.3)


    # top 10 bigrams
    sns.barplot(x="count", y="text_clean", data=bigrams_df, ax=axes[1], palette="viridis", edgecolor = 'black')
    axes[1].set_xlabel("Count")
    axes[1].set_ylabel("Bigrams")
    axes[1].set_title("Top 10 Bigrams")
    axes[1].xaxis.grid(True, alpha=0.3)


    # Showing plot
    # NOTE(review): the figure title is added after tight_layout(), so the
    # layout was computed without it -- check that it does not overlap the
    # subplot titles.
    plt.tight_layout()
    plt.suptitle(f'Top 10 Ngrams in {sentiment.capitalize()}', fontsize=16)
    #plt.savefig(f"{sentiment}_ngrams.svg", format='svg', bbox_inches='tight')
    plt.show()

<a name='2.1'></a>

## Category-wise Analysis

### Religion

In [None]:
generate_wordcloud('religion', df)
plot_ngrams('religion', df)

### Age

In [None]:
generate_wordcloud('age', df)
plot_ngrams('age', df)

### Ethnicity

In [None]:
generate_wordcloud('ethnicity', df)
plot_ngrams('ethnicity', df)

### Gender

In [None]:
generate_wordcloud('gender', df)
plot_ngrams('gender', df)

### Other_cyberbullying

In [None]:
generate_wordcloud('other_cyberbullying', df)
plot_ngrams('other_cyberbullying', df)

### Not_cyberbullying

In [None]:
generate_wordcloud('not_cyberbullying', df)
plot_ngrams('not_cyberbullying', df)

As mentioned, a significant number of tweets belonging to the "other_cyberbullying" class were removed during cleaning. Because this class is now highly unbalanced compared to the others, and because it is generic in nature, the tweets labeled with this class are removed from the dataset.

Note: It has been noted that by performing some tests, the f1 score for predicting the "other_cyberbullying" class resulted to be around 60%, a value far lower compared to the other f1 scores (around 92% using LSTM model). This supports the decision to remove this class.

In [None]:
# Dropping the other_cyberBulling class
df = df[df["sentiment"]!="other_cyberbullying"]

In [None]:
#  5 sentiments exist in the dataset
# Display names of the classes, passed as target_names / tick labels to the
# reports and plots. Their order has to follow the integer codes assigned when
# the 'sentiment' column is encoded (religion=0, age=1, ethnicity=2, gender=3,
# not_cyberbullying=4); "not bullying" is the display name of the
# 'not_cyberbullying' class.
sentiments = ["religion","age","ethnicity","gender","not bullying"]

<a name='2.2'></a>

## Tweets length analysis

In [None]:
df['text_len'] = [len(text.split()) for text in df.text_clean]

In [None]:
# Sorting the rows based on the their lenths
df.sort_values(by=['text_len'], ascending=False)

In [None]:
plt.figure(figsize=(20,5))
ax = sns.countplot(x='text_len', data=df, palette='viridis')
plt.title('Count of tweets with high number of words', fontsize=25)
ax.bar_label(ax.containers[0])
plt.ylabel('Count', fontsize=20)
plt.xlabel('Tweet Length', fontsize=20)
ax.yaxis.grid(True, alpha=0.3)

# Calculate percentiles of the tweet length (number of words)
q1 = np.percentile(df['text_len'], 25)
q2 = np.percentile(df['text_len'], 50)
q3 = np.percentile(df['text_len'], 75)
q4 = np.percentile(df['text_len'], 99)

# countplot treats text_len as a categorical variable: the bars sit at the
# positions 0, 1, 2, ... in order of increasing length, not at the length
# values themselves. A length therefore has to be converted to a bar position
# before a vertical line can be drawn at it; lengths between two observed
# values are interpolated.
observed_lengths = np.sort(df['text_len'].unique())
bar_positions = np.arange(observed_lengths.size)

def length_to_bar_position(length):
    return np.interp(length, observed_lengths, bar_positions)

# Add lines for the percentiles
plt.axvline(x=length_to_bar_position(q2), color='green', linestyle='--', label='50th Percentile (Median)')
plt.axvline(x=length_to_bar_position(q3), color='blue', linestyle='--', label='75th Percentile')
plt.axvline(x=length_to_bar_position(q4), color='red', linestyle='--', label='99th Percentile')


plt.legend(fontsize = 18)

# Save the plot as SVG
#plt.savefig('tweet_length_plot_with_quartiles.svg', format='svg', bbox_inches='tight')

plt.show()

To keep computation time manageable, a new variable called text_len is introduced to represent the word count of each cleaned tweet. The figure shows the distribution of tweet lengths and marks the 99th percentile, beyond which the lengths can be regarded as outliers. To optimize efficiency, the longest tweets are removed: the next cell keeps only the rows whose length is strictly below the 0.995 quantile of text_len. As a result, tweets with lengths exceeding 31 are eliminated, and the maximum length for all tweets is capped at 31.

In [None]:
df = df[df['text_len'] < df['text_len'].quantile(0.995)]

In [None]:
df['sentiment'].value_counts()

The length of the longest tweet is also obtained since it will be useful later.

In [None]:
max_len = np.max(df['text_len'])
max_len 

In [None]:
df.sort_values(by=["text_len"], ascending=False)

In [None]:
plt.figure(figsize=(16,5))
ax = sns.countplot(x='text_len', data=df, edgecolor = 'black',palette='viridis')
plt.title('Count of tweets with high number of words', fontsize=25)
ax.bar_label(ax.containers[0])
plt.ylabel('count', fontsize = 17)
plt.xlabel('Tweet Length', fontsize = 17)
ax.yaxis.grid(True, alpha = 0.3)
#plt.savefig('tweet_length_plot_after_filter.svg', format='svg', bbox_inches='tight')
plt.show()

<a name='3'></a>

# Traditional models

### Sentiment column encoding

The target column will be encoded


In [None]:
# Integer code of every class that is left in the dataset
sentiment_to_code = {'religion':0,'age':1,'ethnicity':2,'gender':3,'not_cyberbullying':4}
sentiment_codes = df['sentiment'].map(sentiment_to_code)

# map() yields NaN for a label that is missing from the dictionary; stop here
# with a clear message instead of passing a partly encoded target on to the
# models
unexpected_labels = df['sentiment'][sentiment_codes.isna().to_numpy()].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected sentiment labels: {list(unexpected_labels)}")

# assign() builds a new frame, so no column is written into a filtered view of
# an earlier DataFrame; the codes are stored as integers
df = df.assign(sentiment_code=sentiment_codes.astype(int))

### Train - Test split

In [None]:
df['sentiment_code'].unique()

In [None]:
X = df['text_clean']
y = df['sentiment_code']

In [None]:
# 80/20 train/test split. stratify=y keeps the class proportions of y in both
# sets, so each set contains a representative sample of each category; the
# rows are shuffled before splitting and random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value, shuffle=True)

In [None]:
(unique, counts) = np.unique(y_train, return_counts=True)
np.asarray((unique, counts)).T

The classes are not completely balanced, so it could be a good idea to oversample the training set such that all classes have the same count as the most populated one. The RandomOverSampler in Python's imbalanced-learn library is used to balance the class distribution by randomly duplicating samples from the minority class. Given the moderate level of imbalance, oversampling seems like an appropriate approach that shouldn't lead to overfitting issues and does not have the problem of sampling in NLP tasks.

### Oversampling of training set

In [None]:
y_train.value_counts()

In [None]:
# Using RandomOverSampler to completely balance the training data.
# The sampler is seeded explicitly (like the train/test split) so that the
# same rows are duplicated on every run of this cell.
ros = RandomOverSampler(random_state=seed_value)
# The sampler expects a 2-D feature array, so the tweets are passed as a
# single column; the target is passed as a plain 1-D array.
X_train, y_train = ros.fit_resample(np.array(X_train).reshape(-1, 1), np.array(y_train))
# Oversampled training set: one row per (possibly duplicated) tweet
train_os = pd.DataFrame({'text_clean': X_train.ravel(), 'sentiment': y_train})

In [None]:
X_train = train_os['text_clean'].values
y_train = train_os['sentiment'].values

In [None]:
(unique, counts) = np.unique(y_train, return_counts=True)
np.asarray((unique, counts)).T

First, creating a bag of words using CountVectorizer.

In [None]:
clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train)
X_test_cv = clf.transform(X_test)

The TF-IDF transformation is applied to associate weights to the different words based on their frequency (rarer words will be given more importance).

In [None]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

### Grid Search and Evaluation Functions

In [None]:
def perform_grid_search(clf, param_grid, X, y):
    """
    Tune `clf` over `param_grid` with a 5-fold, accuracy-scored grid search.

    The search is fitted on (X, y) using all available cores, the best
    parameters and their accuracy are printed, and the fitted GridSearchCV
    object is returned.
    """
    search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy',n_jobs=-1)
    search.fit(X, y)

    # Report the winning configuration
    print("Best Parameters:", search.best_params_)
    print("Best Accuracy Score:", search.best_score_)

    return search

In [None]:
def evaluate_classification_with_model(model_name, model, X_test, y_true, sentiments):
    """Evaluate a fitted multi-class classifier and plot its one-vs-rest ROC curves.

    Per-class scores are obtained from the first of these that the model
    offers:

    1. ``predict_proba`` (class labels then come from ``predict``);
    2. ``decision_function``, for models without probability estimates
       (class labels then come from ``predict``);
    3. ``predict``, assumed to return one score per class, as neural network
       models do (class labels are then the arg-max of the scores).

    Parameters
    ----------
    model_name : str
        Name stored under the 'Model' key and shown in the plot title.
    model : object
        Fitted model.
    X_test : array-like
        Test features in the representation the model was trained on.
    y_true : array-like
        True integer class codes of the test samples.
    sentiments : list of str
        Class display names; entry ``i`` is used for class code ``i``.
        NOTE(review): this assumes column ``i`` of the model's scores belongs
        to class code ``i`` -- confirm for models trained on other labels.

    Returns
    -------
    dict
        Keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-score'
        (macro-averaged), 'ROC-AUC' and 'PR-AUC' (mean over the classes).
    """
    # A plain array makes the per-class comparisons below independent of any
    # index the caller's object may carry
    y_true = np.asarray(y_true)

    evaluation_results = {}
    evaluation_results['Model'] = model_name

    if hasattr(model, "predict_proba"):  # models with probability estimates
        y_pred_probs = np.asarray(model.predict_proba(X_test))
        y_pred = model.predict(X_test)
    elif hasattr(model, "decision_function"):  # e.g. SVC without probability estimates
        y_pred_probs = np.asarray(model.decision_function(X_test))
        if y_pred_probs.ndim == 1:
            # Binary case: a single score per sample; build one column per class
            y_pred_probs = np.column_stack((-y_pred_probs, y_pred_probs))
        y_pred = model.predict(X_test)
    else:  # neural network models: predict() returns the class scores
        y_pred_probs = np.asarray(model.predict(X_test))
        y_pred = np.argmax(y_pred_probs, axis=1)

    # Accuracy and macro-averaged precision, recall, F1-score
    evaluation_results['Accuracy'] = accuracy_score(y_true, y_pred)
    evaluation_results['Precision'] = precision_score(y_true, y_pred, average='macro')
    evaluation_results['Recall'] = recall_score(y_true, y_pred, average='macro')
    evaluation_results['F1-score'] = f1_score(y_true, y_pred, average='macro')

    # One-vs-rest ROC-AUC and PR-AUC per class, averaged over the classes
    class_codes = range(len(sentiments))
    roc_auc = [roc_auc_score(y_true == i, y_pred_probs[:, i]) for i in class_codes]
    pr_auc = [average_precision_score(y_true == i, y_pred_probs[:, i]) for i in class_codes]
    evaluation_results['ROC-AUC'] = np.mean(roc_auc)
    evaluation_results['PR-AUC'] = np.mean(pr_auc)

    # ROC curve plotting: one curve per class plus the chance diagonal
    plt.figure(figsize=(5, 5))
    for i, sentiment in enumerate(sentiments):
        fpr, tpr, _ = roc_curve(y_true == i, y_pred_probs[:, i])
        plt.plot(fpr, tpr, label=f'ROC curve ({sentiment}) (AUC = {roc_auc[i]:0.2f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_name} Classification')
    plt.legend(loc="lower right", fontsize=12)

    plt.show()

    # Print evaluation metrics: numbers with four decimals, anything else as is
    for key, value in evaluation_results.items():
        if isinstance(value, (int, float)):  # Check if the value is numeric
            print(f"{key}: {value:.4f}")
        else:
            print(f"{key}: {value}")

    return evaluation_results


In [None]:
def conf_matrix(y, y_pred, title, labels, save=False):
    """Plot the confusion matrix of `y` against `y_pred` as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        True class codes (rows of the matrix).
    y_pred : array-like
        Predicted class codes (columns of the matrix).
    title : str
        Plot title; also used to build the file name when `save` is true.
    labels : list of str
        Tick labels for both axes, in the order of the class codes.
    save : bool, optional
        When true, the figure is also written to
        "<title>_confusion_matrix.svg". Nothing is saved by default.
    """
    fig, ax = plt.subplots(figsize=(6.5,6.5))
    sns.heatmap(confusion_matrix(y, y_pred), annot=True, cmap="viridis", fmt='g', cbar=False, annot_kws={"size":20}, ax=ax)
    plt.title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=15.5)
    ax.yaxis.set_ticklabels(labels, fontsize=15.5)
    ax.set_ylabel('Test', fontsize=15)
    ax.set_xlabel('Predicted', fontsize=15)

    if save:
        fig.savefig(f"{title}_confusion_matrix.svg", format='svg', bbox_inches='tight')
    plt.show()

<a name='3.1'></a>

## Random Forest
As the dataset is large and the model size is too big for using gridSearch in Random Forest, hyperparameters such as n_estimators, max_depth, and min_samples_split are manually tuned. The parameters that lead to better results than the default settings are selected and used to train the final model.

In [None]:
list_of_evaluations = [] # this list of all models' evaluations
runtime = {} # a dictionary containing all model's runtimes

In [None]:
start_time = time.time()

# The forest draws bootstrap samples and feature subsets at random; it is
# seeded explicitly (like the train/test split) so that the reported scores
# can be reproduced.
rf_clf = RandomForestClassifier(n_estimators=150, random_state=seed_value)
rf_clf.fit(X_train_tf, y_train)

end_time = time.time()

# Training time in seconds
runtime['RF'] = end_time - start_time

In [None]:
runtime

In [None]:
rf_pred = rf_clf.predict(X_test_tf)

print('Classification Report for Random Forest:\n',classification_report(y_test, rf_pred, target_names=sentiments))
conf_matrix(y_test,rf_pred,'Random Forest Sentiment Analysis', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('RF', rf_clf, X_test_tf, y_test,sentiments))

<a name='3.2'></a>

## Gradient Boosting Classifier
As the dataset is large and the model size is too big for using gridSearch in GB, hyperparameters such as n_estimators and max_depth are manually tuned. The parameters that lead to better results than the default settings are selected and used to train the final model.

In [None]:
start_time = time.time()

# Seeded explicitly (like the train/test split) so that the reported scores
# can be reproduced.
gb_clf = GradientBoostingClassifier(n_estimators=150, random_state=seed_value)
gb_clf.fit(X_train_tf, y_train)
end_time = time.time()

# Training time in seconds
runtime['GB'] = end_time - start_time

gb_pred = gb_clf.predict(X_test_tf)

print('Classification Report for Gradient Boosting:\n',classification_report(y_test, gb_pred, target_names=sentiments))
conf_matrix(y_test,gb_pred,'Gradient Boosting Sentiment Analysis', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('GB', gb_clf, X_test_tf, y_test,sentiments))

<a name='3.3'></a>

## Multinomial Naive Bayes classifier
GridSearch is used to find the best hyper parameters

In [None]:
# Defining the parameter grid: candidate values of the smoothing parameter alpha
nb_param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
}
# Initialize Multinomial Naive Bayes classifier
# NOTE(review): `clf` was bound to the CountVectorizer earlier in the notebook;
# from this line on it refers to the Naive Bayes estimator instead.
start_time = time.time()
clf = MultinomialNB()

# nb_clf is the fitted GridSearchCV object returned by perform_grid_search
nb_clf = perform_grid_search(clf, nb_param_grid, X_train_tf, y_train)
end_time = time.time()
# The recorded time covers the whole grid search, not a single model fit
runtime['NB'] = end_time - start_time

In [None]:
nb_pred = nb_clf.predict(X_test_tf)

print('Classification Report for Naive Bayes:\n',classification_report(y_test, nb_pred, target_names=sentiments))
conf_matrix(y_test,nb_pred,'Naive Bayes Sentiment Analysis', sentiments)

In [None]:
# evaluation_result = evaluating_new(nb_clf, X_test_tf, y_test, 'Naive Bayes')
list_of_evaluations.append(evaluate_classification_with_model('NB', nb_clf, X_test_tf, y_test, sentiments))

<a name='3.4'></a>

## LogisticRegression

In [None]:
start_time = time.time()

lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train_tf, y_train)

end_time = time.time()
runtime['LR'] = end_time - start_time

lr_pred = lr_clf.predict(X_test_tf)

print('Classification Report for Logistic Regression:\n',classification_report(y_test, lr_pred, target_names=sentiments))

conf_matrix(y_test,lr_pred,'Logistic Regression Sentiment Analysis', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('LR', lr_clf, X_test_tf, y_test, sentiments))

<a name='3.5'></a>

## SVC
The most efficient kernel found for the Support Vector Classifier (SVC) is the sigmoid kernel. It's noteworthy that its performance is comparable to that of the 'rbf' kernel, yet it exhibits significantly lower runtime.

In [None]:
# NOTE(review): timing is disabled for SVC, so `runtime` has no 'SVC' entry, and this
# section does not append SVC to list_of_evaluations either. Re-enable both together if
# SVC should appear in the comparisons.
#start_time = time.time()

# Support Vector Classifier with the sigmoid kernel, fitted on the same features as the other models
svm_clf = SVC(kernel='sigmoid')
svm_clf.fit(X_train_tf, y_train)

#end_time = time.time()
#runtime['SVC'] = end_time - start_time


# Predict on the held-out test features
svm_pred = svm_clf.predict(X_test_tf)

# Per-class report followed by the confusion matrix plot
print('Classification Report for SVM:\n',classification_report(y_test, svm_pred, target_names=sentiments))

conf_matrix(y_test,svm_pred,'SVM Sentiment Analysis', sentiments)

<a name='3.6'></a>

## Stacking

In [None]:
def get_stacking():
    '''
    Create a stacking classifier.

    Level 0 (base learners): Logistic Regression, sigmoid-kernel SVC and
    Multinomial Naive Bayes. Level 1 (meta learner): Logistic Regression trained
    on the base learners' 5-fold cross-validated predictions.

    Returns:
        An unfitted StackingClassifier.
    '''
    level0 = [
        # Labelled 'LR' (it was mislabelled 'DT') and given the same iteration
        # budget as the standalone Logistic Regression model of this notebook.
        ('LR', LogisticRegression(max_iter=1000)),
        ('SVC', SVC(kernel='sigmoid')),
        ('NB', MultinomialNB(alpha=0.5)),
    ]

    level1 = LogisticRegression()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

In [None]:
start_time = time.time()

stacking_model = get_stacking();

stacking_model.fit(X_train_tf, y_train)

end_time = time.time()
runtime['Stacking'] = end_time - start_time


In [None]:
stacking_pred = stacking_model.predict(X_test_tf)

print('Classification Report for stacking_model:\n',classification_report(y_test, stacking_pred, target_names=sentiments))

conf_matrix(y_test,stacking_pred,'stacking_model Sentiment Analysis', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('Stacking', stacking_model, X_test_tf, y_test,sentiments))

<a name='3.7'></a>

## Comparing the evaluations of traditional models

In [None]:
evaluation_df = pd.DataFrame(list_of_evaluations)

In [None]:
evaluation_df.set_index('Model', inplace=True)
evaluation_df

In [None]:
# Plot all evaluation metrics for all models:
# one line per metric column, with the models (the DataFrame index) on the x-axis
for metric in evaluation_df.columns:
    plt.plot(evaluation_df.index, evaluation_df[metric], marker='o', label=metric)

# Customize the plot
plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Evaluation Metrics for Different Models')
plt.legend()
#plt.xticks(rotation=45)  
plt.grid(True, alpha = 0.3)  
plt.tight_layout() 

# Saving the figure is disabled; uncomment to export it as SVG
#plt.savefig('evaluation_metrics_plot.svg', format='svg')

plt.show()

In [None]:
runtime

<a name='4'></a>

# Deep Learning Models

### Function for Plotting Accuracy Across Epochs

In [None]:
def plotting_funct(history_df, name_of_model):
    '''
    Plot training and validation loss, as well as training and validation accuracy, versus epoch.

    Parameters:
        - history_df: DataFrame with the columns "loss", "val_loss", "accuracy" and
          "val_accuracy" (one row per epoch), e.g. pd.DataFrame(history.history).
        - name_of_model: Name of the model. Only used in the file name of the
          optional savefig call, which is currently disabled.

    Returns:
        None
    '''
    # Using matplotlib to create a figure with 1x2 sub-plots
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))

    # (axis, training column, validation column, y-axis label, title)
    panels = [
        (axes[0], "loss", "val_loss", "Loss = Cross Entropy", 'Losses versus Epoch'),
        (axes[1], "accuracy", "val_accuracy", "Accuracy", 'Accuracy versus Epoch'),
    ]
    for ax, train_col, valid_col, y_label, title in panels:
        ax.plot(history_df[train_col], label="Training")
        ax.plot(history_df[valid_col], label="Validation")
        # Attach the legend to the axis itself instead of relying on pyplot's
        # implicit "current axes".
        ax.legend()
        ax.set_xlabel("Epoch")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.2)
        # Add small dots on top of the curves
        ax.scatter(history_df.index, history_df[train_col], c='blue', s=5, alpha=0.5)
        ax.scatter(history_df.index, history_df[valid_col], c='darkorange', s=5, alpha=0.5)

    # Accuracy is bounded, so a fixed [0, 1] frame keeps the plots comparable.
    axes[1].set_ylim(0, 1)
    # Cross-entropy is not bounded by 1, so keep the [0, 1] frame when the curves
    # fit and grow the upper limit otherwise, so no point is clipped.
    # max(1.0, nan) evaluates to 1.0, so an empty or all-NaN history still works.
    highest_loss = history_df[["loss", "val_loss"]].max().max()
    axes[0].set_ylim(0, max(1.0, 1.05 * highest_loss))

    # Saving and showing the figure
    plt.tight_layout()
    #plt.savefig(f'{name_of_model}_val_and_accuracy_vs_epochs.svg', format='svg', bbox_inches='tight')
    plt.show()

### Preparing the data

In [None]:
# Splitting the train data to train and validation
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=seed_value)

In [None]:
(unique, counts) = np.unique(y_train, return_counts=True)
np.asarray((unique, counts)).T

In [None]:
# Tokenization
tokenizer = Tokenizer()
# NOTE(review): the vocabulary is fitted on the whole df['text_clean'] column, which
# presumably also contains the validation/test tweets - confirm whether fitting on
# X_train only is preferred.
tokenizer.fit_on_texts(df['text_clean'])
# Convert every split to lists of integer word ids using that shared vocabulary
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences  = tokenizer.texts_to_sequences(X_test)
X_valid_sequences  = tokenizer.texts_to_sequences(X_valid)

In [None]:
num_unique_tokens = len(tokenizer.word_index)
print("Number of unique tokens:", num_unique_tokens)

In [None]:
# Length (number of tokens) of every training sequence
token_lengths = [len(sequence) for sequence in X_train_sequences]
mean_length   = np.mean(token_lengths)
median_length = np.median(token_lengths)
# Candidate padding length: mean plus two standard deviations, truncated to an int
max_length    = int(mean_length + 2 * np.std(token_lengths))

print(f"Mean Token Length   : {mean_length}")
print(f"Median Token Length : {median_length}")
print(f"Max Token Length    : {max_length}")

In [None]:
max_len

In [None]:
# Padding
# NOTE: this fixed value replaces the max_length computed from the token-length
# statistics (mean + 2 * std) in the earlier cell.
max_length     = 31
# padding='post' appends the zeros after the tokens; every split gets the same length
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded  = pad_sequences(X_test_sequences , maxlen=max_length, padding='post')
X_valid_padded  = pad_sequences(X_valid_sequences , maxlen=max_length, padding='post')

In [None]:
X_test_padded

<a name='4.1'></a>

## Simple LSTM Model

In [None]:
# Build the LSTM model
embedding_dim = 200
num_classes   = 5

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# The network is trained on X_*_padded, which are padded to `max_length`, so the
# Embedding's input length has to be that same value (it used the unrelated `max_len`).
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=100, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=num_classes, activation='softmax')
])

model.summary()

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_valid_padded, y_valid), callbacks=[early_stopping])

In [None]:
history_df = pd.DataFrame(history.history)
plot = plotting_funct(history_df, 'Simple LSTM')

In [None]:
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)


print('Classification Report for LSTM:\n',classification_report(y_test, y_pred, target_names=sentiments))

conf_matrix(y_test,y_pred,'LSTM', sentiments)

<a name='4.2'></a>

## Fine Tuned LSTM

In [None]:
embedding_dim = 200


# The timer covers model construction, compilation and training
start_time = time.time()
# The inputs (X_*_padded) are padded to `max_length`, so the Embedding's input
# length has to be that same value (it used the unrelated `max_len`).
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=100, dropout=0.5, recurrent_dropout=0.5), #return_sequences=True
    BatchNormalization(momentum=0.9),
    Dense(units=num_classes, activation='softmax')
])
model.summary()

# Integer class labels are used directly, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_padded, y_train, epochs=10, batch_size=512, validation_data=(X_valid_padded, y_valid), callbacks=[early_stopping])

end_time = time.time()
runtime['LSTM'] = end_time - start_time

In [None]:
# Per-epoch metrics of the fine-tuned LSTM (the label was a copy-paste of 'Simple LSTM')
history_df = pd.DataFrame(history.history)
plot = plotting_funct(history_df, 'Fine Tuned LSTM')

In [None]:
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)


print('Classification Report for LSTM:\n',classification_report(y_test, y_pred, target_names=sentiments))

conf_matrix(y_test,y_pred,'LSTM', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('LSTM', model, X_test_padded, y_test,sentiments))

<a name='4.3'></a>

## GRU

In [None]:
embedding_dim = 200


# The timer covers model construction, compilation and training
start_time = time.time()
# The inputs (X_*_padded) are padded to `max_length`, so the Embedding's input
# length has to be that same value (it used the unrelated `max_len`).
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length),
    GRU(units=100, dropout=0.5, recurrent_dropout=0.5), #return_sequences=True
    BatchNormalization(momentum=0.9),
    Dense(units=num_classes, activation='softmax')
])
model.summary()

# Integer class labels are used directly, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_padded, y_train, epochs=10, batch_size=128, validation_data=(X_valid_padded, y_valid), callbacks=[early_stopping])

end_time = time.time()
runtime['GRU'] = end_time - start_time

In [None]:
history_df = pd.DataFrame(history.history)
plot = plotting_funct(history_df, 'Simple GRU')

In [None]:
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)


print('Classification Report for GRU:\n',classification_report(y_test, y_pred, target_names=sentiments))

conf_matrix(y_test,y_pred,'GRU', sentiments)

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('GRU', model, X_test_padded, y_test,sentiments))

<a name='4.4'></a>

## GRU Using GloVe Embedding

In [None]:
# # Split all sentences
elements = (' '.join([sentence for sentence in X])).split()


In [None]:
labels = set(y)

In [None]:
def create_lookup_tables(text):
    """Create lookup tables for vocabulary.

    Ids are assigned in order of first appearance. Unlike iterating a ``set``
    (whose order for strings can change from one interpreter run to the next),
    this gives the same mapping on every run.

    :param text: Iterable of hashable tokens (e.g. the text split into words)
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # dict.fromkeys removes duplicates while keeping first-seen order
    vocab_to_int = {word: i for i, word in enumerate(dict.fromkeys(text))}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [None]:
elements.append("<UNK>")

# Map vocabularies to int
vocab_to_int, int_to_vocab = create_lookup_tables(elements)
labels_to_int, int_to_labels = create_lookup_tables(y)

print("Vocabulary of our dataset: {}".format(len(vocab_to_int)))

In [None]:
def convert_to_int(data, data_int):
    """Encode sentences as lists of integer ids.

    :param data: Iterable of sentences; each one is split on whitespace
    :param data_int: Dict mapping a word to its integer id. Words that are not
        in it are looked up under the "<UNK>" key instead.
    :return: One list of ints per sentence, in input order
    """
    encoded = []
    for sentence in data:
        ids = []
        for word in sentence.split():
            # Fall back to the unknown-word entry only when it is actually needed
            key = word if word in data_int else "<UNK>"
            ids.append(data_int[key])
        encoded.append(ids)

    return encoded

In [None]:
enc = OneHotEncoder()

# Learn the categories once, from the training labels, and reuse that same mapping
# for the other splits. Re-fitting on the validation labels (as was done before)
# would silently change the column layout if a class were missing from that split.
y_train_encoded = enc.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test_encoded = enc.transform(y_test.values.reshape(-1, 1)).toarray()
y_valid_encoded = enc.transform(y_valid.reshape(-1, 1)).toarray()

In [None]:
# Hyperparameters
max_sentence_length = 31
#embedding_vector_length = 300
dropout = 0.2

In [None]:
# the downloaded GloVe path
path_to_glove_file = 'glove.6B.200d.txt'

In [None]:
# Importing the glove vectors into a dict: word -> numpy vector
embedding_index = {}
with open(path_to_glove_file, encoding = 'utf-8') as f:
    for line in f:
        # Split off the first whitespace-separated field (the word); the rest are the coefficients
        word, coefs = line.split(maxsplit = 1)
        # Parse the space-separated numbers as single-precision floats ('f')
        coefs = np.fromstring(coefs, 'f', sep = ' ')
        embedding_index[word] = coefs
print('Found %s word vectors.' % len(embedding_index))

In [None]:
num_tokens = len(set(elements)) + 2
embedding_dim = 200
hits = 0
misses = 0

In [None]:
# Creating the Embedding Matrix
# The GRU below is trained on X_*_padded, whose ids come from the Keras `tokenizer`.
# Row i of the matrix must therefore hold the GloVe vector of the word whose
# tokenizer id is i. Indexing rows with `vocab_to_int` (a different, unrelated
# numbering) attached the vectors to the wrong tokens.
num_tokens = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding value
embedding_matrix = np.zeros((num_tokens, embedding_dim))
# Reset the counters here so re-running the cell does not accumulate them
hits = 0
misses = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words without a GloVe vector keep an all-zero row
        misses += 1
print('Converted %d words (%d misses)'% (hits, misses))

In [None]:
# Building the GRU Model
model = Sequential()
# Embedding layer initialised from `embedding_matrix` and frozen (trainable = False),
# so the pre-computed vectors are not updated during training
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.constant(embedding_matrix),
    trainable = False,
)
model.add(embedding_layer)
# Single GRU layer that returns only its last output
model.add(GRU(100, dropout=0.5, recurrent_dropout=0.5, return_sequences=False)) # return_sequences=True,
#model.add(GRU(100, dropout=dropout, recurrent_dropout=dropout))
model.add(BatchNormalization(momentum=0.9))
# One softmax output unit per distinct label
model.add(Dense(len(labels), activation='softmax'))

In [None]:
# Fitting the Model
start_time = time.time()

optimizer = keras.optimizers.Adam()

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_padded, y_train_encoded, batch_size=128, epochs=10, validation_data=(X_valid_padded, y_valid_encoded), callbacks=[early_stopping])

end_time = time.time()
runtime['GRU with GloVe'] = end_time - start_time

In [None]:
history_df = pd.DataFrame(history.history)
plot = plotting_funct(history_df, 'GRU with GloVe Embedding')

In [None]:
int_to_labels = {4: 'religion', 0: 'age', 1: 'ethnicity', 2: 'gender', 3: 'not_cyberbullying'}

# Predict with the model that was just trained. Without this, `y_pred` still holds
# the predictions of the previous model, because the prediction cell of this
# section only runs further down.
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)

# Map predictions to categories
y_pred_categories = [int_to_labels[pred] for pred in y_pred]

In [None]:
y_test_numerical = np.argmax(y_test_encoded, axis=1)

In [None]:
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)

# Both helpers take the true labels first and the predictions second, as every other
# model section of this notebook does. Swapping them exchanges precision with recall
# and transposes the confusion matrix.
print('Classification Report for GRU with Glove Embedding:\n',classification_report(y_test, y_pred, target_names=sentiments))
sentiments2 = ["age","ethnicity","gender","not bullying","religion"]
conf_matrix(y_test, y_pred,'GRU with Glove Embedding',sentiments)

In [None]:
# Final evaluation of the model
scores = model.evaluate(X_test_padded, y_test_encoded, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
list_of_evaluations.append(evaluate_classification_with_model('GRU with Glove', model, X_test_padded, y_test,sentiments))

## Attention Is All You Need
This section implements a GRU model with a Word2Vec embedding matrix and an attention mechanism, using the PyTorch library.


The code for this section has been obtained from: [Kaggle](https://www.kaggle.com/code/ludovicocuoghi/detecting-bullying-tweets-pytorch-lstm-bert)


<a name='4.5'></a>

## GRU (Word2Vec & Attention)

In [None]:
# Builds a frequency-ranked vocabulary and encodes every tweet as a fixed-length row of ids
def Tokenize(column, seq_len):
    """Tokenize texts into a left-padded integer matrix.

    Words are split on whitespace and numbered by descending frequency,
    starting at 1 (0 is the padding value).

    :param column: Iterable of texts
    :param seq_len: Number of columns of the output; shorter texts are padded
        with zeros on the left, longer ones keep their first `seq_len` tokens
    :return: (list of (word, count) pairs sorted by count, int array of shape
        (number of texts, seq_len))
    """
    # Vocabulary: most frequent word gets id 1
    ranked_words = Counter(
        word for text in column for word in text.split()
    ).most_common()
    word_to_id = {}
    for rank, (word, _count) in enumerate(ranked_words, start=1):
        word_to_id[word] = rank

    # Encode every text with the vocabulary
    encoded = [[word_to_id[word] for word in text.split()] for text in column]

    # The matrix starts as all padding; only the token positions are written
    features = np.zeros((len(encoded), seq_len), dtype=int)
    for row, ids in enumerate(encoded):
        if len(ids) > seq_len:
            features[row, :] = ids[:seq_len]
        elif ids:
            features[row, seq_len - len(ids):] = ids

    return ranked_words, features

In [None]:
vocabulary, tokenized_column = Tokenize(df["text_clean"], max_len)

In [None]:
keys = []
values = []
for key, value in vocabulary[:20]:
    keys.append(key)
    values.append(value)

### Word Embedding by Word2Vec

In [None]:
Word2vec_train_data = list(map(lambda x: x.split(), X_train))

In [None]:
EMBEDDING_DIM = 200

In [None]:
word2vec_model = Word2Vec(Word2vec_train_data, vector_size=EMBEDDING_DIM)

In [None]:
print(f"Vocabulary size: {len(vocabulary) + 1}")

In [None]:
VOCAB_SIZE = len(vocabulary) + 1 #+1 for the padding

In [None]:
# Define an empty embedding matrix of shape (VOCAB_SIZE, EMBEDDING_DIM)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Fill the embedding matrix with the vectors learned by word2vec.
# `vocabulary` is the frequency-sorted list of (word, count) pairs returned by
# Tokenize, which numbers the words by position starting at 1 (0 is padding).
# The row index therefore has to come from the position. Unpacking the pair as
# (word, token) used the word's *count* as the row index.
for token, (word, _count) in enumerate(vocabulary, start=1):
    # Words unknown to the word2vec model keep an all-zero row
    if word in word2vec_model.wv.key_to_index:
        embedding_matrix[token] = word2vec_model.wv[word]

# Print the shape of the embedding matrix
print("Embedding Matrix Shape:", embedding_matrix.shape)

### Train - Validation - Test split

In [None]:
X = tokenized_column
y = df['sentiment_code'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=seed_value)

In [None]:
(unique, counts) = np.unique(y_train, return_counts=True)
np.asarray((unique, counts)).T

In [None]:
ros = RandomOverSampler()
X_train_os, y_train_os = ros.fit_resample(np.array(X_train),np.array(y_train))

In [None]:
(unique, counts) = np.unique(y_train_os, return_counts=True)
np.asarray((unique, counts)).T

### PyTorch datasets and dataloaders

In [None]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))

In [None]:
BATCH_SIZE = 32

In [None]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True) 
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=BATCH_SIZE, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE, drop_last=True)

### PyTorch GRU modeling with Attention Layer

In [None]:
class Attention(nn.Module):
    """Attention over the recurrent outputs, queried by the final hidden state.

    hidden_dim is the size of one direction's hidden state; is_bidirectional
    tells whether the recurrent layer runs in both directions.
    """

    def __init__(self, hidden_dim, is_bidirectional):
        super().__init__()
        self.is_bidirectional = is_bidirectional
        # Width of one time step of the recurrent output
        state_dim = hidden_dim * (2 if is_bidirectional else 1)
        # Projects [final state ; step output] back to the state width
        self.attn = nn.Linear(2 * state_dim, state_dim)
        # Reduces every projected step to a single unnormalised score
        self.v = nn.Linear(state_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return one weight per time step, normalised with softmax over dim 1.

        hidden is indexed along its first dimension (last entry, or last two
        when bidirectional); encoder_outputs has the time steps on dim 1.
        """
        steps = encoder_outputs.size(1)
        # Final state: both directions concatenated when bidirectional
        if self.is_bidirectional:
            query = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        else:
            query = hidden[-1]
        # One copy of the final state per time step
        tiled_query = query.unsqueeze(1).repeat(1, steps, 1)
        combined = torch.cat((tiled_query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(combined))
        scores = self.v(energy).squeeze(2)
        return nn.functional.softmax(scores, dim=1)

In [None]:
class GRU_Sentiment_Classifier(nn.Module):
    """Embedding -> GRU -> attention -> dropout -> linear -> log-softmax classifier.

    Parameters:
        - vocab_size: Number of rows of the embedding table.
        - embedding_dim: Size of each embedding vector.
        - hidden_dim: Hidden size of the GRU (per direction).
        - num_classes: Number of output classes.
        - gru_layers: Number of stacked GRU layers.
        - dropout: Dropout probability applied to the context vector.
        - is_bidirectional: Whether the GRU is bidirectional.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, gru_layers, dropout, is_bidirectional):
        super(GRU_Sentiment_Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = gru_layers
        self.is_bidirectional = is_bidirectional

        # The Embedding layer that converts input words to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # GRU layer which processes the embeddings
        self.gru = nn.GRU(embedding_dim, hidden_dim, gru_layers, batch_first=True, bidirectional=is_bidirectional)
        # Attention layer to compute the context vector
        self.attention = Attention(hidden_dim, is_bidirectional)
        # Fully connected layer which classifies the context vector into classes
        self.fc = nn.Linear(hidden_dim * (2 if is_bidirectional else 1), num_classes)
        # Apply LogSoftmax to outputs for numerical stability
        self.softmax = nn.LogSoftmax(dim=1)
        # Dropout layer for regularisation
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        """Return (log-probabilities per class, final GRU hidden state)."""
        # Transform words to embeddings
        embedded = self.embedding(x)
        # Pass embeddings to GRU
        out, hidden = self.gru(embedded, hidden)
        # Calculate attention weights
        attn_weights = self.attention(hidden, out)
        # Calculate context vector by taking the weighted sum of GRU outputs
        context = attn_weights.unsqueeze(1).bmm(out).squeeze(1)
        # Regularise the context vector. The dropout layer was constructed but never
        # applied, so the `dropout` argument had no effect. nn.Dropout is the
        # identity in eval mode, so inference is unchanged.
        context = self.dropout(context)
        # Classify the context vector
        out = self.softmax(self.fc(context))
        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for a batch of `batch_size`."""
        # Factor determines the size of hidden states depending on bidirectionality
        factor = 2 if self.is_bidirectional else 1
        # Create the state on the device the model's weights live on, instead of
        # depending on a module-level DEVICE name defined in another cell.
        device = self.embedding.weight.device
        h0 = torch.zeros(self.num_layers * factor, batch_size, self.hidden_dim).to(device)
        return h0


In [None]:
NUM_CLASSES = 5  # We are dealing with a multiclass classification of 5 classes
HIDDEN_DIM = 100  # Size of the GRU hidden state
GRU_LAYERS = 1  # Number of stacked GRU layers

IS_BIDIRECTIONAL = False  # Set this to False for unidirectional GRU, and True for bidirectional GRU

LR = 4e-4  # Learning rate
DROPOUT = 0.5  # Probability handed to the classifier's nn.Dropout layer
EPOCHS = 10  # Number of training epochs

# Use the GPU when one is available
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GRU_Sentiment_Classifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, GRU_LAYERS, DROPOUT, IS_BIDIRECTIONAL)

model = model.to(DEVICE)

# Initialize the embedding layer with the previously defined embedding matrix
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
# Allow the embedding matrix to be fine-tuned to better adapt to our dataset and get higher accuracy
model.embedding.weight.requires_grad = True

# Set up the criterion (loss function): NLLLoss pairs with the LogSoftmax output of the model
criterion = nn.NLLLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=5e-6)

print(model)


In [None]:
# Number of batches per epoch, used to turn the summed batch losses into a mean
total_step = len(train_loader)
total_step_val = len(valid_loader)

# Early stopping on validation accuracy: the counter is compared with `>` below, so
# training stops once MORE than `early_stopping_patience` consecutive epochs fail to
# reach the best accuracy.
early_stopping_patience = 4
early_stopping_counter = 0

start_time = time.time()

valid_acc_max = 0  # Initialize best accuracy to 0

for e in range(EPOCHS):

    # Lists to host the train and validation loss of the epoch
    # (each receives exactly one value per epoch, so np.mean below is that value)
    train_loss, valid_loss = [], []
    # Lists to host the train and validation accuracy of the epoch
    train_acc, valid_acc = [], []

    # Lists to host the train and validation predictions of every batch for each epoch
    # (filled below, not read anywhere in this cell)
    y_train_list, y_val_list = [], []

    # Initialize number of total and correctly classified texts during training and validation
    correct, correct_val = 0, 0
    total, total_val = 0, 0
    running_loss, running_loss_val = 0, 0

    ####TRAINING LOOP####
    model.train()

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)  # Load features and targets in device

        # Fresh zero hidden state for every batch, sized to the batch
        h = model.init_hidden(labels.size(0))

        model.zero_grad()  # Reset gradients

        output, h = model(inputs, h)  # Get output and hidden states from GRU network

        loss = criterion(output, labels)
        loss.backward()

        running_loss += loss.item()

        optimizer.step()

        y_pred_train = torch.argmax(output, dim=1)  # Get tensor of predicted values on the training set
        y_train_list.extend(y_pred_train.squeeze().tolist())  # Transform tensor to list and add the values to the list

        correct += torch.sum(y_pred_train == labels).item()  # Count correctly classified texts per batch
        total += labels.size(0)  # Count total texts per batch

    # Mean batch loss and accuracy (in percent) of the epoch
    train_loss.append(running_loss / total_step)
    train_acc.append(100 * correct / total)

    ####VALIDATION LOOP####
    with torch.no_grad():

        model.eval()

        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            val_h = model.init_hidden(labels.size(0))

            output, val_h = model(inputs, val_h)

            val_loss = criterion(output, labels)
            running_loss_val += val_loss.item()

            y_pred_val = torch.argmax(output, dim=1)
            y_val_list.extend(y_pred_val.squeeze().tolist())

            correct_val += torch.sum(y_pred_val == labels).item()
            total_val += labels.size(0)

        valid_loss.append(running_loss_val / total_step_val)
        valid_acc.append(100 * correct_val / total_val)

    # Save model if validation accuracy does not drop below the best so far
    # (`>=`: a tie also overwrites the checkpoint and resets the counter)
    if np.mean(valid_acc) >= valid_acc_max:
        torch.save(model.state_dict(), './state_dict.pt')
        print(f'Epoch {e+1}:Validation accuracy increased ({valid_acc_max:.6f} --> {np.mean(valid_acc):.6f}).  Saving model ...')
        valid_acc_max = np.mean(valid_acc)
        early_stopping_counter = 0  # Reset counter if validation accuracy increases
    else:
        print(f'Epoch {e+1}:Validation accuracy did not increase')
        early_stopping_counter += 1  # Increase counter if validation accuracy does not increase

    # Stop before printing the epoch summary once the patience is exceeded
    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', e+1)
        break

    print(f'\tTrain_loss : {np.mean(train_loss):.4f} Val_loss : {np.mean(valid_loss):.4f}')
    print(f'\tTrain_acc : {np.mean(train_acc):.3f}% Val_acc : {np.mean(valid_acc):.3f}%')

end_time = time.time()
runtime['GRU with Word2Vec & Attention'] = end_time - start_time

In [None]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

In [None]:
def evaluate_model(model, test_loader):
    """Run the model over a data loader and collect predictions and labels.

    Parameters:
        - model: Module exposing init_hidden(batch_size) and returning
          (output, hidden) when called with (inputs, hidden).
        - test_loader: Iterable of (inputs, labels) tensor batches.

    Returns:
        (y_pred_list, y_test_list): predicted class indices and true labels as
        flat Python lists, in loader order.
    """
    model.eval()
    y_pred_list = []
    y_test_list = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            test_h = model.init_hidden(labels.size(0))

            output, _ = model(inputs, test_h)
            y_pred_test = torch.argmax(output, dim=1)
            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
            # single-sample batch into a 0-d tensor whose tolist() is a bare int,
            # which list.extend() rejects.
            y_pred_list.extend(y_pred_test.reshape(-1).tolist())
            y_test_list.extend(labels.reshape(-1).tolist())

    return y_pred_list, y_test_list

y_pred_list, y_test_list = evaluate_model(model, test_loader)

In [None]:
print('Classification Report for GRU :\n', classification_report(y_test_list, y_pred_list, target_names=sentiments))

In [None]:
conf_matrix(y_test_list,y_pred_list,'GRU with Word2Vec embedding and attention layer', sentiments)

In [None]:
# Saving the evaluation metrics for GRU (Word2Vec and Attention).
# The metrics are computed from the collected prediction lists; precision, recall and
# F1 are macro-averaged over the classes.
evaluation = {}
evaluation['Model'] = 'GRU (Word2Vec & Attention)'  # label fixed: was misspelled 'Word2Veb'
evaluation['Accuracy'] = accuracy_score(y_test_list, y_pred_list)
evaluation['Precision'] = precision_score(y_test_list, y_pred_list, average='macro')
evaluation['Recall'] = recall_score(y_test_list, y_pred_list, average='macro')
evaluation['F1-score'] = f1_score(y_test_list, y_pred_list, average='macro')

list_of_evaluations.append(evaluation)

<a name='4.6'></a>
## Bert 

In this section, an attempt is made to utilize the pre-trained BERT classifier.




The code for this section has been obtained from: [Kaggle](https://www.kaggle.com/code/ludovicocuoghi/detecting-bullying-tweets-pytorch-lstm-bert)


In [None]:
X = df['text_clean'].values
y = df['sentiment_code'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=seed_value)

In [None]:
# Seed the sampler like the train/validation/test splits so the oversampled training
# set (and therefore the BERT results) is reproducible between runs.
ros = RandomOverSampler(random_state=seed_value)
# fit_resample expects 2-D features, hence the reshape of the 1-D array of texts
X_train_os, y_train_os = ros.fit_resample(np.array(X_train).reshape(-1,1),np.array(y_train).reshape(-1,1))

In [None]:
X_train_os = X_train_os.flatten()
y_train_os = y_train_os.flatten()

In [None]:
(unique, counts) = np.unique(y_train_os, return_counts=True)
np.asarray((unique, counts)).T

### BERT Tokenization

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
def bert_tokenizer(data):
    """Encode texts with the BERT tokenizer into fixed-length tensors.

    Parameters:
        - data: Iterable of texts.

    Returns:
        (input_ids, attention_masks): two tensors with one row per text and
        MAX_LEN columns.
    """
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
            max_length=MAX_LEN,             # Choose max length to truncate/pad
            padding='max_length',           # Pad sentence to max length (replaces deprecated `pad_to_max_length`)
            truncation=True,                # Cut longer sentences explicitly so every row has MAX_LEN ids
            return_attention_mask=True      # Return attention mask
            )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors (requires equally long rows, which the padding and
    # truncation above guarantee)
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
# Tokenize train tweets
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in X_train]

# Find the longest tokenized tweet
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

In [None]:
MAX_LEN = 128

In [None]:
train_inputs, train_masks = bert_tokenizer(X_train_os)
val_inputs, val_masks = bert_tokenizer(X_valid)
test_inputs, test_masks = bert_tokenizer(X_test)

### Data preprocessing for PyTorch BERT model

In [None]:
# Convert target columns to pytorch tensors format
train_labels = torch.from_numpy(y_train_os)
val_labels = torch.from_numpy(y_valid)
test_labels = torch.from_numpy(y_test)

### Dataloaders

In [None]:
batch_size = 32

In [None]:
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Create the DataLoader for our test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

### BERT Modeling

In [None]:
class Bert_Classifier(nn.Module):
    """Pre-trained 'bert-base-uncased' encoder with a small feed-forward head.

    freeze_bert: when True, gradients are disabled for every BERT parameter so
    only the head is trained.
    """

    def __init__(self, freeze_bert=False):
        super().__init__()
        # Input width of the head, its hidden width, and the number of output labels
        bert_hidden_size, head_hidden_size, num_labels = 768, 50, 5

        # Pre-trained encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: Linear -> ReLU -> Linear
        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size, head_hidden_size),
            nn.ReLU(),
            nn.Linear(head_hidden_size, num_labels),
        )

        # Optionally keep the encoder weights fixed
        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of encoded inputs."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the encoder output, position 0 of every sequence
        # (the `[CLS]` token)
        cls_embedding = bert_out[0][:, 0, :]

        return self.classifier(cls_embedding)

In [None]:
# Builds everything needed to fine-tune BERT: the classifier, its optimizer and the LR schedule
def initialize_model(epochs=4):
    """Create the BERT classifier, its AdamW optimizer and a linear LR scheduler.

    Parameters:
        - epochs: Number of training epochs; with the number of batches in
          `train_dataloader` it fixes the scheduler's total step count.

    Returns:
        (bert_classifier, optimizer, scheduler)
    """
    # Classifier with a trainable (non-frozen) encoder, moved to the target device
    bert_classifier = Bert_Classifier(freeze_bert=False)
    bert_classifier.to(device)

    # lr is the learning rate; eps is Adam's numerical-stability term (not a decay)
    optimizer = AdamW(bert_classifier.parameters(), lr=5e-5, eps=1e-8)

    # One scheduler step per training batch, over all epochs, with no warm-up steps
    batches_per_epoch = len(train_dataloader)
    total_steps = batches_per_epoch * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    return bert_classifier, optimizer, scheduler

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Number of fine-tuning epochs for the BERT classifier
EPOCHS = 2

In [None]:
# Build the classifier, its optimizer and the LR scheduler, with the schedule sized for EPOCHS epochs
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)

### BERT Training

In [None]:
# Define Cross entropy Loss function for the multiclass classification task
# (applied to the raw logits returned by the model and the batch labels)
loss_fn = nn.CrossEntropyLoss()


def _bert_evaluate(model, dataloader):
    """Return ``(mean_loss, accuracy_percent)`` of `model` over `dataloader`.

    Both metrics are weighted by the number of samples in each batch, so a
    smaller final batch does not distort the result. Returns ``(nan, nan)``
    when the dataloader yields no samples.
    """
    # Put the model into the evaluation mode
    model.eval()

    loss_sum, n_correct, n_samples = 0.0, 0, 0

    # No parameters are updated during evaluation, so gradient tracking is
    # switched off for the whole loop
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)

            logits = model(input_ids, attention_mask)

            # loss_fn returns the batch mean; scale it back to a per-sample sum
            n_batch = labels.size(0)
            loss_sum += loss_fn(logits, labels).item() * n_batch

            # Get the predictions from the logits (index of the highest logit)
            preds = torch.argmax(logits, dim=1).flatten()
            n_correct += (preds == labels).sum().item()
            n_samples += n_batch

    if n_samples == 0:
        return float("nan"), float("nan")
    return loss_sum / n_samples, 100.0 * n_correct / n_samples


def bert_train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train `model` for `epochs` epochs, printing progress and per-epoch metrics.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``device`` and
    ``loss_fn``. Every batch of both dataloaders must unpack into
    ``(input_ids, attention_mask, labels)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits.
        train_dataloader: Batches used for the parameter updates.
        val_dataloader: Optional validation batches. When given, validation
            loss and accuracy are computed and printed after every epoch.
        epochs: Number of passes over `train_dataloader`.
        evaluation: If True, a `val_dataloader` is required. Validation is run
            whenever a `val_dataloader` is supplied, regardless of this flag.

    Raises:
        ValueError: If `evaluation` is True but no `val_dataloader` was given.
    """
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    n_train_batches = len(train_dataloader)

    print("Start training...\n")
    for epoch_i in range(epochs):
        print("-"*10)
        print("Epoch : {}".format(epoch_i+1))
        print("-"*10)
        print("-"*38)
        print(f"{'BATCH NO.':^7} | {'TRAIN LOSS':^12} | {'ELAPSED (s)':^9}")
        print("-"*38)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        ###TRAINING###

        # Put the model into the training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass and get logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update model parameters:
            # fine tune BERT params and train additional dense layers
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Print the loss values and time elapsed every 100 batches and on the last batch
            if (step % 100 == 0 and step != 0) or (step == n_train_batches - 1):
                # Time elapsed since the previous progress line
                time_elapsed = time.time() - t0_batch

                print(f"{step:^9} | {batch_loss / batch_counts:^12.6f} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Average loss per batch over the entire training data
        # (nan instead of a ZeroDivisionError when the dataloader is empty)
        avg_train_loss = total_loss / n_train_batches if n_train_batches else float("nan")

        ###EVALUATION###

        if val_dataloader is not None:
            val_loss, val_accuracy = _bert_evaluate(model, val_dataloader)
            val_loss_txt = f"{val_loss:^10.6f}"
            val_accuracy_txt = f"{val_accuracy:^17.2f}"
        else:
            # No validation data: keep the table layout, leave the columns blank
            val_loss_txt = f"{'-':^10}"
            val_accuracy_txt = f"{'-':^17}"

        # Print performance over the entire training data
        time_elapsed = time.time() - t0_epoch
        print("-"*61)
        print(f"{'AVG TRAIN LOSS':^12} | {'VAL LOSS':^10} | {'VAL ACCURACY (%)':^9} | {'ELAPSED (s)':^9}")
        print("-"*61)
        print(f"{avg_train_loss:^14.6f} | {val_loss_txt} | {val_accuracy_txt} | {time_elapsed:^9.2f}")
        print("-"*61)
        print("\n")

    print("Training complete!")

In [None]:
start_time = time.time()


bert_train(bert_classifier, train_dataloader, val_dataloader, epochs=EPOCHS)

end_time = time.time()
runtime['Bert'] = end_time - start_time