**For the coding part of our project, we will be using the files from this Cyberbullying Detection Dataset on Kaggle:** https://www.kaggle.com/datasets/sayankr007/cyber-bullying-data-for-multi-label-classification/data?select=final_hateXplain.csv

In [None]:
!pip install vaderSentiment
!pip install spacy

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Download the small English spaCy pipeline that a later cell loads with
# spacy.load('en_core_web_sm'). The command's own output notes that a runtime
# restart may be needed before the package can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (Concatenate, Conv1D, Dense, Dropout, Embedding,
                                     Flatten, Input, Lambda, LSTM, MaxPooling1D)
from tensorflow.keras.optimizers import Adam

# spaCy English pipeline (has to be downloaded into this environment beforehand)
nlp = spacy.load('en_core_web_sm')

# single VADER analyzer instance, reused for every post
analyzer = SentimentIntensityAnalyzer()

# read the zipped HateXplain CSV straight into a DataFrame
hateXplain_df = pd.read_csv(filepath_or_buffer='/content/hateXplain.csv.zip')

# preview the first five rows
print(hateXplain_df.head(n=5))

                       post_id  annotator_id   label target  \
0  1179055004553900032_twitter             1  normal    NaN   
1  1179055004553900032_twitter             2  normal    NaN   
2  1179055004553900032_twitter             3  normal    NaN   
3  1179063826874032128_twitter             1  normal    NaN   
4  1179063826874032128_twitter             2  normal    NaN   

                                         post_tokens  
0  i dont think im getting my baby them white 9 h...  
1  i dont think im getting my baby them white 9 h...  
2  i dont think im getting my baby them white 9 h...  
3  we cannot continue calling ourselves feminists...  
4  we cannot continue calling ourselves feminists...  


In [None]:
# preprocessing function to tokenize text using spaCy
def preprocess_text_spacy(text):
    """Split ``text`` into spaCy tokens and return their surface strings.

    Only ``token.text`` is consumed, so the text goes through the tokenizer
    alone (``nlp.make_doc``) instead of the whole pipeline; the remaining
    pipeline components would only add run time here.

    Args:
        text: Raw post text. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN cell from pandas) is treated as an empty post.

    Returns:
        list[str]: Token strings in document order; ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        return []
    return [token.text for token in nlp.make_doc(text)]

# rebuild every post as one space-separated string of its spaCy tokens
hateXplain_df['cleaned_text'] = [
    " ".join(preprocess_text_spacy(post)) for post in hateXplain_df['post_tokens']
]

# VADER-based sentiment scoring
def analyze_sentiment_with_emoji(text):
    """Return the ``'compound'`` entry of VADER's polarity scores for ``text``.

    The compound value is the single overall sentiment number reported by the
    analyzer's ``polarity_scores``.
    """
    return analyzer.polarity_scores(text)['compound']

# score every cleaned post and store the result next to it
hateXplain_df['sentiment_score'] = hateXplain_df['cleaned_text'].map(analyze_sentiment_with_emoji)

# preview text and score side by side for the first few rows
print(hateXplain_df.loc[:, ['cleaned_text', 'sentiment_score']].head())


                                        cleaned_text  sentiment_score
0  i do nt think i m getting my baby them white 9...           0.0000
1  i do nt think i m getting my baby them white 9...           0.0000
2  i do nt think i m getting my baby them white 9...           0.0000
3  we can not continue calling ourselves feminist...           0.0387
4  we can not continue calling ourselves feminist...           0.0387


In [None]:
# Keras tokenizer with the vocabulary capped via num_words=10000
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(hateXplain_df['cleaned_text'])

# integer-encode each post, then bring every sequence to a length of 100
X_text = pad_sequences(
    tokenizer.texts_to_sequences(hateXplain_df['cleaned_text']),
    maxlen=100,
)

# sentiment scores as an (n_samples, 1) column so they can sit beside X_text
X_sentiment = np.array(hateXplain_df['sentiment_score'])[:, np.newaxis]

# turn the string labels into integer class ids
encoder = LabelEncoder()
y = encoder.fit_transform(hateXplain_df['label'])

In [None]:
# text-only features: the padded token-id matrix as is
X_without_emojis = X_text

# text + sentiment features: the sentiment column is appended after the token-id columns
X_with_emojis = np.hstack((X_text, X_sentiment))

# A single call splits both feature matrices and the labels with one shared
# shuffle (80/20, random_state=42), so rows stay aligned across all outputs.
# NOTE(review): the data preview shows several rows per post_id (one per
# annotator); a plain random split can place the same post in both train and
# test — consider a grouped split.
(X_train_emojis, X_test_emojis,
 X_train_no_emojis, X_test_no_emojis,
 y_train, y_test) = train_test_split(
    X_with_emojis, X_without_emojis, y, test_size=0.2, random_state=42
)

# sanity-check the resulting shapes
print(f"Shape of X_train_emojis: {X_train_emojis.shape}, X_test_emojis: {X_test_emojis.shape}")
print(f"Shape of X_train_no_emojis: {X_train_no_emojis.shape}, X_test_no_emojis: {X_test_no_emojis.shape}")

Shape of X_train_emojis: (48355, 101), X_test_emojis: (12089, 101)
Shape of X_train_no_emojis: (48355, 100), X_test_no_emojis: (12089, 100)


In [None]:
import zipfile

# where the zipped GloVe vectors live
zip_file_path = '/content/glove.6B.100d.txt.zip'

# directory that receives the unpacked archive members
extract_path = '/content/'

# unpack every member of the archive into extract_path
with zipfile.ZipFile(zip_file_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

print("File extracted")

# function to load pre-trained GloVe embeddings
def load_glove_embeddings(file_path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every usable line has the form ``word v1 v2 ... vN`` separated by
    whitespace. Blank lines and lines without any vector component are
    skipped instead of raising ``IndexError`` / storing an empty vector.

    Args:
        file_path: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict[str, numpy.ndarray]: Word mapped to its ``float32`` vector. If a
        word occurs more than once, the last occurrence wins.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # blank line or a token with no coefficients: nothing to store
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# read the extracted 100-dimensional GloVe file into a word -> vector lookup
embeddings_index = load_glove_embeddings(file_path='/content/glove.6B.100d.txt')

# builds the weight matrix handed to the Embedding layer
def create_embedding_matrix(word_index, embeddings_index, embedding_dim=100):
    """Assemble an embedding matrix aligned with a tokenizer's word indices.

    Args:
        word_index: Mapping of word -> integer row index.
        embeddings_index: Mapping of word -> pre-trained vector.
        embedding_dim: Width of each embedding vector.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i`` holds
        the pre-trained vector of the word indexed ``i``; rows for words without
        a pre-trained vector (and row 0, never assigned when indices start at 1)
        stay all-zero.
    """
    n_rows = len(word_index) + 1  # one extra row beyond the words themselves
    matrix = np.zeros((n_rows, embedding_dim))
    lookups = ((row, embeddings_index.get(token)) for token, row in word_index.items())
    for row, vector in lookups:
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# GloVe weight matrix whose rows line up with the tokenizer's word indices
embedding_matrix = create_embedding_matrix(
    word_index=tokenizer.word_index,
    embeddings_index=embeddings_index,
    embedding_dim=100,
)


File extracted


In [None]:
# Define CNN-LSTM model without emojis (text only)
#
# The targets are LabelEncoder integer class ids. A single sigmoid unit with
# binary cross-entropy is only valid for targets in {0, 1}; the recorded
# training log shows a negative, diverging loss, which is what happens when
# class ids above 1 are fed to that loss. The head below therefore has one
# softmax unit per class and is trained with sparse categorical cross-entropy,
# which accepts integer class ids directly and works for any number of classes.
num_classes = len(encoder.classes_)

model_without_emojis = Sequential()

# Embedding layer initialised from the pre-trained GloVe matrix and fine-tuned
model_without_emojis.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                                   output_dim=100,
                                   weights=[embedding_matrix],
                                   input_length=X_text.shape[1],
                                   trainable=True))

# CNN layers to extract local (n-gram like) features
model_without_emojis.add(Conv1D(64, 5, activation='relu'))
model_without_emojis.add(MaxPooling1D(pool_size=4))  # shorten the sequence seen by the LSTM

# LSTM layer to capture sequential dependencies; only the final state is kept
model_without_emojis.add(LSTM(64, return_sequences=False))

# Dropout layer for regularization
model_without_emojis.add(Dropout(0.5))

# Fully connected layers; the output is one probability per class
model_without_emojis.add(Dense(64, activation='relu'))
model_without_emojis.add(Dense(num_classes, activation='softmax'))

# Compile the model (integer targets -> sparse categorical cross-entropy)
model_without_emojis.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model_without_emojis.summary()



In [None]:
# Define CNN-LSTM model with emojis (text + sentiment)
#
# The feature matrix for this model is the padded token ids with the sentiment
# score appended as one extra trailing column. Passing that whole matrix through
# an Embedding layer treats the float sentiment score as if it were a token id.
# Here the single input is split instead: the token-id columns go through the
# Embedding/CNN/LSTM stack, and the sentiment column is concatenated to the LSTM
# output as a numeric feature. The model still takes one array of shape
# (n_samples, text_len + 1), so existing fit/evaluate calls keep working.
#
# The head uses one softmax unit per class with sparse categorical
# cross-entropy because the targets are LabelEncoder integer class ids; the
# recorded log of the sigmoid/binary cross-entropy version shows a negative,
# diverging loss.
text_len = X_text.shape[1]
num_classes = len(encoder.classes_)

combined_input = Input(shape=(text_len + 1,), name='tokens_and_sentiment')

# leading columns: token ids (the Embedding layer casts them back to integers)
token_ids = Lambda(lambda t: t[:, :text_len],
                   output_shape=(text_len,),
                   name='token_ids')(combined_input)
# trailing column: sentiment score, kept as a float feature
sentiment = Lambda(lambda t: t[:, text_len:],
                   output_shape=(1,),
                   name='sentiment_score')(combined_input)

# Embedding layer initialised from the pre-trained GloVe matrix and fine-tuned
features = Embedding(input_dim=len(tokenizer.word_index) + 1,
                     output_dim=100,
                     weights=[embedding_matrix],
                     trainable=True)(token_ids)

# CNN layers to extract local features
features = Conv1D(64, 5, activation='relu')(features)
features = MaxPooling1D(pool_size=4)(features)

# LSTM layer to capture sequential dependencies; only the final state is kept
features = LSTM(64, return_sequences=False)(features)

# Dropout layer for regularization
features = Dropout(0.5)(features)

# append the sentiment score to the learned text representation
features = Concatenate(name='text_plus_sentiment')([features, sentiment])

# Fully connected layers; the output is one probability per class
features = Dense(64, activation='relu')(features)
class_probabilities = Dense(num_classes, activation='softmax')(features)

model_with_emojis = Model(inputs=combined_input, outputs=class_probabilities)

# Compile the model (integer targets -> sparse categorical cross-entropy)
model_with_emojis.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model_with_emojis.summary()



In [None]:
def _fit_and_score(model, X_train, X_test):
    """Fit ``model`` on the training split, then evaluate it on the test split.

    Uses the notebook-level ``y_train`` / ``y_test`` labels, 5 epochs, batch
    size 32, and the test split as validation data during fitting.

    Returns:
        tuple: ``(history, test_loss, test_accuracy)``.
    """
    history = model.fit(X_train, y_train, epochs=5, batch_size=32,
                        validation_data=(X_test, y_test))
    loss, accuracy = model.evaluate(X_test, y_test)
    return history, loss, accuracy


# Model with emojis (text + sentiment): train, evaluate, report
history_with_emojis, test_loss_with_emojis, test_acc_with_emojis = _fit_and_score(
    model_with_emojis, X_train_emojis, X_test_emojis
)
print(f"Test Accuracy with Emojis: {test_acc_with_emojis:.4f}")

# Model without emojis (text only): train, evaluate, report
history_without_emojis, test_loss_without_emojis, test_acc_without_emojis = _fit_and_score(
    model_without_emojis, X_train_no_emojis, X_test_no_emojis
)
print(f"Test Accuracy without Emojis: {test_acc_without_emojis:.4f}")

Epoch 1/5
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - accuracy: 0.4298 - loss: -14.0515 - val_accuracy: 0.5376 - val_loss: -186.5952
Epoch 2/5
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.4523 - loss: -489.6204 - val_accuracy: 0.4427 - val_loss: -1858.4788
Epoch 3/5
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.4818 - loss: -3309.8594 - val_accuracy: 0.5542 - val_loss: -6523.0098
Epoch 4/5
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.4910 - loss: -9564.5430 - val_accuracy: 0.4708 - val_loss: -13258.7412
Epoch 5/5
[1m1512/1512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.4947 - loss: -18847.3594 - val_accuracy: 0.4791 - val_loss: -21706.3965
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4759 - loss: -20268.6445
Test Accuracy with Emoji