<a href="https://colab.research.google.com/github/devroopsaha744/HateSpeechDetect-text/blob/main/HateSpeechDetect_using_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets -q
!pip install imbalanced-learn -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout
from tensorflow.keras.utils import pad_sequences, to_categorical
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from tensorflow.keras.models import load_model

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models import Word2Vec
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
data = load_dataset("tdavidson/hate_speech_offensive")

Downloading readme:   0%|          | 0.00/5.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24783 [00:00<?, ? examples/s]

In [5]:
data_split = data['train'].train_test_split(test_size = 0.3)
train = data_split['train']
test = data_split['test']

In [6]:
train_df = train.to_pandas()
test_df = test.to_pandas()

In [7]:
# Load the stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back into a single string
    text = ' '.join(tokens)
    return text

# Apply the preprocessing function to the 'text' column
train_df['tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

In [8]:
train_df['tokens'] = train_df['tweet'].apply(word_tokenize)
test_df['tokens'] =  test_df['tweet'].apply(word_tokenize)

# Train Word2Vec model
model = Word2Vec(sentences=train_df['tokens'], vector_size=100, window=5, min_count=1, workers=4)
def get_average_word2vec(tokens, model, vector_size):
    # Initialize an empty vector
    vec = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

vector_size = model.vector_size

train_df['text_vec'] = train_df['tokens'].apply(lambda x: get_average_word2vec(x, model, vector_size))
test_df['text_vec'] = test_df['tokens'].apply(lambda x: get_average_word2vec(x, model, vector_size))

# Convert lists of vectors to a 2D array for the classifier
X_train = np.stack(train_df['text_vec'].values)
y_train = train_df['class'].values
X_test = np.stack(test_df['text_vec'].values)
y_test = test_df['class'].values

In [9]:
#Applying data-resampling
# Define the resampling pipeline
over = SMOTE(sampling_strategy= 'auto')  # Oversample the minority class to 10% of the majority class
under = RandomUnderSampler(sampling_strategy='auto')  # Undersample the majority class to 50% of the original

# Combine the oversampling and undersampling in a pipeline
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Apply resampling to the training data
X_train_res, y_train_res = pipeline.fit_resample(X_train, y_train)

In [10]:
y_train_res = to_categorical(y_train_res, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [19]:
y_train_res

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [11]:
model = Sequential()

model.add(LSTM(128,input_shape=(100,1),return_sequences=False))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(3,activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               66560     
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                                 
Total params: 75011 (293.01 KB)
Trainable params: 75011 (293.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [13]:
model.fit(X_train_res,y_train_res,epochs=50,validation_data=(X_test,y_test), batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7d88d846dbd0>

In [14]:
y_pred = model.predict(X_test)



In [15]:
y_pred

array([[0.20659773, 0.77047944, 0.02292282],
       [0.33701906, 0.08009719, 0.5828837 ],
       [0.42827934, 0.389693  , 0.18202765],
       ...,
       [0.14547609, 0.03614113, 0.8183828 ],
       [0.4858342 , 0.21205065, 0.3021151 ],
       [0.45058474, 0.17845197, 0.37096334]], dtype=float32)

In [16]:
y_pred_classes = np.argmax(y_pred, axis = 1)

In [17]:
set(y_pred_classes)

{0, 1, 2}

In [18]:
y_test_classes = np.argmax(y_test, axis = 1)

In [19]:
set(y_test_classes)

{0, 1, 2}

In [21]:
print(classification_report(y_test_classes, y_pred_classes))

              precision    recall  f1-score   support

           0       0.11      0.42      0.17       424
           1       0.94      0.82      0.87      5796
           2       0.68      0.39      0.49      1215

    accuracy                           0.73      7435
   macro avg       0.57      0.54      0.51      7435
weighted avg       0.85      0.73      0.77      7435



In [None]:
model.save("LSTM-HateSpeech.h5")

  saving_api.save_model(


In [None]:
#Metrics for LSTM
'''
       precision    recall  f1-score   support

           0       0.11      0.42      0.17       424
           1       0.94      0.82      0.87      5796
           2       0.68      0.39      0.49      1215

    accuracy                           0.73      7435
   macro avg       0.57      0.54      0.51      7435
weighted avg       0.85      0.73      0.77      7435

'''