In [1]:
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense
from tensorflow.keras.utils import pad_sequences, to_categorical

In [30]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models import Word2Vec
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
data = load_dataset("tdavidson/hate_speech_offensive")

In [32]:
data_split = data['train'].train_test_split(test_size = 0.3)
train = data_split['train']
test = data_split['test']

In [33]:
train_df = train.to_pandas()
test_df = test.to_pandas()

In [34]:
# Load the stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back into a single string
    text = ' '.join(tokens)
    return text

# Apply the preprocessing function to the 'text' column
train_df['tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

In [35]:
train_df['tokens'] = train_df['tweet'].apply(word_tokenize)
test_df['tokens'] =  test_df['tweet'].apply(word_tokenize)

# Train Word2Vec model
model = Word2Vec(sentences=train_df['tokens'], vector_size=100, window=5, min_count=1, workers=4)
def get_average_word2vec(tokens, model, vector_size):
    # Initialize an empty vector
    vec = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

vector_size = model.vector_size

train_df['text_vec'] = train_df['tokens'].apply(lambda x: get_average_word2vec(x, model, vector_size))
test_df['text_vec'] = test_df['tokens'].apply(lambda x: get_average_word2vec(x, model, vector_size))

# Convert lists of vectors to a 2D array for the classifier
X_train = np.stack(train_df['text_vec'].values)
y_train = train_df['class'].values
X_test = np.stack(test_df['text_vec'].values)
y_test = test_df['class'].values

In [13]:
X_train = pad_sequences(X_train,padding='post')
X_test = pad_sequences(X_test,padding='post')

In [14]:
X_train.shape

(17348, 100)

In [36]:
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [40]:
model = Sequential()

model.add(LSTM(128,input_shape=(100,1),return_sequences=False))
model.add(Dense(3,activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 128)               66560     
                                                                 
 dense_2 (Dense)             (None, 3)                 387       
                                                                 
Total params: 66947 (261.51 KB)
Trainable params: 66947 (261.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [41]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [39]:
model.fit(X_train,y_train,epochs=30,validation_data=(X_test,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
 81/543 [===>..........................] - ETA: 19s - loss: 0.5926 - accuracy: 0.7751

KeyboardInterrupt: 

In [21]:
y_set = set(y_test)
print(y_set)

{0, 1, 2}
