In [25]:
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

/home/kwamena/miniconda3/bin/python


In [26]:

# Fetch the CSV over HTTP and parse it into the working DataFrame `df`.
# NOTE(review): the file is named sample_submission.csv and the columns printed
# further down are only 'id' plus six label columns (no free-text column) —
# confirm this is really the file the model is meant to be trained on.
DATASET_URL = 'https://raw.githubusercontent.com/cyber-kwamena/set/main/sample_submission.csv'
df = pd.read_csv(DATASET_URL)



In [16]:
# Quick schema check before any cleaning: list the columns the loaded
# DataFrame actually exposes.
column_index = df.columns
print(column_index)


Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


In [27]:

# Tokenization and Padding
#
# Pick the column to tokenize. The 'id' column holds exactly one occurrence of
# every value (see the value_counts output in this notebook), so tokenizing it
# gives the model nothing to learn from. Prefer a free-text column when the
# loaded frame has one; 'id' is kept only as a fallback so this cell still runs
# on the currently loaded file.
# NOTE(review): 'comment_text' is the assumed name of the text column in the
# training file — confirm against the real dataset.
TEXT_COLUMN = 'comment_text' if 'comment_text' in df.columns else 'id'
if TEXT_COLUMN == 'id':
    warnings.warn(
        "No 'comment_text' column found; tokenizing the 'id' column instead. "
        "Row identifiers carry no signal for the labels — load a dataset "
        "that contains the comment text.",
        stacklevel=2,
    )

# Missing entries become empty strings and everything is coerced to str so the
# tokenizer never receives NaN/float values.
texts = df[TEXT_COLUMN].fillna('').astype(str)

# Keep the 5000 most frequent tokens; everything else maps to the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Fixed-length model input: pad/truncate every sequence at the end to 200 ids.
padded = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')


In [28]:

# Multi-label classification targets: one column per category, so every row
# can carry several labels at once.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
labels = df.loc[:, label_columns]


In [29]:

# Splitting data: hold out 20% of the rows, shuffled with a fixed seed so the
# split is repeatable.
# NOTE(review): `stratify` is given the whole multi-column label frame;
# presumably each distinct row combination is treated as its own class, which
# may fail when a combination occurs only once — verify on the real labels.
split_options = dict(test_size=0.2, random_state=0, shuffle=True, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(padded, labels, **split_options)


In [30]:

# Building the RNN model
#
# Architecture: token embedding -> single LSTM layer -> one sigmoid unit per
# label, so each of the 6 categories gets an independent probability.
model = Sequential()
model.add(
    Embedding(
        # Take vocabulary size and sequence length from the tokenizer/padding
        # step so the embedding can never drift out of sync with it
        # (currently 5000 and 200).
        input_dim=tokenizer.num_words,
        output_dim=32,
        input_length=padded.shape[1],
    )
)
model.add(LSTM(64, dropout=0.2))
model.add(Dense(6, activation='sigmoid'))  # 6 categories

# For a 6-unit output the generic 'accuracy' string is resolved from the
# output shape to categorical (argmax) accuracy, which is meaningless for
# independent multi-label targets. Request per-label binary accuracy
# explicitly instead.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 32)           160000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense_1 (Dense)             (None, 6)                 390       
                                                                 
Total params: 185222 (723.52 KB)
Trainable params: 185222 (723.52 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:

# Training the model
# NOTE: the held-out test split is also passed as validation data, so the
# per-epoch validation numbers are computed on the same rows that are
# evaluated at the end of the notebook.
fit_options = {
    'epochs': 5,
    'batch_size': 32,
    'validation_data': (X_test, y_test),
}
model.fit(X_train, y_train, **fit_options)

# Save the model: written to 'toxic_model.keras' relative to the current
# working directory.
model.save('toxic_model.keras')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# Dataset dimensions, displayed as (rows, columns)
n_rows, n_cols = df.shape
(n_rows, n_cols)

(153164, 7)

In [32]:
# How often each id occurs; a count of 1 for every value means the ids are
# unique row identifiers.
id_column = df['id']
id_column.value_counts()

id
ffffce3fb183ee80    1
00001cee341fdb12    1
0000247867823ef7    1
00013b17ad220c46    1
00017563c3f7919a    1
                   ..
000634272d0d44eb    1
000663aff0fffc80    1
000689dd34e20979    1
000834769115370c    1
000844b52dee5f3f    1
Name: count, Length: 153164, dtype: int64

In [13]:

# Evaluation of the model on the held-out split.
# evaluate() is unpacked into the loss followed by the single metric the model
# was compiled with.
eval_results = model.evaluate(X_test, y_test)
loss, accuracy = eval_results
print(f"Accuracy: {accuracy*100:.2f}%")


Accuracy: 0.00%
