In [1]:
import pandas as pd

df_train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
df_test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
df_test_labels = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
X_train = df_train['comment_text']
y_train = df_train[df_train.columns[2:]].values

X_test = df_test['comment_text']

In [5]:
X_train

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)

In [8]:
vocab_size = len(tokenizer.word_index)

In [9]:
max_len = max(len(x) for x in tokenized_train)
# max_len_2 = max(len(x) for x in tokenized_test)
print(max_len)

X_train = pad_sequences(tokenized_train, maxlen = max_len, truncating = 'post')
X_test = pad_sequences(tokenized_test, maxlen = max_len, truncating = 'post')

1403


In [10]:
X_train.shape

(159571, 1403)

In [11]:
y_train.shape

(159571, 6)

In [12]:
from sklearn.model_selection import train_test_split
X_t, X_v, y_t, y_v = train_test_split(X_train,y_train,test_size=0.2,random_state=1)

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_t,y_t))
train_dataset = train_dataset.shuffle(10000).batch(256).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((X_v,y_v))
val_dataset = val_dataset.batch(256).prefetch(tf.data.AUTOTUNE)

In [14]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-03-19 20:24:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-19 20:24:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... 

  pid, fd = os.forkpty()


connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-19 20:24:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-03-19 20:27:34 (5.08 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [15]:
def get_glove_embedding(glove_path):
    embedding_index={}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            line = line.split()
            word = line[0]
            vector = line[1:]
            embedding_index[word]=vector
    return embedding_index

In [16]:
embedding_index = get_glove_embedding('/kaggle/working/glove.6B.100d.txt')

In [17]:
vocab_size = len(tokenizer.word_index)
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size+1,embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

print(embedding_matrix.shape)

(210338, 100)


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, Input, GlobalMaxPool1D

In [19]:
#recurrent_dropout applies dropout to the recurrent connections of the LSTM while dropout applies to input connection.
#GlobalMaxPool1D() takes the maximum value across all timesteps in the sequence.
#LSTM output is a (batch_size, timesteps, features) since return_sequence = True, it converts it to (batch_size,features)
#features_dim = unites in lstm

from tensorflow.keras.models import Model

strategy = tf.distribute.MirroredStrategy()
print("no. of gpus ", strategy.num_replicas_in_sync)

with strategy.scope():
    inp = Input(shape=(max_len,))
    x = Embedding(vocab_size+1,embedding_dim,weights=[embedding_matrix], trainable=True)(inp)
    x = Bidirectional(LSTM(64,return_sequences=True,dropout=0.2,recurrent_dropout=0.2))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(64,activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(6,activation='sigmoid')(x)
    model_1 = Model(inputs=inp,outputs=x)
    
    model_1.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

no. of gpus  2


In [20]:
model_1.summary()

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint
best_model = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True
)

#using multiple cpu cores for faster dataloading
#it only works with keras datagenerator not numpy arrays

history=model_1.fit(train_dataset, epochs=2, validation_data=val_dataset, callbacks=[best_model])

Epoch 1/2
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2316s[0m 5s/step - accuracy: 0.6164 - loss: 0.1408 - val_accuracy: 0.9944 - val_loss: 0.0527
Epoch 2/2


  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2294s[0m 5s/step - accuracy: 0.9284 - loss: 0.0537 - val_accuracy: 0.9942 - val_loss: 0.0473


In [22]:
model_1.save("final_model.keras")

In [10]:
model = keras.models.load_model("/kaggle/input/comment_toxicity_bilstm_glove_1/keras/default/1/final_model.keras")
model.summary()

In [11]:
sample_sub = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

In [12]:
df_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [13]:
sample_sub

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.5,0.5,0.5,0.5,0.5,0.5
153160,fffd7a9a6eb32c16,0.5,0.5,0.5,0.5,0.5,0.5
153161,fffda9e8d6fafa9e,0.5,0.5,0.5,0.5,0.5,0.5
153162,fffe8f1340a79fc2,0.5,0.5,0.5,0.5,0.5,0.5


In [14]:
predictions = model.predict(X_test)
predictions.shape

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1033s[0m 216ms/step


(153164, 6)

In [15]:
submission = pd.DataFrame(predictions, columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
submission.insert(0, 'id', df_test['id'])

In [16]:
submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.986040,0.318938,0.956028,0.046706,0.839115,0.190805
1,0000247867823ef7,0.001342,0.000005,0.000177,0.000019,0.000106,0.000058
2,00013b17ad220c46,0.002611,0.000015,0.000467,0.000045,0.000245,0.000125
3,00017563c3f7919a,0.000849,0.000003,0.000074,0.000009,0.000051,0.000028
4,00017695ad8997eb,0.002584,0.000010,0.000291,0.000032,0.000214,0.000073
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.788785,0.008212,0.320807,0.010364,0.259854,0.019570
153160,fffd7a9a6eb32c16,0.021753,0.000095,0.001537,0.000329,0.001943,0.000631
153161,fffda9e8d6fafa9e,0.001454,0.000007,0.000184,0.000028,0.000109,0.000082
153162,fffe8f1340a79fc2,0.001548,0.000005,0.000142,0.000027,0.000089,0.000080


In [17]:
sample_sub

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.5,0.5,0.5,0.5,0.5,0.5
153160,fffd7a9a6eb32c16,0.5,0.5,0.5,0.5,0.5,0.5
153161,fffda9e8d6fafa9e,0.5,0.5,0.5,0.5,0.5,0.5
153162,fffe8f1340a79fc2,0.5,0.5,0.5,0.5,0.5,0.5


In [18]:
submission.to_csv('submission.csv', index=False)