In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf

In [None]:
# Detect hardware and pick the matching distribution strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable `tf.distribute.TPUStrategy`; the `experimental` alias is
    # deprecated and is only used as a fallback on TensorFlow builds that do not
    # expose the stable name yet.
    tpu_strategy_cls = (
        getattr(tf.distribute, 'TPUStrategy', None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = tpu_strategy_cls(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# All competition CSVs are read from one input folder; missing cells become ''.
_JIGSAW_DIR = '../input/jigsaw-multilingual-toxic-comment-classification/'


def _load_csv(file_name):
    """Read one CSV from the competition folder and replace missing values with ''."""
    return pd.read_csv(_JIGSAW_DIR + file_name).fillna('')


train = _load_csv('jigsaw-toxic-comment-train.csv')
# (Loading of 'jigsaw-unintended-bias-train.csv' is left disabled.)
test = _load_csv('test.csv')
df_val = _load_csv('validation.csv')

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Drop the fine-grained label columns. Reassigning the result (instead of
# mutating with inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError for columns already gone.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [None]:
# Preview the training frame again, after the label columns were dropped.
train.head()

In [None]:
# Preview the first rows of the validation frame.
df_val.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

**Tokenizing Dataset**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Settings for tokenisation and padding.
vocab_size = 5000  # NOTE(review): not referenced in the visible cells (the tokenizer is built with num_words=None)
embedding_dim = 16  # NOTE(review): not referenced in the visible cells (the Embedding layer hard-codes 300)
max_length = 5000  # length (in tokens) that pad_sequences pads/truncates every sequence to
trunc_type = 'post'  # value passed as `truncating` to pad_sequences
padding_type = 'post'  # value passed as `padding` to pad_sequences
oov_tok = '<OOV>'  # NOTE(review): not referenced in the visible cells

In [None]:
# Fit the tokenizer on the training comments only; validation and test text is
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(train.comment_text.values)
word_index = tokenizer.word_index


def _encode_texts(texts):
    """Convert raw texts to integer sequences and a fixed-length padded array.

    Every split goes through this one helper so that padding AND truncation are
    applied identically. Previously only the test split passed
    `truncating=trunc_type`; train/validation silently fell back to the
    pad_sequences default ('pre'), so long comments were cut from opposite ends
    at training time and at prediction time.

    Returns:
        (sequences, padded): the list of token-id lists and the padded array.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


training_sequences, training_padded = _encode_texts(train.comment_text.values)
valid_sequences, valid_padded = _encode_texts(df_val.comment_text.values)
testing_sequences, testing_padded = _encode_texts(test.content.values)

**Loading GloVe Embeddings**

In [None]:
from tqdm import tqdm

In [None]:
def load_glove_embeddings(path, dim=300):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Args:
        path: Path to the GloVe ``.txt`` file (one token followed by ``dim``
            space-separated floats per line).
        dim: Number of vector components per line.

    Returns:
        dict mapping each token to a float32 numpy array of length ``dim``.
    """
    vectors = {}
    # `with` guarantees the handle is closed even if parsing raises.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in tqdm(handle):
            # Split from the right: some tokens contain spaces themselves, so a
            # plain split(' ') would push part of the token into the number list
            # and make the float conversion fail.
            parts = line.rstrip().rsplit(' ', dim)
            if len(parts) != dim + 1:
                # Blank or malformed line: nothing usable to store.
                continue
            # float32 halves the memory of the default float64 representation.
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return vectors


embeddings_index = load_glove_embeddings('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt')

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One 300-wide row per tokenizer index (plus row 0); rows for words that have
# no entry in `embeddings_index` stay all-zero.
num_rows = len(word_index) + 1
embedding_matrix = np.zeros((num_rows, 300))
for token, row in tqdm(word_index.items()):
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

**Model**

In [None]:
# Show only the first few encoded comments; displaying the whole list would
# dump every tokenised training comment into the cell output.
training_sequences[:5]

In [None]:
with strategy.scope():
    # Single-layer LSTM classifier: an embedding layer (no pretrained weights
    # are passed, so it is learned from scratch), one LSTM layer, and a single
    # sigmoid output unit.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(
            input_dim=len(word_index) + 1,
            output_dim=300,
            input_length=max_length,
        ),
        tf.keras.layers.LSTM(300, dropout=0.3, recurrent_dropout=0.3),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

model.summary()

In [None]:
# Compile under the same distribution-strategy scope the model was built in, so
# the optimizer and metric state created by compile() belongs to that strategy.
with strategy.scope():
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

In [None]:
# Early stopping: callback configured to watch `val_loss` with a patience of 3 epochs.

cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
# Train for at most 10 epochs, validating on the validation frame.
# The batch size is 64 per replica. The EarlyStopping callback `cb` is now
# actually handed to fit(); it was created earlier but never passed, so
# training always ran the full 10 epochs regardless of val_loss.
model.fit(
    training_padded,
    train['toxic'],
    epochs=10,
    validation_data=(valid_padded, df_val['toxic']),
    batch_size=64 * strategy.num_replicas_in_sync,
    callbacks=[cb],
)

In [None]:
# Prediction: run the model over the padded test sequences.
# The network ends in a single sigmoid unit, so `pred` holds one score per test row.

pred = model.predict(testing_padded)

In [None]:
# Wrap the raw prediction array in a one-column frame named 'toxic' and display it.
submission_columns = ['toxic']
sub = pd.DataFrame(data=pred, columns=submission_columns)
sub

In [None]:
# Display the test frame (pandas abbreviates long frames in the rendered output).
test