## Libraries

In [30]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

# setting the column width to none to display full text of the comment
pd.set_option('display.max_colwidth', None)

# setting up GPUs: let TensorFlow grow GPU memory on demand rather than
# reserving all of it up front
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth cannot be changed once the GPUs have been initialized
    # (e.g. when this cell is first run late in a session); report the problem
    # instead of aborting the notebook.
    print(err)

## Importing Data


In [3]:
# Load the labelled training comments from the zipped CSV.
# NOTE(review): "/content" is presumably the Colab working directory — adjust
# the path when running elsewhere.
data = pd.read_csv("/content/train.csv.zip")

In [4]:
# Inspect column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [5]:
# Peek at five random rows (unseeded, so the rows differ on every run).
data.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
98705,1000e093d62f4731,the world. Just give it up mate you are a sad individual kensington even more fucking sadder bet your an american as well do the owrld a favour and get lostt,1,0,1,0,1,0
128903,b15ad896aee9ed7c,Chapter 8 footnote 168 and 168 WCR\n\nRPJ,0,0,0,0,0,0
118793,7adbab3eb0f95913,"""\nAlright, but just an update: I've just come across several Broadcast & Cable articles from '99 that have ratings info on almost all five of the series' original UPN episodes! (turns out they were way lower then I thought, lol) So, there's no rush for that info, I'm settled for now. Still, though, I'm a total n00b when it comes to ratings; I'm revising the pilot, so can you tell me if you know what a 1.4/2 Nielsen Rating translates to in numbers? Cheers, '''''' I am Jack's lack of surprise """,0,0,0,0,0,0
107248,3d4ea4092721dfb9,"Going by the chart at WP:CANVASS, I limited the scale to those users who had already contributed to that thread, phrased the notice as neutrally as my wordsmithing abilities would allow, and left a comment at the primary discussion that I had done so. Craftyminion left a comment at that thread at 01:50 2009-11-10 server time. If I omitted anyone, please let me know or notify them yourself.\n I posted exactly the same message to the talkpage of everyone who had commented at that thread, with the exceptions of myself and Keepcalmandcarryon; example notification. My notification to KCACO also included an apology for notifying them of a thread to which they had recently contributed and may reasonably be assumed to be watching; given that that thread has been petering out, as long AN/I threads are wont to do, I decided to err on the side of caution. - 2/0 (cont.)",0,0,0,0,0,0
44247,7629e6b88b80df94,"Gu Kalia\ngo watch some news, or just search those things online, and see if they are fake or not. go ahead. i challenge you. i just reverted your change.",0,0,0,0,0,0


Let's look at some of the toxic comments.


In [6]:
# Five random rows whose `toxic` label is set (unseeded sample).
data.loc[data['toxic'].eq(1)].sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
137571,e027d1e7c3589a57,"Make that four, fifth is underway. This is rediculous! 18:32, 29 Dec 2004 (UTC)",1,0,0,0,0,0
46556,7c63fc6015529a59,DIFFERENT IP ADDRESS - 90.196.78.205 - SEE WHAT I MEAN ! BAN WHAT BAN ?\n\nGO GET YOURSELF A REAL LIFE INSTEAD OF OBSESSIVELY HANGING AROUND WIKIPEDIA DELUDING YOURSELF OF PETTY POWERS WEHN YOU OBVIOUSLY CAN'T ASSERT YOURSELF IN THE REAL WORLD.,1,0,0,0,0,0
102509,2498248e712000ec,STOP GETTING RID OF THE FLAGS! THEY HAVE BEEN IN THE UFC WIKI PAGES EVER SINCE THE START! THEY EVEN SHOW THE FLAGS IN TALE OF THE TAPE! Annoying cunts like yourself are the reason people say Wikipedia is shit.,1,0,0,0,0,0
99202,12a5ad583296b545,Fuck you bitch \n\nFuck you bitch,1,1,1,0,1,0
36778,62399956a195230e,Do I know you? ==BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? 
BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? 
BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n Do I know you? BeCauSe You aRe a FGGT!\n==,1,0,0,0,0,0


## Preprocessing


In [7]:
from tensorflow.keras.layers import TextVectorization

Splitting into X and y 

In [8]:
# Features: the raw comment strings.
X = data.loc[:, 'comment_text']
# Targets: every column from the third one onwards, as a 2-D array.
y = data.iloc[:, 2:].values

Maximum number of words in our vocabulary


In [10]:
# Upper bound on the number of tokens kept in the vectorizer's vocabulary.
MAX_FEATURES = 200_000

In [11]:
# Maps each comment to a fixed-length sequence of integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

We use `TextVectorization` to turn the training text into sequences of integer token ids, i.e. a numerical representation of our text data. The word embeddings themselves are learned later by the model's `Embedding` layer.

In [12]:
# Learn the vocabulary from the raw comment strings.
vectorizer.adapt(X.values)

In [13]:
# Show the first 100 entries of the learned vocabulary.
print(vectorizer.get_vocabulary()[:100])

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is', 'that', 'in', 'it', 'for', 'this', 'not', 'on', 'be', 'as', 'have', 'are', 'your', 'with', 'if', 'article', 'was', 'or', 'but', 'page', 'my', 'an', 'from', 'by', 'do', 'at', 'about', 'me', 'so', 'wikipedia', 'can', 'what', 'there', 'all', 'has', 'will', 'talk', 'please', 'would', 'its', 'no', 'one', 'just', 'like', 'they', 'he', 'dont', 'which', 'any', 'been', 'should', 'more', 'we', 'some', 'other', 'who', 'see', 'here', 'also', 'his', 'think', 'im', 'because', 'know', 'how', 'am', 'people', 'why', 'edit', 'articles', 'only', 'out', 'up', 'when', 'were', 'use', 'then', 'may', 'time', 'did', 'them', 'now', 'being', 'their', 'than', 'thanks', 'even', 'get', 'make', 'good', 'had']


In [14]:
# Sanity check: vectorize one sentence and show its first five token ids.
print(vectorizer("Hello, How are you?")[:5])

tf.Tensor([288  73  20   7   0], shape=(5,), dtype=int64)


In [15]:
%%time
# Vectorize every comment in a single call; the result is held in memory as one tensor.
vectorized_text = vectorizer(X.values)

CPU times: user 2.82 s, sys: 1.65 s, total: 4.47 s
Wall time: 4.47 s


In [16]:
# Display the vectorized comments (one row of token ids per comment).
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

Data pipeline for TensorFlow

In [17]:
# Build the tf.data input pipeline over (token ids, labels) pairs.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle into ONE fixed order. The train/val/test splits are carved out of this
# dataset with take()/skip(); with the default reshuffle_each_iteration=True the
# order would be redrawn on every iteration, so the splits would overlap and
# training examples would leak into validation and test.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [18]:
# Pull a single batch to inspect what the pipeline yields.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [19]:
# Confirm the shapes of the feature batch and the label batch.
print(batch_X.shape, batch_y.shape)

(16, 1800) (16, 6)


## train, val, test


In [20]:
# Carve the batched dataset into consecutive ~70% / ~20% / ~10% slices.
n_batches = len(dataset)
n_train, n_val, n_test = (int(n_batches * frac) for frac in (.7, .2, .1))

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(int(n_batches * .9)).take(n_test)

In [21]:
# len() of a batched dataset counts batches, not individual comments,
# so the numbers are reported as batches.
print(f'Training Batches: {len(train)}, Validation Batches: {len(val)}, Test Batches: {len(test)}')

Training Examples: 6981, Validation Examples: 1994, Test Examples: 997


## Neural Network


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [23]:
def build_model(vocab_size=None, embedding_dim=32, lstm_units=32, num_labels=6):
    """Build and compile the multi-label comment classifier.

    Architecture: Embedding -> Bidirectional LSTM -> three ReLU dense layers
    (128, 256, 128 units) -> sigmoid output with one unit per label.

    Args:
        vocab_size: Number of rows in the embedding table. When ``None``
            (default) it is resolved at call time to ``MAX_FEATURES + 1``.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Units per LSTM direction (the bidirectional wrapper
            concatenates both directions).
        num_labels: Number of independent output labels.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        the Adam optimizer.
    """
    if vocab_size is None:
        # Resolved here rather than in the signature so that a changed
        # MAX_FEATURES is picked up without re-defining the function.
        vocab_size = MAX_FEATURES + 1

    model = Sequential([
        # Embedding layer
        Embedding(vocab_size, embedding_dim),
        # Bidirectional LSTM layer
        Bidirectional(LSTM(lstm_units, activation='tanh')),
        # dense layers
        Dense(128, activation='relu'),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        # final layer: one sigmoid unit per label, so each label is scored
        # independently of the others
        Dense(num_labels, activation='sigmoid'),
    ])
    # Compiling the model
    model.compile(loss='BinaryCrossentropy', optimizer='Adam')

    return model

In [24]:
# Instantiate the network and print its layer-by-layer summary.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [31]:
%%time
# Training the model for a single epoch, with the validation split passed as validation data
history = model.fit(train, epochs = 1, validation_data= val)

CPU times: user 10min 43s, sys: 10.2 s, total: 10min 53s
Wall time: 11min


ValueError: ignored

In [33]:
# Metrics recorded by fit(), one value per epoch.
history.history


{'loss': [0.05893781781196594], 'val_loss': [0.04865012317895889]}

In [67]:
# Vectorize a single sample comment to try the trained model on.
input_text = vectorizer("Fuck you bitch")

In [61]:
# Add a leading batch axis so the single sequence is passed as a batch of one.
single_batch = np.expand_dims(input_text, axis=0)
res = model.predict(single_batch)



In [62]:
# Predicted score for each of the six output labels.
res

array([[0.99972755, 0.40807196, 0.9861391 , 0.01927693, 0.8981676 ,
        0.13493586]], dtype=float32)

### result = [toxic, severe_toxic, obscene, threat, insult, identity_hate] 

In [63]:
# Turn the scores into 0/1 flags using a 0.4 cut-off.
np.greater(res, 0.4).astype(int)

array([[1, 1, 1, 0, 1, 0]])