In [108]:
import pandas as pd
import numpy as np

import tensorflow as tf
from keras.layers import Dense, LSTM, Embedding, TextVectorization, Dropout, Bidirectional
from keras.preprocessing.text import one_hot 
from keras.models import Sequential


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



### Reading Data

In [3]:
train = pd.read_csv('train.csv')
#test = pd.read_csv('test.csv')

In [5]:
train.head()
df = train.drop(columns=['id'])

In [6]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   159571 non-null  object
 1   toxic          159571 non-null  int64 
 2   severe_toxic   159571 non-null  int64 
 3   obscene        159571 non-null  int64 
 4   threat         159571 non-null  int64 
 5   insult         159571 non-null  int64 
 6   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 8.5+ MB


In [9]:
df.shape

(159571, 7)

# Data Preprocessing

In [28]:
x = df['comment_text']
y = df[df.columns[1:]]

In [29]:
y_arr =y.values #converting the df to array

In [30]:
MAX_FEATURE = 10000 #VOCAB SIZE

In [33]:
vectorizer = TextVectorization(
    max_tokens= MAX_FEATURE,
    output_sequence_length=1800,
    output_mode='int'
    )

In [34]:
vectorizer.adapt(x.values)

In [44]:
vectorizer('hello world fuck you man')[:10]

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([286, 261, 139,   7, 440,   0,   0,   0,   0,   0], dtype=int64)>

In [45]:
vectorized_text = vectorizer(x.values)


In [48]:
x.values.shape

(159571,)

In [46]:
vectorized_text #vectorizer pads up the remaining word places with 0

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[ 643,   76,    2, ...,    0,    0,    0],
       [   1,   54, 2506, ...,    0,    0,    0],
       [ 425,  440,   70, ...,    0,    0,    0],
       ...,
       [   1, 7329,  383, ...,    0,    0,    0],
       [   5,   12,  533, ...,    0,    0,    0],
       [   5,    8,  130, ...,    0,    0,    0]], dtype=int64)>

In [89]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y_arr))
dataset = dataset.cache()
dataset = dataset.shuffle(16000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [91]:
batch_X, batch_Y = dataset.as_numpy_iterator().next()

In [92]:
len(dataset)#its batches (16)

9974

In [96]:
len(batch_X[9])

1800

In [102]:
train= dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test= dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [106]:
len(train),len(val),len(test)

(6981, 1994, 997)

In [107]:
train_generator = train.as_numpy_iterator()

# Model Building

In [114]:

model = Sequential()

#Input Embedding Layer +1 is for the Unkown token
model.add(Embedding(MAX_FEATURE+1,32))

#layer of LSTM
model.add(Bidirectional(LSTM(128,activation='relu',return_sequences=True)))
model.add(Bidirectional(LSTM(64,activation='relu',return_sequences=True)))
model.add(Bidirectional(LSTM(32,activation='relu')))

#layer of Fully connected Networks
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))

#Final 6 classifier nuerons
model.add(Dense(6,activation='sigmoid'))





In [115]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [116]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          320032    
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 256)        164864    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        164352    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                      

In [118]:
history= model.fit(train,epochs=2,validation_data=val,verbose=2)    

Epoch 1/2


KeyboardInterrupt: 