In [13]:

import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [14]:
df=pd.read_csv("toxiccomments.csv")

In [15]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [16]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [18]:
df.iloc[3]['comment_text']

'"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "'

In [17]:
df[df.columns[2:]].iloc[3]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 3, dtype: int64

In [19]:
from tensorflow.keras.layers import TextVectorization

In [20]:
X=df['comment_text']
y=df[df.columns[2:]].values

In [21]:
#no. of words in vocab
MAX_WORDS=150000

In [22]:
vectorizer=TextVectorization(max_tokens=MAX_WORDS,output_sequence_length=2000,
                            output_mode='int')




In [23]:
vectorizer.adapt(X.values)




In [24]:
 vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [25]:
vectorizer('Hello world Whats up Hello World')[:6]

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([288, 263, 651,  81, 288, 263], dtype=int64)>

In [27]:
vectorized_text=vectorizer(X.values)

In [28]:
vectorized_text

<tf.Tensor: shape=(159571, 2000), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [29]:
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset=dataset.cache()
dataset=dataset.shuffle(200000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [30]:
batch_X,batch_y=dataset.as_numpy_iterator().next()

In [31]:
len(dataset)

9974

In [32]:
train=dataset.take(int(len(dataset)*.7))
val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.1))

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
model=Sequential()
model.add(Embedding(MAX_WORDS+1,32))
model.add(Bidirectional(LSTM(32,activation='tanh')))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(6,activation='sigmoid'))


In [34]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')




In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          4800032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [36]:
history=model.fit(train,epochs=1,validation_data=val)



In [37]:
batch=test.as_numpy_iterator().next()

In [38]:
input_str=vectorizer('What a stupid guy!')


In [39]:
input_str

<tf.Tensor: shape=(2000,), dtype=int64, numpy=array([ 40,   6, 563, ...,   0,   0,   0], dtype=int64)>

In [40]:
batch_X,batch_y=test.as_numpy_iterator().next()

In [41]:
(model.predict(batch_X)>0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [42]:
np.expand_dims(input_str,0)

array([[ 40,   6, 563, ...,   0,   0,   0]], dtype=int64)

In [43]:
res=model.predict(np.expand_dims(input_str,0))



In [44]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [45]:
pre=Precision()
recall=Recall()
ca=CategoricalAccuracy()

In [46]:
for batch in test.as_numpy_iterator():
    X_true,y_true=batch
    yhat=model.predict(X_true)
    y_true=y_true.flatten()
    yhat=yhat.flatten()
    pre.update_state(y_true,yhat)
    recall.update_state(y_true,yhat)
    ca.update_state(y_true,yhat)
    















In [47]:
print(f'Precision: {pre.result().numpy()},Recall:{recall.result().numpy()},Accuracy:{ca.result().numpy()}')

Precision: 0.8816362023353577,Recall:0.5831894278526306,Accuracy:0.45336008071899414


In [48]:
#Testing

In [49]:
import tensorflow as tf
import gradio as gr

In [50]:
model.save('tox.h5')

  saving_api.save_model(


In [51]:
model = tf.keras.models.load_model('tox.h5')

In [52]:
input_str = vectorizer('I will kill you!')


In [53]:
ans = model.predict(np.expand_dims(input_str,0))



In [60]:
ans

array([[0.82320905, 0.04896668, 0.56949294, 0.07345247, 0.46707335,
        0.10100992]], dtype=float32)

In [61]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [62]:
interface = gr.Interface(fn=score_comment, 
                         inputs="text",
                        outputs='text')

In [69]:
with gr.Blocks(theme=gr.themes.Default(primary_hue=gr.themes.colors.red, secondary_hue=gr.themes.colors.pink)) as demo:
    ...

In [70]:
interface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on public URL: https://0eb7986e8106bf4eb4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




