In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [4]:
os.path.join('jigsaw-toxic-comment-classification-challenge','train.cvs','train.csv')

'jigsaw-toxic-comment-classification-challenge\\train.cvs\\train.csv'

In [5]:
#import the dataset
df=pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv','train.csv'))

In [6]:
#preprocessing
#step-1 : tokenize the data (text vectorization)
from tensorflow.keras.layers import TextVectorization

In [7]:
#x->all the comments
#y->related labeles
x=df['comment_text']
y=df[df.columns[2:]].values

In [8]:
MAX_WORDS=200000 #number of words in the vocab

In [9]:
#output_sequence_length=each sentence word length
#output_mode=mode of token values : map every word into an 'int' value
vectorizer=TextVectorization(max_tokens=MAX_WORDS,
                            output_sequence_length=1800,
                            output_mode='int')

In [10]:
#import comments to the vectorizer vocabulary
vectorizer.adapt(x.values)

In [11]:
#vectorizer.get_vocabulary()-to see the mapped words

In [None]:
#vectorized_text=vectorizer(x.values)
vocab = vectorizer.get_vocabulary()
config = vectorizer.get_config()

# Save the vocabulary and configuration to a JSON file
with open('vectorizer_config_new.json', 'w', encoding='utf-8') as f:
    json.dump({'vocab': vocab, 'config': config}, f, ensure_ascii=False, indent=4)

In [12]:
#vectorized_text = to see the all vectorized sentences

In [13]:
#create dataset with tensorflow data pipeline : suitable for train model with large amount of data
#map>cache>shuffle>batch>prefetch

dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset=dataset.cache() #cache data
dataset=dataset.shuffle(160000) #shuffle data
dataset=dataset.batch(16) #batches the data : each batch has 16 samples
dataset=dataset.prefetch(8) #helps to prevent bottlenecks

In [14]:
#partitioning data

train=dataset.take(int(len(dataset)*.7)) #assign 70% of data from the dataset for training the model
val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) #skip 70% of data and take other 20% of data for validation
test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) #skip 90% of data and take other 10% for test

In [15]:
#prepocessing done
#build nueral network->

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [17]:
model=Sequential() #instantiate sequential api
#create the embedding layer
model.add(Embedding(MAX_WORDS+1,32))

#create the lstm layer
#bidirectional=pass information both backwards and forwards(checking sentences)
model.add(Bidirectional(LSTM(32,activation='tanh')))

#feature exraction layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))

#map dence output to a value between 0-1 using sigmoid activation
model.add(Dense(6,activation='sigmoid'))

In [18]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [22]:
#train the model
#epochs=how long the model will train
history=model.fit(train, epochs=1, validation_data=val)



In [23]:
#training done
#predictions=>

In [24]:
input_text=vectorizer('i love you')

In [25]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [26]:
batch=test.as_numpy_iterator().next()

In [27]:
batch_x, batch_y=test.as_numpy_iterator().next()

In [28]:
model.predict(np.expand_dims(input_text,0))#predict a value we set



array([[0.05501266, 0.00068528, 0.0133937 , 0.00277676, 0.01097367,
        0.0043622 ]], dtype=float32)

In [29]:
(model.predict(batch_x)>0.5).astype(int) #predict a batch in pipeline



array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [30]:
#Prediction done
#Evaluate the model=>

In [31]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [32]:
pre=Precision()
re=Recall()
acc=CategoricalAccuracy()

In [33]:
for batch in test.as_numpy_iterator():
    #unpacking the batches
    x_true,y_true=batch
    #do predictions
    yhat=model.predict(x_true)
    
    y_true=y_true.flatten()
    yhat=yhat.flatten()
    
    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)









In [35]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.861177384853363, Recall: 0.6203438639640808, Accuracy: 0.47943830490112305


In [36]:
model.save('model.h5')

In [38]:
import pickle

In [39]:
pickle.dump(model,open('nlp_model','wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\bidirectional
......vars
...layers\bidirectional\backward_layer
......vars
...layers\bidirectional\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\forward_layer
......vars
...layers\bidirectional\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\layer
......vars
...layers\bidirectional\layer\cell
......vars
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dense_3
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...metrics\mean
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........19
.........2
.........20
.........21
.........22
.........23
.........24
........

In [40]:
loaded_model=pickle.load(open('nlp_model','rb'))

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-04-04 11:08:18         3616
metadata.json                                  2023-04-04 11:08:18           64
variables.h5                                   2023-04-04 11:08:22     77946848
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers\bidirectional
......vars
...layers\bidirectional\backward_layer
......vars
...layers\bidirectional\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\forward_layer
......vars
...layers\bidirectional\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\layer
......vars
...layers\bidirectional\layer\cell
......vars
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dense_3
......vars
.........

In [49]:
loaded_model.predict(np.expand_dims(vectorizer('I hate you'),0))



array([[0.39954525, 0.003991  , 0.07474607, 0.01089181, 0.09199408,
        0.02147427]], dtype=float32)

In [41]:
pickle.dump(model,open('nlp_model.pkl','wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\bidirectional
......vars
...layers\bidirectional\backward_layer
......vars
...layers\bidirectional\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\forward_layer
......vars
...layers\bidirectional\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\layer
......vars
...layers\bidirectional\layer\cell
......vars
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dense_3
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...metrics\mean
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........19
.........2
.........20
.........21
.........22
.........23
.........24
........

In [44]:
vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7bce35a400>

In [49]:
modelVec=Sequential()

In [51]:
modelVec.add(vectorizer)

In [56]:
modelVec.save('vectorizer.pkl')

INFO:tensorflow:Assets written to: vectorizer.pkl\assets


In [62]:
tf.saved_model.save(vectorizer, "D:/NLP Model/NLP Model Final")

INFO:tensorflow:Assets written to: D:/NLP Model/NLP Model Final\assets


In [63]:
pickle.dump(vectorizer,open('vectorizer.pkl','wb'))

InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array.

In [45]:
import json

In [46]:
config = vectorizer.get_config()

In [47]:
with open('vectorizer_config.json', 'w') as f:
    json.dump(config, f)