# Install Dependencies and Bring in Data

In [2]:
!pip install tensorflow pandas matplotlib scikit-learn





In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

2024-06-13 11:52:18.603176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-13 11:52:19.565030: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-13 11:52:19.565097: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-06-13 11:52:19.673090: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-13 11:52:22.001579: W tensorflow/stream_executor/platform/de

In [5]:
# Data is downloaded from kaggle - kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
# Build the relative path first, then load the training CSV into a DataFrame.
train_csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(train_csv_path)

In [6]:
# Preview the first rows of the loaded frame to sanity-check its columns.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Processing the data

In [27]:
from tensorflow.keras.layers import TextVectorization
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import string

In [29]:
# cleaning the text before vectorising
def clean(text):
    for puncuation in string.punctuation:
        text = text.replace(puncuation, ' ')
    
    lowercase = text.lower()
    tokenised = word_tokenize(lowercase)
    word_only = [word for word in tokenised if word.isalpha()]
    
    stop_words = list(stopwords.words('english'))
        
    without_stopword = [word for word in word_only if word not in stop_words]
    
    verb_lemmatized = [ WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in without_stopword ]
    
    noun_lemmatised = [ WordNetLemmatizer().lemmatize(word, pos='n')
                      for word in verb_lemmatized ]
    
    return " ".join(noun_lemmatised)

In [30]:
# Run the cleaning function over every comment, element by element.
X = df['comment_text'].map(clean)

In [34]:
# Every column from position 2 onward is used as a target; take them as one array.
label_columns = df.columns[2:]
y = df[label_columns].values

In [35]:
# Vocabulary size used for the text vectorizer and the embedding layer.
max_words = 100_000

In [37]:
# Integer-encoding vectorizer: vocabulary limited to max_words tokens,
# output sequence length fixed at 1000.
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=max_words,
    output_sequence_length=1000,
)

2024-06-13 14:12:20.879413: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-06-13 14:12:20.894599: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2024-06-13 14:12:20.894660: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-T6TOAJH): /proc/driver/nvidia/version does not exist
2024-06-13 14:12:20.896830: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [38]:
# Fit the vectorizer's vocabulary on the cleaned comments.
training_corpus = X.values
vectorizer.adapt(training_corpus)

In [52]:
# Encode every cleaned comment with the adapted vectorizer.
cleaned_comments = X.values
vectorized_text = vectorizer(cleaned_comments)

2024-06-13 14:19:12.833051: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1276568000 exceeds 10% of free system memory.


In [55]:
# MCSHBAP - map, cache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Keep ONE fixed shuffle order. By default shuffle() reshuffles on every
# iteration, so any take()/skip() partitions carved out of this dataset would
# each see a different permutation and overlap (train/val/test leakage).
# Trade-off: the order no longer changes between epochs; shuffle the training
# partition separately if per-epoch reshuffling is wanted.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [56]:
# Pull a single (features, labels) batch as NumPy arrays for inspection.
first_batch = next(dataset.as_numpy_iterator())
batch_x, batch_y = first_batch

2024-06-13 14:28:23.763994: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1276568000 exceeds 10% of free system memory.


In [71]:
# 70% / 20% / remainder split, counted in batches.
# The counts are computed once and the test offset is derived from them, so the
# three partitions are contiguous: int(n*.7) + int(n*.2) can be smaller than
# int(n*.9), which previously dropped a batch between val and test.
# NOTE(review): the partitions are only disjoint if `dataset` yields the same
# order on every iteration -- confirm how it is shuffled.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
# Test takes everything left over, replacing the ad-hoc "+2" on the take count.
test = dataset.skip(n_train + n_val)

# Create Sequential Model

In [78]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [79]:
# Assemble the network as a single ordered list of layers.
model = Sequential([
    # Create the embedding layer
    Embedding(max_words+1, 32),
    # Bidirectional LSTM Layer
    Bidirectional(LSTM(32, activation='tanh')),
    # Feature extractor Fully connected layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer: six sigmoid outputs
    Dense(6, activation='sigmoid'),
])

In [80]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [81]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          3200032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [84]:
# Train for one epoch on the training split, evaluating on the validation split.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=1,
    verbose=1,
)



2024-06-13 15:51:31.925052: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1276568000 exceeds 10% of free system memory.




# Make predictions

In [120]:
# Clean and vectorize one hand-written sample comment, the same way as the training text.
sample_comment = 'You freaking idiot! I am going to kill you fucking moron.'
input_text = vectorizer(clean(sample_comment))

In [121]:
# Add a leading axis of size 1 before passing the single sample to predict().
batched_input = np.expand_dims(input_text, axis=0)
res = model.predict(batched_input)



In [122]:
# Display the array returned by model.predict for the sample comment.
res

array([[0.9999438 , 0.4886239 , 0.99360424, 0.0170372 , 0.9486099 ,
        0.1633346 ]], dtype=float32)

In [95]:
# Column names from position 2 onward -- the same slice used to build `y`;
# presumably shown here to label the six values in `res`.
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')