In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Embedding
from tensorflow.keras.models import Model

In [None]:
# Load the dataset and shuffle all rows (frac=1 keeps every row).
# random_state pins the row order so this cell produces the same shuffled
# frame on every Restart & Run All instead of a different one each time.
df = pd.read_csv('data.csv').sample(frac=1, random_state=42)

In [None]:
# Model input is the `tweet` column; the target is the `class` column as a NumPy array.
X = df.tweet
Y = df["class"].values

# Hold out 20% of the rows for evaluation.
# random_state makes the split repeatable: everything computed from the
# training part afterwards (vocabulary, padded length, metrics) otherwise
# changes on every run.
# Note: despite the `df_` prefix, df_train / df_test are the train / test
# parts of the single `tweet` column, not whole DataFrames.
df_train, df_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [None]:
# Upper bound handed to the tokenizer as `num_words`.
MAX_VOCAB_SIZE = 20000

# The vocabulary is learned from the training texts only; the test texts
# are merely encoded with it.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train)

# Turn both text collections into lists of integer token ids.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (df_train, df_test)
)

In [None]:
# V: number of entries in the tokenizer's word -> index mapping.
# The model cells below size their Embedding layers as V + 1.
V = len(tokenizer.word_index)
print(V)

10822


In [None]:
# Pad the training sequences into one rectangular array. No `maxlen` is
# passed, so pad_sequences picks the width from the data itself
# (presumably the longest training sequence -- see the Keras docs).
data_train = pad_sequences(sequences_train)
print(data_train.shape)

(2560, 75)


In [None]:
# T: padded sequence length (second dimension of the training array).
# It is reused as the test padding length and as the models' input length.
T = data_train.shape[1]

In [None]:
# Pad the test sequences to the same length T as the training data so both
# arrays have the same number of columns.
data_test = pad_sequences(sequences_test,maxlen=T)

In [None]:
# Sanity check: display the shape of the padded test array.
data_test.shape

(640, 75)

In [None]:
from tensorflow.keras import initializers

# Hyperparameters of the LSTM classifier.
D = 10  # width of each token embedding vector
M = 5   # number of LSTM units (size of the hidden state)

# Functional-API graph:
# token ids -> embeddings -> LSTM (one output per time step)
#           -> max over the time axis -> single sigmoid unit.
i = Input(shape=(T,))
x = Embedding(input_dim=V + 1, output_dim=D)(i)
x = LSTM(
    units=M,
    return_sequences=True,
    kernel_initializer=initializers.RandomNormal(stddev=0.01),
)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=1, activation="sigmoid")(x)

model = Model(inputs=i, outputs=x)

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs; the held-out test split is used as validation data,
# so the validation metrics are computed on the test set.
model.fit(
    x=data_train,
    y=y_train,
    validation_data=(data_test, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2cc7f80510>

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalization.

    Args:
        embed_dim: Used as the `key_dim` of the attention layer and as the
            output width of the feed-forward network. It has to match the
            last axis of the input, because the feed-forward output is added
            back onto the (normalized) input.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden ReLU layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded
            unchanged to `layers.Layer`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward base-layer options instead of silently rejecting them.
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: Tensor passed to the attention layer as both query and
                value (self-attention).
            training: Forwarded to the two dropout layers. Defaults to None
                so the layer can be called without an explicit flag, in
                which case the dropout layers decide for themselves.

        Returns:
            The layer-normalized sum of the first sub-layer's output and the
            feed-forward output.
        """
        # Sub-layer 1: self-attention -> dropout -> residual -> layer norm.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Sub-layer 2: feed-forward -> dropout -> residual -> layer norm.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
embed_dim = 10  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 15  # Hidden layer size in feed forward network inside transformer

# Token ids -> embeddings -> transformer block -> mean over the time axis
# -> small dense head -> single sigmoid unit.
inputs = layers.Input(shape=(T,))
# The embedding width must be `embed_dim` (not the LSTM cell's `D`): the
# transformer block adds its feed-forward output, which is `embed_dim`
# wide, back onto these embeddings.
x = layers.Embedding(V + 1, embed_dim)(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(
    20,
    activation="elu",
    kernel_initializer=keras.initializers.RandomNormal(stddev=0.01),
)(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# Note: this rebinds `model`, replacing the LSTM model built earlier.
model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Same training setup as for the LSTM model: Adam, binary cross-entropy,
# accuracy metric, 10 epochs, with the test split used as validation data.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    x=data_train,
    y=y_train,
    validation_data=(data_test, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import joblib

In [None]:
# Persist the trained model and the fitted tokenizer to the working directory.
# NOTE(review): joblib pickles the Keras model here; Keras models are more
# commonly saved with `model.save(...)` -- confirm that whatever loads
# 'blah' really expects a joblib pickle before changing this.
joblib.dump(value=model, filename='blah')
joblib.dump(value=tokenizer, filename='tok')



INFO:tensorflow:Assets written to: ram://363b37d5-d9ff-4a73-be11-156c072a61fb/assets


INFO:tensorflow:Assets written to: ram://363b37d5-d9ff-4a73-be11-156c072a61fb/assets


['tok']

In [None]:
# Print the current working directory, i.e. where the relative paths
# 'blah' and 'tok' passed to joblib.dump resolve.
!pwd

/content


In [None]:
# Encode one sample sentence exactly like the training data:
# text -> token ids -> padded array.
sample_texts = ['this is hate speech']
sample_sequences = tokenizer.texts_to_sequences(sample_texts)

# Pad to T, the length the model was built with, instead of a hard-coded 75.
# T is derived from the training split, so a literal only matches by
# coincidence and breaks prediction as soon as T changes.
# The result keeps the name `l` because the next cell predicts on it.
l = pad_sequences(sample_sequences, maxlen=T)

In [None]:
# Score the sample sentence with the most recently built `model` (the
# transformer classifier). The output is the sigmoid activation of the
# final Dense layer, i.e. a value between 0 and 1 per input row.
model.predict(l)

array([[0.99450743]], dtype=float32)