In [32]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import pandas as pd
import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [33]:
# Read the cleaned corpus into a DataFrame; later cells use its
# 'Content_cleaned_v2' (text) and 'Label' (class) columns.
DATA_CSV_PATH = '../../Data/cleaned_v4.csv'
data = pd.read_csv(DATA_CSV_PATH)

In [34]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from keras_preprocessing.sequence import pad_sequences

# Text column that is tokenized below.
texts = data['Content_cleaned_v2']

# Characters listed in `filters` are stripped before splitting into tokens.
# The filter string contains no '_', so underscores inside tokens are kept.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every id sequence to a fixed length of `maxlen`.
maxlen = 700
X = pad_sequences(sequences, maxlen=maxlen)

# One indicator column per distinct value of 'Label'.
Y = pd.get_dummies(data['Label'])

In [35]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Size of the embedding table: one slot more than the number of known words
# (presumably because id 0 is reserved for padding -- confirm against the Tokenizer docs).
num_words = 1 + len(word_index)

In [36]:
from sklearn.model_selection import train_test_split
# Keep 90% of the samples for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.9,
    random_state=42,
)

# Show the shapes of the four splits as the cell output.
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

((12277, 700), (12277, 9), (1365, 700), (1365, 9))

In [37]:

from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. Also used as the per-head
            `key_dim` of the attention layer and as the output width of the
            feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden Dense layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward network.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`.
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called without an explicit flag.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and rebuilt."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [43]:
from gensim.models import Word2Vec, KeyedVectors
# Load the previously saved Word2Vec model from disk.
W2V_MODEL_PATH = "../../FeatureEngineering/Vectorize/w2vmodel_skip.model"
w2v = Word2Vec.load(W2V_MODEL_PATH)

In [44]:
import numpy as np
# Build the embedding table: row `i` holds the Word2Vec vector of the word whose
# tokenizer id is `i`. Rows of words without a Word2Vec vector stay all-zero.
embedding_dim = 128
word_index = tokenizer.word_index
num_words = len(word_index) + 1

embedding_matrix = np.zeros((num_words, embedding_dim))
print('num_words:', num_words)

notin = []  # words from the tokenizer vocabulary that Word2Vec does not know
for word, i in word_index.items():
    if word not in w2v.wv.key_to_index:
        notin.append(word)
        continue
    embedding_matrix[i] = w2v.wv[word]

num_words: 43216


In [45]:
 # Two separate embedding layers: one for the tokens, one for the token positions (indices).

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a frozen, pre-initialized token embedding and a trainable position embedding.

    Args:
        maxlen: Number of positions the position-embedding table provides.
        vocab_size: Number of rows of the token-embedding table.
        embed_dim: Width of both embedding tables.
        embedding_matrix: Array of shape (vocab_size, embed_dim) used as the
            fixed (non-trainable) token-embedding weights.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to the base class.

    Raises:
        ValueError: If `embedding_matrix` does not have shape (vocab_size, embed_dim).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, embedding_matrix, **kwargs):
        super().__init__(**kwargs)
        # Fail early with a readable message instead of a shape error at build time.
        if tuple(np.shape(embedding_matrix)) != (vocab_size, embed_dim):
            raise ValueError(
                "embedding_matrix must have shape (vocab_size, embed_dim) = "
                f"({vocab_size}, {embed_dim}), got {tuple(np.shape(embedding_matrix))}"
            )
        # Use the documented initializer API rather than the legacy `weights=` /
        # `input_length=` keyword arguments; the resulting table is identical.
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position index."""
        # Positions 0..seq_len-1 are derived from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading (batch) axis.
        return x + positions

In [48]:
# Hyper-parameters of the classifier.
embed_dim = 128   # Embedding size for each token
num_heads = 12    # Number of attention heads
ff_dim = 256      # Hidden layer size in feed forward network inside transformer
# One output unit per one-hot label column, so the head always matches the labels.
num_classes = Y.shape[1]

# Token ids -> token+position embeddings -> one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, num_words, embed_dim, embedding_matrix=embedding_matrix)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average over the sequence axis, then a small MLP with dropout.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 700)]             0         
                                                                 
 token_and_position_embeddin  (None, 700, 128)         5621248   
 g_7 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_7 (Transf  (None, 700, 128)         857600    
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_7   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_30 (Dropout)        (None, 128)               0   

In [49]:
# The labels are one-hot rows and the model ends in a softmax over mutually
# exclusive classes, so the matching loss is categorical cross-entropy.
# A binary loss would score each output unit independently and would make Keras
# resolve "accuracy" to element-wise binary accuracy, which overstates
# multi-class performance.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train for up to 10 epochs; the last 10% of the training data is held out for validation.
history = model.fit(X_train, Y_train, batch_size=16, epochs=10, validation_split=0.1, workers=4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 