In [1]:
import spacy
import pandas as pd
import numpy as np

# Load spaCy's 'en_core_web_md' pipeline; it is used below both to tokenize
# the texts and as the source of the pretrained word vectors.
nlp = spacy.load('en_core_web_md')

In [2]:
import tensorflow as tf
# Pass an empty device list so TensorFlow sees no GPUs (CPU-only execution).
tf.config.set_visible_devices([], 'GPU')

In [76]:
from tensorflow.keras.layers import Bidirectional,LSTM,Embedding,StringLookup,Dense
from tensorflow.keras.initializers import Constant
from tensorflow.keras import Model
from tensorflow.keras.activations import softmax
from tensorflow.math import reduce_sum

In [3]:
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
data = pd.read_pickle('../data/kaggle-insults/raw.pickle')
# Keep just the two fields used in this notebook, as DataFrame columns.
df = pd.DataFrame({column: data[column] for column in ('info', 'texts')})

In [4]:
# Work on a tiny subset while prototyping. The seed is fixed so that
# Restart & Run All selects the same four rows (and therefore reproduces
# the same vocabulary and model inputs) on every run.
df = df.sample(n=4, random_state=42)

In [5]:
def custom_preprocess(row):
    """Tokenize a text with spaCy and return its lower-cased token strings.

    Args:
        row: The raw text of a single document (one value of the
            ``texts`` column).

    Returns:
        A list of ``str`` with one lower-cased entry per spaCy token, in
        document order.
    """
    # Only the token texts are needed, so run the tokenizer alone via
    # ``make_doc`` instead of every component of the loaded pipeline.
    return [token.text.lower() for token in nlp.make_doc(row)]


# Tokenize every text, then pack the variable-length token lists into a
# single ragged string tensor.
df['cleaned'] = df['texts'].apply(custom_preprocess)
cleaned_texts = tf.ragged.constant(df['cleaned'])

2022-03-26 00:07:55.985687: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# {word : word vector} for every entry of spaCy's vector table; the table is
# keyed by hash, so each key is resolved back to its string form.
embeddings_index = {
    nlp.vocab.strings[key]: vector
    for key, vector in nlp.vocab.vectors.items()
}

In [7]:
# String -> integer-index layer; `adapt` learns its vocabulary from the
# tokenized texts.
lookup_layer = StringLookup()
lookup_layer.adapt(cleaned_texts)

In [8]:
# Vocabulary learned by the lookup layer, as a list of token strings.
voc = lookup_layer.get_vocabulary()
# Take the embedding width from the loaded spaCy vector table instead of
# hard-coding it, so switching to a model with a different vector size cannot
# cause a shape mismatch when the rows are filled in.
emb_dim = nlp.vocab.vectors.shape[1]

# One row per vocabulary entry, initialised to zeros.
embedding_matrix = np.zeros((len(voc), emb_dim))

In [10]:
# Copy each available pretrained vector into the matrix row of its vocabulary
# index. Words without a vector keep their zero row and are printed.
hits, misses = 0, 0

for idx, token in enumerate(voc):
    vec = embeddings_index.get(token)
    if vec is None:
        misses += 1
        print(token)
        continue
    embedding_matrix[idx] = vec
    hits += 1

print(hits, misses)

[UNK]
\\
custom_number
custom_break
pooface
120 5


In [34]:
# Non-trainable embedding layer initialised from `embedding_matrix`.
pretrained_weights = tf.constant(embedding_matrix, dtype=tf.float32)
embedding_layer = Embedding(
    input_dim=len(voc),
    output_dim=emb_dim,
    embeddings_initializer=Constant(pretrained_weights),
    trainable=False,
)

In [42]:
# Sanity check: map the tokens of one text to indices, then to vectors.
sample_token_ids = lookup_layer(cleaned_texts[3])
embedding_layer(sample_token_ids)

<tf.Tensor: shape=(10, 300), dtype=float32, numpy=
array([[-0.11076 ,  0.30786 , -0.5198  , ..., -0.059105,  0.47604 ,
         0.05661 ],
       [-0.19859 , -0.062818, -0.36614 , ..., -0.58451 ,  0.27879 ,
        -0.26205 ],
       [ 0.27204 , -0.06203 , -0.1884  , ...,  0.13015 , -0.18317 ,
         0.1323  ],
       ...,
       [ 0.12274 , -0.29241 ,  0.32318 , ..., -0.81275 ,  0.28465 ,
        -0.053287],
       [ 0.012001,  0.20751 , -0.12578 , ...,  0.13871 , -0.36049 ,
        -0.035   ],
       [-0.49594 ,  0.26918 , -0.18897 , ...,  0.027965,  0.029533,
         0.031204]], dtype=float32)>

In [77]:
class DeepmojiNet(Model):
    """Text classifier: stacked bidirectional LSTMs with attention pooling.

    Pipeline: string tokens -> integer ids -> embeddings -> two stacked
    bidirectional LSTMs -> attention-weighted sum over the token axis ->
    two dense layers producing ``out_dim`` raw scores.

    Args:
        lookup_layer: Layer mapping string tokens to integer ids.
        embedding_layer: Layer mapping integer ids to embedding vectors.
        out_dim: Number of units of the final dense layer.
        lstm_units: Units of each LSTM wrapped by ``Bidirectional``.
            Defaults to 512, the previously hard-coded value.
        dense_units: Units of the hidden dense layer. Defaults to 128, the
            previously hard-coded value.
    """

    def __init__(self, lookup_layer, embedding_layer, out_dim,
                 lstm_units=512, dense_units=128):
        super().__init__()
        self.lookup_layer = lookup_layer
        self.embedding_layer = embedding_layer
        # return_sequences=True keeps one output per token, which both the
        # second LSTM and the attention pooling need.
        self.lstm1 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))
        self.lstm2 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))
        # Produces one scalar attention score per token.
        # NOTE(review): the ReLU clamps negative scores to 0 before the
        # softmax -- confirm this is intended.
        self.dense1 = Dense(units=1, activation='relu')
        self.dense2 = Dense(units=dense_units, activation='relu')
        # No activation: the model returns unnormalized scores.
        self.dense3 = Dense(units=out_dim, activation=None)

    def call(self, x):
        """Run the forward pass.

        Args:
            x: String tensor of tokens, assumed to be shaped
                (batch, tokens); axis 1 is treated as the token axis by the
                attention softmax and the pooling sum.

        Returns:
            The output of the final dense layer, one row of ``out_dim``
            scores per batch element.
        """
        token_ids = self.lookup_layer(x)
        embs = self.embedding_layer(token_ids)
        # TODO(review): the original note asks whether a tanh should be
        # applied to the embeddings at this point.

        hiddens1 = self.lstm1(embs)
        hiddens2 = self.lstm2(hiddens1)

        # Per-token representation: embeddings plus the outputs of both LSTM
        # layers, joined along the feature axis.
        word_repr = tf.concat([embs, hiddens1, hiddens2], axis=-1)

        # Attention pooling: normalise the per-token scores over the token
        # axis and use them to weight the sum of token representations.
        scores = self.dense1(word_repr)
        att_weights = softmax(scores, axis=1)
        sent_repr = reduce_sum(word_repr * att_weights, axis=1)

        hidden = self.dense2(sent_repr)
        return self.dense3(hidden)

In [78]:
# Build a two-output model and wrap one tokenized text in a batch of size one.
model = DeepmojiNet(lookup_layer, embedding_layer, out_dim=2)
temp_x = tf.expand_dims(cleaned_texts[3], axis=0)

In [81]:
# Forward pass on the single-example batch.
output = model(temp_x)

In [82]:
# Shape check: one row for the single example, one column per output unit.
output.shape

TensorShape([1, 2])