# Keras word embedding

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

# Toy corpus: ten short restaurant reviews used for the embedding demo.
reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]

# One label per review, in the same order: five 1s followed by five 0s.
sentiment = np.array([1] * 5 + [0] * 5)

In [None]:
# Encode each word of the text as an integer, passing 30 as the size argument.
# In the recorded output the repeated word "amazing" receives the same index both times.
# NOTE(review): the exact indices presumably depend on the hash function one_hot uses — confirm they are stable across sessions.
one_hot("amazing restaurant amazing",30)

[8, 19, 8]

In [None]:
# Same encoding call with a larger size argument (300); the recorded indices
# differ from those produced with size 30.
one_hot("amazing restaurant",300)

[277, 156]

In [None]:
# Size argument for one_hot; the same value is reused as the Embedding layer's
# input dimension when the model is built.
vocab_size = 30

# Encode every review as a list of integer word indices.
# hash_function='md5' is passed explicitly because the default is Python's
# built-in `hash`, which is not stable between interpreter sessions: with the
# default, the indices (and every weight trained on them) change after a
# kernel restart, so the notebook would not reproduce its own outputs.
# NOTE: with a size this small, distinct words can collide on the same index
# (the recorded run encoded 'nice food' as [12, 12]).
encoded_reviews = [
    one_hot(review, vocab_size, hash_function='md5') for review in reviews
]
encoded_reviews

[[12, 12], [8, 19], [28, 7], [21, 26, 10], [25, 6, 22], [28, 12], [17, 6, 20], [13, 29], [13, 13], [11, 5]]


In [None]:
# Bring every encoded review to a fixed length of 4 by appending zeros at the
# end ('post' padding).
max_length = 4
padded_reviews = pad_sequences(encoded_reviews, padding='post', maxlen=max_length)
print(padded_reviews)

[[12 12  0  0]
 [ 8 19  0  0]
 [28  7  0  0]
 [21 26 10  0]
 [25  6 22  0]
 [28 12  0  0]
 [17  6 20  0]
 [13 29  0  0]
 [13 13  0  0]
 [11  5  0  0]]


In [None]:
# Number of values in the vector learned for each word index.
embeded_vector_size = 5

# Embedding -> Flatten -> single sigmoid unit, declared as one layer list.
model = Sequential([
    Embedding(vocab_size, embeded_vector_size,
              input_length=max_length, name="embedding"),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Features are the padded index sequences; targets are the sentiment labels.
X, y = padded_reviews, sentiment

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 5)              150       
                                                                 
 flatten (Flatten)           (None, 20)                0         
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for 50 epochs with per-epoch logging silenced (verbose=0).
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x7ffaeb145690>

In [None]:
# Evaluate the model on X and y. These are the same arrays that were passed to
# fit(), so the accuracy shown here is measured on the training data.
loss, accuracy = model.evaluate(X, y)
accuracy



1.0

In [None]:
# Take the first weight array of the layer named 'embedding' and show how many
# rows it has.
weights = model.get_layer(name='embedding').get_weights()[0]
weights.shape[0]

30

In [None]:
# Display the learned embedding matrix (the recorded output shows 5 values per row).
weights

array([[ 0.05870454, -0.00191321,  0.00729362,  0.07374565,  0.02376828],
       [ 0.03285155,  0.02628367,  0.04371591,  0.00559996, -0.01653464],
       [ 0.03908497, -0.04701314,  0.014692  ,  0.02376301,  0.00262793],
       [ 0.02987387, -0.0442349 ,  0.02711478, -0.03544682, -0.01024095],
       [-0.00118687, -0.01629859, -0.04506552,  0.0413137 ,  0.03240464],
       [ 0.01155346,  0.0334807 , -0.02579642, -0.02105372,  0.03738065],
       [ 0.04367358, -0.00353683,  0.03077312,  0.02827638, -0.03605047],
       [-0.07503589, -0.08900342,  0.09316859,  0.00881381, -0.05569955],
       [ 0.05216512,  0.09184054,  0.02476582,  0.03609883, -0.04740902],
       [-0.0190045 ,  0.00851209,  0.04740706,  0.01810377,  0.00554786],
       [-0.05010461, -0.00273943, -0.09008278, -0.02555314, -0.07532882],
       [-0.03128028, -0.06922905, -0.09629741, -0.06495026,  0.04774296],
       [ 0.03742787,  0.05731171,  0.03935966,  0.02454265, -0.07309615],
       [-0.03321441, -0.05754425, -0.0

In [None]:
# Embedding vector stored at index 8 — the index that "amazing" was assigned by
# the size-30 encoding in the recorded run.
weights[8]

array([ 0.05216512,  0.09184054,  0.02476582,  0.03609883, -0.04740902],
      dtype=float32)

# Drop every column whose name contains 'Unnamed: 2', 'Unnamed: 3' or
# 'Unnamed: 4' (case-insensitive), using a single boolean column mask.
unnamed_mask = data.columns.str.contains('Unnamed: [234]', case=False)
data = data.loc[:, ~unnamed_mask]

# Rename the remaining columns; rename() returns a new frame by default.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})

print('File has {} rows and {} columns'.format(*data.shape))

In [None]:
# References:
#https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/47_BERT_text_classification/BERT_email_classification-handle-imbalance.ipynb
#https://www.section.io/engineering-education/classification-model-using-bert-and-tensorflow/

In [None]:
! pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 2.5 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 28.7 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.1 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# Work on a copy: a plain `df = data` only aliases the same object, so columns
# assigned on `df` afterwards would silently appear on `data` as well.
df = data.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,label,text,gt
2184,ham,think rule tamilnadu tough people,0
3594,ham,amazing rearrange letters gives meaning dormit...,0
1460,ham,referin mei wat waitin treat somebody shld ric...,0
5497,ham,think sent text home phone cant display texts ...,0
4379,ham,package programs well,0


In [None]:
# Binary target column: 1 where the label is exactly 'spam', otherwise 0.
df['spam'] = df['label'].eq('spam').astype('int64')
df.sample(5)

Unnamed: 0,label,text,gt,spam
2217,ham,wot wed nite til,0,0
4751,ham,popped loo hello hello,0,0
2596,ham,nice ready thursday,0,0
3900,ham,got call landline number asked come anna nagar...,0,0
2949,ham,maybe westshore hyde park village place near h...,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Split texts and targets into train and test parts.
# stratify keeps the spam/ham proportion the same in both parts; random_state
# fixes the shuffle so the split (and all metrics computed from it) is
# identical on every run instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['spam'], stratify=df['spam'], random_state=42
)

In [None]:
# Peek at the first four training texts.
X_train.head(4)

308                             ask abt movie wan ktv oso
4928    sorry joined league people dont keep touch mea...
3578                                            cool text
1036               gentle princess make sweet gentle love
Name: text, dtype: object

In [None]:
# TF Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both hub models as Keras layers so they can be called like functions.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
def get_sentence_embeding(sentences):
    """Return the encoder's ``'pooled_output'`` for ``sentences``.

    The input is passed through the module-level ``bert_preprocess`` layer,
    the result is fed to the module-level ``bert_encoder`` layer, and the
    ``'pooled_output'`` entry of the encoder's result is returned.

    Args:
        sentences: Batch of texts to embed (this notebook passes lists of
            Python strings).

    Returns:
        Whatever ``bert_encoder`` stores under ``'pooled_output'``; the
        recorded two-sentence call produced a float32 tensor of shape (2, 768).
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Smoke test on two sample sentences; the recorded result has shape (2, 768),
# i.e. one 768-value vector per sentence.
get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435169 , -0.5132726 , -0.8884571 , ..., -0.7474883 ,
        -0.75314724,  0.91964495],
       [-0.8720837 , -0.50543964, -0.94446695, ..., -0.858475  ,
        -0.7174535 ,  0.88082975]], dtype=float32)>

In [None]:
# Embed three fruit names (rows 0-2) and three people's names (rows 3-5) so
# that individual rows can be compared with cosine similarity.
e = get_sentence_embeding([
    "banana", 
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
]
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare "banana" (row 0) with "grapes" (row 1); each row is wrapped in a
# list so it is passed as a one-row 2-D input.
cosine_similarity([e[0]],[e[1]])

array([[0.9911088]], dtype=float32)

In [None]:
# Compare "banana" (row 0) with "jeff bezos" (row 3).
cosine_similarity([e[0]],[e[3]])

array([[0.84703815]], dtype=float32)

In [None]:
# Compare "jeff bezos" (row 3) with "elon musk" (row 4).
cosine_similarity([e[3]],[e[4]])

array([[0.9872035]], dtype=float32)

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
# Report binary accuracy together with precision and recall during training.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [None]:
# Train the classifier on the training split for 10 epochs.
# NOTE(review): the recorded log stops part-way through epoch 3 with an ETA of
# roughly 25 minutes, so this run looks like it was never completed — confirm.
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 20/131 [===>..........................] - ETA: 24:54 - loss: 0.2521 - accuracy: 0.9094 - precision: 0.8043 - recall: 0.4302