In [1]:
import pandas as pd
import numpy as np

### Load SMS Spam Data

In [2]:
df = pd.read_csv('./data/SMSSpamCollection', sep='\t', names=['label', 'data'])
df.head()

Unnamed: 0,label,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Load GloVe Word Embeddings
https://nlp.stanford.edu/projects/glove/
"GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space."

In [3]:
EMBEDDING_DIM = 50
embeddings = {}

with open(f'glove.6b/glove.6B.{EMBEDDING_DIM}d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        w = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embeddings[w] = vectors

In [4]:
# Sample embedding
embeddings['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [5]:
import string
import re

translator = str.maketrans('', '', string.punctuation)

def normalize_word(text):
    # Remove white space, cast to lowercase, 
    # remove punctuation and numbers
    text = text.lower()
    text = text.translate(translator)
    text = text.strip(' ')
    text = re.sub(r'\d+', '', text)
    return text

### Encode Text into Numpy Array

In [6]:
import string
def encode_sentence(sentence, embeddings):
    encoded_sentence = []
    words = map(lambda w: normalize_word(w), sentence.split(' '))
    for word in words:
        if word in embeddings:
            word_embedding = embeddings[word]
            encoded_sentence.append(word_embedding)
        else:
            print(f"WARN: No embedding found for word: {word}")
    return np.array(encoded_sentence)

In [7]:
encode_sentence("hello there", embeddings)

array([[-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
        -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
         0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
         0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
         1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
         0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
         0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
        -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
         0.048833,  0.67204 ],
       [ 0.68491 ,  0.32385 , -0.11592 , -0.35925 ,  0.49889 ,  0.042541,
        -0.40153 , -0.36793 , -0.61441 , -0.41148 , -0.3482  , -0.21952 ,
        -0.22393 , -0.64966 ,  0.85443 ,  0.33582 ,  0.2931  ,  0.16552 ,
        -0.55082 , -0.61277 , -0.14768 ,  0.47551 ,  0.65877 , -0.07103 ,
         0.56147 , -1.2651  , -0.74117 ,  0.36365 ,  0.5623  , -0.27365 ,
       

In [10]:
def encode_X(dfx, embeddings):
    encoded_df = []
    for x in dfx.values:
        sentence_embedding = encode_sentence(x, embeddings)
        encoded_df.append(sentence_embedding)
    return np.array(encoded_df)

In [11]:
X = encode_X(df['data'], embeddings)

WARN: No embedding found for word: 
WARN: No embedding found for word: wkly
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: questionstd
WARN: No embedding found for word: ratetcs
WARN: No embedding found for word: freemsg
WARN: No embedding found for word: 
WARN: No embedding found for word: chgs
WARN: No embedding found for word: minnaminunginte
WARN: No embedding found for word: nurungu
WARN: No embedding found for word: vettam
WARN: No embedding found for word: callertune
WARN: No embedding found for word: 
WARN: No embedding found for word: callertune
WARN: No embedding found for word: receivea
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: pday
WARN: No embedding found for word: 
WARN: N

WARN: No embedding found for word: addressull
WARN: No embedding found for word: alrightokay
WARN: No embedding found for word: wrking
WARN: No embedding found for word: aiyah
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: httpwwwurawinnercom
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: sfirst
WARN: No embedding found for word: timedhoni
WARN: No embedding found for word: teju
WARN: No embedding found for word: hourish
WARN: No embedding found for word: nothis
WARN: No embedding found for word: groundamla
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: speedchat
WARN: No embedding found for word: speedchat
WARN: No embedding found 

WARN: No embedding found for word: claypot
WARN: No embedding found for word: fishhead
WARN: No embedding found for word: beehoon
WARN: No embedding found for word: nydc
WARN: No embedding found for word: messageits
WARN: No embedding found for word: evng
WARN: No embedding found for word: jaklin
WARN: No embedding found for word: idk
WARN: No embedding found for word: linerental
WARN: No embedding found for word: 
WARN: No embedding found for word: ntwk
WARN: No embedding found for word: 
WARN: No embedding found for word: avble
WARN: No embedding found for word: 
WARN: No embedding found for word: calloptoutj
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: pendingi
WARN: No embedding found for word: 
WARN: No embedding found for word: dayswill
WARN: No embedding found for word: thrurespect
WARN: No embedding found for w

WARN: No embedding found for word: nigpun
WARN: No embedding found for word: dismissial
WARN: No embedding found for word: screwd
WARN: No embedding found for word: 
WARN: No embedding found for word: thanx
WARN: No embedding found for word: 
WARN: No embedding found for word: heehee
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: sleepwellamptake
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: chillaxin
WARN: No embedding found for word: iknow
WARN: No embedding found for word: 
WARN: No embedding found for word: wellda
WARN: No embedding found for word: studentfinancial
WARN: No embedding found for word: crisisspk
WARN: No embedding found for word: 
WARN: No embedding found for word: deliveredtomorrow
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found 

WARN: No embedding found for word: quiteamuzing
WARN: No embedding found for word: thatscool
WARN: No embedding found for word: babeprobpop
WARN: No embedding found for word: 
WARN: No embedding found for word: satthen
WARN: No embedding found for word: brekkie
WARN: No embedding found for word: psxtra
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: £call
WARN: No embedding found for word: 
WARN: No embedding found for word: tcsbcmwcnxxcallcostppmmobilesvary
WARN: No embedding found for word: max£
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: wkly
WARN: No embedding found for word: 
WARN: No embedding found for word: 
WARN: No embedding found for word: mnths
WARN: No embedding found for wor

In [14]:
print(X)

[array([[ 1.4828e-01,  1.7761e-01,  4.2346e-01, -3.1489e-01,  3.2273e-01,
        -7.2413e-01, -7.8955e-01,  4.9214e-01, -2.0693e-01, -5.5088e-04,
        -4.7877e-01,  2.8853e-01, -5.7376e-01,  2.7217e-01,  1.1129e+00,
         5.7808e-01,  6.9321e-01, -2.8652e-01, -5.4545e-02, -6.1826e-01,
         1.7227e-01,  2.9263e-01,  3.8184e-01,  6.2186e-01,  5.5461e-01,
        -1.7411e+00, -2.8802e-01, -1.7140e-01,  7.4743e-01, -1.0135e+00,
         3.3596e+00,  1.1370e+00, -1.0028e+00,  1.7685e-01, -6.1795e-03,
        -6.3491e-02,  1.9077e-01,  4.4046e-02,  3.8228e-01, -4.1607e-01,
        -5.0359e-01, -8.3803e-02,  1.7508e-01,  4.0420e-01,  7.7324e-02,
         1.7415e-01,  1.2541e-01, -2.1820e-01,  1.2971e-01,  3.2953e-01],
       [ 2.0025e-01, -3.2821e-01, -4.0859e-01, -7.9438e-01, -1.6211e-02,
        -1.5642e-01, -8.7742e-01,  7.9077e-01, -7.2598e-01, -8.4135e-01,
         3.2721e-01,  1.6083e-01, -3.9978e-01, -1.6564e-01,  9.7777e-01,
         7.5359e-01, -5.8771e-01, -1.8122e-01, -8

In [27]:
y = df['label']
y = np.array(list(map(lambda x: 0 if 'ham' else 1, y)))
print(y)

[0 0 0 ... 0 0 0]


In [29]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
model.fit(x=X, y=y)
# history = model.fit(train_dataset, epochs=10,
#                     validation_data=test_dataset, 
#                     validation_steps=30)

ValueError: Input 0 of layer sequential_1 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 1]