In [1]:
# Reading and fixiung data
import pandas as pd
import numpy as np

messages=pd.read_csv('../input/spamcsv/spam.csv',encoding='latin1' )
messages=messages[['v1','v2']]
messages.columns=['label', 'text']
#target variable encoding
labels=np.where(messages['label']=='spam',1,0)

In [2]:
messages

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Cleaning the data
import re
import nltk
stopwords=nltk.corpus.stopwords.words('english')

In [4]:
messages['clean_text']=messages['text'].apply(lambda x: re.findall('\w+',x.lower()) )


In [5]:
messages

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goes, to, usf, he,..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[will, ì_, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitching, but, i, acted,..."


In [6]:
messages['cleaner_text']=messages['clean_text'].apply(lambda x: [i for i in x if i not in stopwords])



In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(messages['cleaner_text'].apply(lambda x: ' '.join(x)), labels, test_size=0.2)

In [8]:
X_train

3983    hey missed tm last night phone charge smiles m...
2625    free ring tone text polys 87131 every week get...
2523                                     sorry call later
1393                                                oh ok
2315                               significant dont worry
                              ...                        
1146                                            babe lost
3974    u think girl propose u today seing ur bloody f...
2029                               thanx birthday already
4800    guy car shop flirting got phone number paperwo...
4118    hiya u like hlday pics looked horrible took mo...
Name: cleaner_text, Length: 4457, dtype: object

# Compare NLP Models 

## 1. Build model on TF-IDF Vectors

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [11]:
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
vectorizer.vocabulary_

In [13]:
#storage as a sparse matrix. here the size is 7502 but only 1 element stored for 1st row in test set. that means all others are 0
X_test_tfidf[0]

<1x7571 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [14]:
X_test_tfidf[0].toarray() #mostly zeroes. this is less efficient method of storing

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit Random Forest on top of these vectors

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()
rf_tfidf=rf.fit(X_train_tfidf, y_train)


In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rf_tfidf.predict(X_test_tfidf)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       968
           1       0.99      0.84      0.91       147

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## 2. Build model on Word2Vec model

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(messages['cleaner_text'], labels, test_size=0.2)

from gensim.models import Word2Vec
w2v = Word2Vec(sentences=X_train, vector_size=100, window=5)


In [25]:
X_train

185     [hello, handsome, finding, job, lazy, working,...
2696    [nope, going, home, go, pump, petrol, lor, lik...
1780    [big, brother, alert, computer, selected, u, 1...
2072                                   [wondarfull, song]
678                                  [cause, freaky, lol]
                              ...                        
5280                  [vikky, come, around, lt, time, gt]
4363                                           [remember]
2709                              [sorry, de, went, shop]
3449                    [ya, telling, abt, tht, incident]
5203                              [fine, absolutly, fine]
Name: cleaner_text, Length: 4457, dtype: object

In [26]:
w2v.corpus_count

4457

In [28]:
words=set(w2v.wv.index_to_key)

In [35]:
#Replace words with their respective word vector in the sentence
X_train_w2v=[]
for i in X_train:
    mid=[]
    for j in i:
        if j in words:
            mid.append(np.array(w2v.wv[j]))
    X_train_w2v.append(np.array(mid))

In [38]:
#Replace words with their respective word vector in the sentence
X_test_w2v=[]
for i in X_test:
    mid=[]
    for j in i:
        if j in words:
            mid.append(np.array(w2v.wv[j]))
    X_test_w2v.append(np.array(mid))

In [None]:
#len of sentence in test set are not matching with their count of words 
#for which word vectors were created (as there were conditions like min_count while creating it)
# we are gonna fix this if we want to use it in a ML model
for i, j in enumerate(X_test_w2v):
    print(len(X_test.iloc[i]),len(j) )
    

In [41]:
#to fix the above issue, we can take the average of word vectors for the words contained in the sentence and
#then we'll be left with just one word vector representing a sentence
X_test_w2v_avg=[]  #sentence vector

for i in X_test_w2v:
    if (len(i)!=0):
        X_test_w2v_avg.append(i.mean(axis=0))
    else:
        X_test_w2v_avg.append(np.zeros(100))
    

In [42]:
X_train_w2v_avg=[]  #sentence vector

for i in X_train_w2v:
    if (len(i)!=0):
        X_train_w2v_avg.append(i.mean(axis=0))
    else:
        X_train_w2v_avg.append(np.zeros(100))
    

In [43]:
#unaveraged version where we have array of arrays
X_train_w2v[0]

array([[-1.22347131e-01,  2.66526401e-01,  3.71571188e-03,
         1.94218243e-03,  9.34828073e-02, -3.66924345e-01,
         7.48324394e-02,  5.83561361e-01, -1.97200462e-01,
        -1.64139435e-01, -1.65900156e-01, -2.83149660e-01,
        -1.14116654e-01,  1.51255086e-01,  9.59521383e-02,
        -1.27788499e-01,  7.19507262e-02, -1.92199767e-01,
        -3.32515128e-02, -4.48238671e-01,  8.91986117e-02,
         9.57286954e-02,  2.59735901e-02, -6.77553713e-02,
        -2.22508553e-02, -1.95921641e-02, -1.71424806e-01,
        -1.93138883e-01, -2.04100087e-01, -6.10178076e-02,
         2.54934818e-01, -1.29426830e-02,  9.68029127e-02,
        -2.53352344e-01, -1.18303657e-01,  2.75886536e-01,
         4.90510873e-02, -2.17145354e-01, -9.75972787e-02,
        -5.58880091e-01,  7.40302633e-03, -1.77603334e-01,
        -1.26718625e-01,  1.17016863e-02,  1.91906571e-01,
        -1.43502966e-01, -1.77416384e-01,  1.33500043e-02,
         1.89731359e-01,  1.52102724e-01,  2.16776073e-0

In [44]:
#averaged version - single array for a sentence (averaged)
X_train_w2v_avg[0]

array([-0.09111859,  0.20675366,  0.00749649,  0.00349744,  0.07981019,
       -0.29623663,  0.0644623 ,  0.46065006, -0.16239934, -0.13760644,
       -0.134692  , -0.23036654, -0.08917848,  0.11991364,  0.08204506,
       -0.1069146 ,  0.05770572, -0.14605232, -0.02760949, -0.3655604 ,
        0.06934724,  0.07497665,  0.01466099, -0.06119435, -0.01276602,
       -0.02491332, -0.13952595, -0.14898425, -0.164676  , -0.04872329,
        0.19625801, -0.00786726,  0.08202547, -0.19725522, -0.08915152,
        0.22450595,  0.04005738, -0.16612026, -0.08652509, -0.4421545 ,
        0.00522258, -0.14084867, -0.09794976,  0.01570944,  0.1604316 ,
       -0.11051732, -0.15145509,  0.00820561,  0.14938167,  0.12819181,
        0.17671287, -0.16772352, -0.1099661 ,  0.04305852, -0.146527  ,
        0.10517949,  0.10028197, -0.01706503, -0.24452782,  0.08214767,
        0.01024006,  0.09247468, -0.09859493, -0.07183642, -0.34143636,
        0.17888317,  0.09548838,  0.1425312 , -0.25705966,  0.24

### Fit Random Forest on top of these vectors

In [65]:
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()
rf_w2v=rf.fit(X_train_w2v_avg, y_train)


from sklearn.metrics import classification_report
print(classification_report(y_test, rf_w2v.predict(X_test_w2v_avg)))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       964
           1       0.08      0.01      0.01       151

    accuracy                           0.85      1115
   macro avg       0.47      0.50      0.47      1115
weighted avg       0.76      0.85      0.80      1115



## 3. Build model on Doc2Vec model

In [50]:
X_train, X_test, y_train, y_test= train_test_split(messages['cleaner_text'], labels, test_size=0.2)


In [52]:
import gensim
train_tagged_doc=[gensim.models.doc2vec.TaggedDocument(j,[i]) for i,j in enumerate (X_train)]
test_tagged_doc=[gensim.models.doc2vec.TaggedDocument(j,[i]) for i,j in enumerate (X_test)]

In [54]:
train_tagged_doc[0:5]

[TaggedDocument(words=['er', 'yeah', 'b', '15', '26', 'sorry', 'tell', 'pub', 'cafe', 'sit', 'come', 'wen', 'u'], tags=[0]),
 TaggedDocument(words=['greatest', 'test', 'courage', 'earth', 'bear', 'defeat', 'without', 'losing', 'heart', 'gn', 'tc'], tags=[1]),
 TaggedDocument(words=['ok', 'try', 'week', 'end', 'course', 'coimbatore'], tags=[2]),
 TaggedDocument(words=['u', 'drive', 'lor'], tags=[3]),
 TaggedDocument(words=['good', 'evening', 'sir', 'hope', 'nice', 'day', 'wanted', 'bring', 'notice', 'late', 'paying', 'rent', 'past', 'months', 'pay', 'lt', 'gt', 'charge', 'felt', 'would', 'inconsiderate', 'nag', 'something', 'give', 'great', 'cost', 'didnt', 'speak', 'however', 'recession', 'wont', 'able', 'pay', 'charge', 'month', 'hence', 'askin', 'well', 'ahead', 'month', 'end', 'please', 'help', 'thank', 'everything'], tags=[4])]

In [55]:
#training a basic doc2vec model
doc2vec=gensim.models.Doc2Vec(train_tagged_doc, vector_size=100, window=5, min_count=2)

In [62]:
X_train_d2v= [doc2vec.infer_vector(i.words) for i in train_tagged_doc]

In [63]:
X_test_d2v= [doc2vec.infer_vector(i.words) for i in test_tagged_doc]

In [66]:
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()
rf_d2v=rf.fit(X_train_d2v, y_train)


from sklearn.metrics import classification_report
print(classification_report(y_test, rf_d2v.predict(X_test_d2v)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       964
           1       0.97      0.79      0.87       151

    accuracy                           0.97      1115
   macro avg       0.97      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



## 4. Build model on RNN

In [74]:
#train and test split
X_train, X_test, y_train, y_test= train_test_split(messages['cleaner_text'], labels, test_size=0.2)
#importing tensorflow and model building libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Using Tokenizer

Tokenizer: Allows to vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary, based on word count, based on tf-idf...

By default, all punctuation is removed, turning the texts into space-separated sequences of words (words maybe include the ' character). These sequences are then split into lists of tokens. They will then be indexed or vectorized.m

## Using Pad Sequences
This function transforms a list (of length num_samples) of sequences (lists of integers) into a 2D Numpy array of shape (num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided, or the length of the longest sequence in the list.



In [75]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [76]:
tokenizer=Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [None]:
print(tokenizer.index_word)

In [79]:
X_train_seq=tokenizer.texts_to_sequences(X_train)
X_test_seq=tokenizer.texts_to_sequences(X_test)

In [81]:
X_train_seq[1]

[6,
 87,
 632,
 589,
 183,
 436,
 981,
 6,
 87,
 23,
 12,
 489,
 103,
 71,
 1106,
 1293,
 289,
 269,
 1843,
 1844,
 1845,
 1497]

In [82]:
X_train_seq_padded = pad_sequences(X_train_seq,maxlen=50, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq,maxlen=50, padding='post')

In [83]:
X_test_seq_padded[1]

array([   1, 7028,  671,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [84]:
#Building the model
model = keras.Sequential()
# Add an Embedding layer expecting input vocab of size 1000, and
# output embedding dimension of size 32.
model.add(layers.Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=32)) #you can test output_dim

# Add a LSTM layer with 128 internal units.
model.add(layers.LSTM(32, dropout=0, recurrent_dropout=0)) #output of previous layer i.e 32

# Add a Dense layer with 10 units.
model.add(layers.Dense(32, activation='relu'))
#final layer which will tell whether it's a spam or ham
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

2022-10-05 22:51:45.269345: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          245408    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 254,817
Trainable params: 254,817
Non-trainable params: 0
_________________________________________________________________


In [85]:
import tensorflow.keras.backend as K
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

#compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy',recall_m,precision_m]
)


In [86]:
history=model.fit(
    X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), batch_size=32, epochs=10
)

2022-10-05 22:51:59.561988: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Plot the evaluation metrics by each epoch for the model to see if we are over or underfitting
import matplotlib.pyplot as plt

for i in ['accuracy', 'precision_m', 'recall_m']:
    acc = history.history[i]
    val_acc = history.history['val_{}'.format(i)]
    epochs = range(1, len(acc) + 1)

    plt.figure()
    plt.plot(epochs, acc, label='Training Accuracy')
    plt.plot(epochs, val_acc, label='Validation Accuracy')
    plt.title('Results for {}'.format(i))
    plt.legend()
    plt.show()