In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import layers, Input, Model, models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.random import set_seed

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn import svm
from sklearn.metrics import *
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


1. Data import

In [2]:
data = pd.read_csv('data/spam.csv')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
data = data.iloc[:,:2]

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [16]:
data['v1'].value_counts()/data.shape[0]

ham     0.865937
spam    0.134063
Name: v1, dtype: float64

In [17]:
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

How we can see our data set is unbalanced, 86.5% to 13.5% percent, in favor of not spam.

2. Data cleaning and preprocessing

In [3]:
# let transform our labels to binary values
Y = LabelEncoder().fit_transform(data['v1']).reshape(-1,1)
Y.shape

(5572, 1)

In [4]:
X = data['v2']
X.shape

(5572,)

In [5]:
# Let check how many words we have in our dataset

# tokenize the document
words = []

for i in range(0, X.shape[0]):
    result = text_to_word_sequence(X[i])
    words.extend(result)
    
len(set(words))

8916

In [156]:
# Let see top 10 words, whether there are some useless words or single characters
pd.DataFrame(words).value_counts().head(10)

i      2351
to     2242
you    2150
a      1433
the    1328
u      1172
and     979
in      898
is      889
me      802
dtype: int64

How we can see, many of our popular words are stopwords, let remove them.

In [7]:
# Our stopwords which we'll remove are presented below
#nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df = pd.DataFrame(X)
stop_words = set(stopwords.words('english'))


df['clean'] = df['v2'].apply(lambda x: ' '.join([word for word in x.split() if not word.lower() in (stop_words)]))
print(df)

                                                     v2  \
0     Go until jurong point, crazy.. Available only ...   
1                         Ok lar... Joking wif u oni...   
2     Free entry in 2 a wkly comp to win FA Cup fina...   
3     U dun say so early hor... U c already then say...   
4     Nah I don't think he goes to usf, he lives aro...   
...                                                 ...   
5567  This is the 2nd time we have tried 2 contact u...   
5568              Will �_ b going to esplanade fr home?   
5569  Pity, * was in mood for that. So...any other s...   
5570  The guy did some bitching but I acted like i'd...   
5571                         Rofl. Its true to its name   

                                                  clean  
0     Go jurong point, crazy.. Available bugis n gre...  
1                         Ok lar... Joking wif u oni...  
2     Free entry 2 wkly comp win FA Cup final tkts 2...  
3             U dun say early hor... U c already say...  
4

In [None]:
# # let remove stopwords
# stop_words = set(stopwords.words('english'))

# # tokenize our text
# X_clean = []

# for i in range(X.shape[0]):
#     #tokenize each sentences
#     word_tokens = text_to_word_sequence(X[i])
#     filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words] # filter stop words from orignal sentences
#     X_clean.append(filtered_sentence)

In [10]:
# Let check again if there are still any useless words not included in stopwords

# tokenize the document
words = []
df = df['clean']

for i in range(0, df.shape[0]):
    result = text_to_word_sequence(df[i])
    words.extend(result)
    
len(set(words))

    
pd.DataFrame(words).value_counts().head(20)
#there are still some stopwords

u       1172
call     591
2        515
i'm      394
get      391
ur       385
gt       318
lt       316
4        316
ok       287
free     284
go       280
know     261
now      255
good     245
like     245
got      239
you      233
come     229
time     220
dtype: int64

In [11]:
# let remove another stopwords, not defined before

more_stopwords = set(['u','4','2',"i'm","i'll",'r','ur','n'])
updated_step_words = stop_words | more_stopwords

df1 = pd.DataFrame(df)

df1['clean_1'] = df1['clean'].apply(lambda x: ' '.join([word for word in x.split() if not word.lower() in (updated_step_words)]))
print(df1)

                                                  clean  \
0     Go jurong point, crazy.. Available bugis n gre...   
1                         Ok lar... Joking wif u oni...   
2     Free entry 2 wkly comp win FA Cup final tkts 2...   
3             U dun say early hor... U c already say...   
4               Nah think goes usf, lives around though   
...                                                 ...   
5567  2nd time tried 2 contact u. U �750 Pound prize...   
5568                      �_ b going esplanade fr home?   
5569           Pity, * mood that. So...any suggestions?   
5570  guy bitching acted like i'd interested buying ...   
5571                                    Rofl. true name   

                                                clean_1  
0     Go jurong point, crazy.. Available bugis great...  
1                           Ok lar... Joking wif oni...  
2     Free entry wkly comp win FA Cup final tkts 21s...  
3                 dun say early hor... c already say...  
4

In [14]:
df1['clean_1']

0       Go jurong point, crazy.. Available bugis great...
1                             Ok lar... Joking wif oni...
2       Free entry wkly comp win FA Cup final tkts 21s...
3                   dun say early hor... c already say...
4                 Nah think goes usf, lives around though
                              ...                        
5567    2nd time tried contact u. �750 Pound prize. cl...
5568                        �_ b going esplanade fr home?
5569             Pity, * mood that. So...any suggestions?
5570    guy bitching acted like i'd interested buying ...
5571                                      Rofl. true name
Name: clean_1, Length: 5572, dtype: object

In [12]:
# Let split our dataset on train and test

X_train, X_test,Y_train, Y_test = train_test_split(df1['clean_1'],Y, test_size= 0.2)

3. Text tokenize

In [None]:
x_list musi byc lista slow z kazdego wiersza

In [22]:
# let check the longest sms in our dataset
maxims = []
for i,j in enumerate(X_list):
    maxims.append(len(X_list[i]))
    
np.max(maxims)

548

In [25]:
#let create tokenizer
max_words = 800
max_len = 76

tok = Tokenizer(max_words,lower= True)
tok.fit_on_texts(X_train) #token trainging on X_train

sequences = tok.texts_to_sequences(X_train) #conversion from text to vectors
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len) #matrix with padding has been created

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [26]:
#let check records where after tokenizing we have only zeros
test_0 = pd.DataFrame({"X" : test_sequences_matrix.argmax(axis=1)})
test_0_filtred = test_0[test_0["X"]<1]

test_to_remove = test_0_filtred.index

train_0 = pd.DataFrame({"X" : sequences_matrix.argmax(axis=1)})
train_0_filtred = train_0[train_0["X"]<1]

train_to_remove = train_0_filtred.index


In [27]:
# let remove those zeros tokens from our dataset 

#X_train 
sequences_matrix = np.delete(sequences_matrix, train_to_remove,axis = 0 )

#X_test
test_sequences_matrix =  np.delete(test_sequences_matrix, test_to_remove,axis = 0 )

#Y_train
Y_train = np.delete(Y_train, train_to_remove, axis = 0)

#Y_test
Y_test = np.delete(Y_test, test_to_remove, axis = 0)


4. First models 

In [582]:
# to make reproduced results.
np.random.seed(500)

In [None]:
#random forrest, knn, svm, rnn

a) SVM

In [584]:
#simple model as benchmark

model1_svm = make_pipeline(MinMaxScaler(),svm.SVC())
model1_svm.fit(sequences_matrix, Y_train.flatten())

y_pred1_svm = model1_svm.predict(test_sequences_matrix)

# evaluation
print(classification_report(Y_test.flatten(),y_pred1_svm))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95       948
           1       0.75      0.37      0.50       139

    accuracy                           0.90      1087
   macro avg       0.83      0.68      0.72      1087
weighted avg       0.89      0.90      0.89      1087



As we can see, model predict not-spam class very well, but reach only 50% f1_score for spam class. 
We are going to improve this model a little bit, by changing default parameters.

In [585]:
# another simple model

model2_svm = make_pipeline(MinMaxScaler(),svm.SVC(kernel = 'poly'))
model2_svm.fit(sequences_matrix, Y_train.flatten())

y_pred2_svm = model2_svm.predict(test_sequences_matrix)


# evaluation
print(classification_report(Y_test.flatten(),y_pred2_svm))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95       948
           1       0.77      0.36      0.49       139

    accuracy                           0.90      1087
   macro avg       0.84      0.67      0.72      1087
weighted avg       0.89      0.90      0.89      1087



In [587]:
# another simple model

model3_svm = make_pipeline(MinMaxScaler(),svm.SVC(kernel = 'linear'))
model3_svm.fit(sequences_matrix, Y_train.flatten())

y_pred3_svm = model3_svm.predict(test_sequences_matrix)


# evaluation
print(classification_report(Y_test.flatten(),y_pred3_svm))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       948
           1       0.62      0.32      0.42       139

    accuracy                           0.89      1087
   macro avg       0.76      0.65      0.68      1087
weighted avg       0.87      0.89      0.87      1087



Three basic models with default parameters, only with different kernel don't reach sufficient results.

b) KNN

In [603]:
#simple model as benchmark

model1_knn = make_pipeline(MinMaxScaler(),KNeighborsClassifier())
model1_knn.fit(sequences_matrix, Y_train.flatten())

y_pred1_knn = model1_knn.predict(test_sequences_matrix)

# evaluation
print(classification_report(Y_test.flatten(),y_pred1_knn))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       948
           1       0.60      0.38      0.47       139

    accuracy                           0.89      1087
   macro avg       0.76      0.67      0.70      1087
weighted avg       0.87      0.89      0.88      1087



In [607]:
#simple model as benchmark

model2_knn = make_pipeline(MinMaxScaler(),KNeighborsClassifier(weights = 'distance'))
model2_knn.fit(sequences_matrix, Y_train.flatten())

y_pred2_knn = model2_knn.predict(test_sequences_matrix)

# evaluation
print(classification_report(Y_test.flatten(),y_pred2_knn))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       948
           1       0.74      0.53      0.62       139

    accuracy                           0.92      1087
   macro avg       0.84      0.75      0.79      1087
weighted avg       0.91      0.92      0.91      1087



In [608]:
#simple model as benchmark

model2_knn = make_pipeline(StandardScaler(),KNeighborsClassifier(weights = 'distance'))
model2_knn.fit(sequences_matrix, Y_train.flatten())

y_pred2_knn = model2_knn.predict(test_sequences_matrix)

# evaluation
print(classification_report(Y_test.flatten(),y_pred2_knn))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       948
           1       0.75      0.57      0.64       139

    accuracy                           0.92      1087
   macro avg       0.84      0.77      0.80      1087
weighted avg       0.91      0.92      0.92      1087



How we can see first, basic KNN model didn't improve our previous results, but with only one change in weight parameters, we beat out previous best result. 
Finally with StandardScaler instead of MinMaxScaller our results was a little bit better.  
Let try with more another option.

c) RandomForrest

In [618]:
#simple model as benchmark

model1_rf = RandomForestClassifier(random_state=1)
model1_rf.fit(sequences_matrix, Y_train.flatten())

y_pred1_rf = model1_rf.predict(test_sequences_matrix)

# evaluation
print(classification_report(Y_test.flatten(),y_pred1_rf))

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       948
           1       0.89      0.59      0.71       139

    accuracy                           0.94      1087
   macro avg       0.92      0.79      0.84      1087
weighted avg       0.94      0.94      0.93      1087



After first try, in default model we reached the highest f1 score on this dataset.
Let check more sofisticated model form keras library.

d) simple neural network

In [28]:
# first we have to prepare Y_train and Y_test to correct shape
Y_train_encoded = OneHotEncoder(sparse= False).fit_transform(Y_train)
Y_test_encoded = OneHotEncoder(sparse= False).fit_transform(Y_test)

In [29]:
#set result repeatable

set_seed(2)

In [650]:
# first DL model
input_layer = Input(shape = sequences_matrix.shape[1])

x1 = layers.Dense(128, activation='relu')(input_layer)
x1 = layers.Dense(64, activation='relu')(x1)

out = layers.Dense(2)(x1)

out = layers.Softmax()(out)

model_dl1 = Model(inputs = input_layer, outputs = out)
model_dl1.compile(optimizer= 'Adam', loss = 'binary_crossentropy', metrics = 'AUC')

model_dl1.fit(sequences_matrix, Y_train_encoded, epochs = 20, validation_split=0.1, callbacks=EarlyStopping(patience=3, monitor='val_loss'))

y_pred_dl1 = model_dl1.predict(test_sequences_matrix).argmax(axis = 1) 

print(classification_report(Y_test.flatten(),y_pred1_rf))



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       948
           1       0.88      0.58      0.70       139

    accuracy                           0.94      1087
   macro avg       0.91      0.79      0.83      1087
weighted avg       0.93      0.94      0.93      1087



How we can see, simple DL model hasn't improve our result, let improve it, by adding recurential layers.

e) rnn networks

In [30]:
# our model definition
def model_rnn(X_train,Y_train, units_rnn, embedd = 64, opt = 'adam', metrics = 'AUC', batch = 64, epochs = 30, verbose = True):
    
    '''
    X_train = training data without labels, in matrix format, prepared earlier as a sequences_matrix,
    Y_train = labels for training data as an array(None, 2),
    units_rnn = list of units per layer, same units per layer are recommended
    embedd = dimension of embedding, default 64
    opt = optimizer as a string,  default = 'adam'
    metric = metric to evaluate model results, default = 'auc'
    batch = batch size, default 64, NOTE than batch size should be divided by 8
    epochs = number of epochs, iteration of model, default = 30
    verbose = whether print model summary or not, default = true (print)
    
    
    max_words = number of words used in model, parameter defined in preprocessing stage,
    max_len = maximal lenght of one record, number of words in one record, parameter defined in preprocessing stage,
    
    '''
    
    # model instance
    model = models.Sequential()
    
    # input layer
    model.add(
        layers.InputLayer(
                    name = 'intro', 
                    input_shape = max_len
        )
    )
    
    # text embedding
    model.add(
        layers.Embedding(
            input_dim = max_words,
            output_dim=embedd
        )
    )
    
    # model with LSTM layers, and dense as an output
    for i, j in enumerate(units_rnn):
        if i == len(units_rnn)-1 and i == 0:
            model.add(
                layers.LSTM(
                    units = j, 
                    name = 'rnn' + str(i)
                )
            )
        
        elif i == len(units_rnn)-1:
             model.add(
                layers.LSTM(
                    units = j, 
                    name = 'rnn' + str(i)
                )
            )
        
        
        else:
            model.add(
                layers.LSTM(
                    units = j, 
                    return_sequences = True,
                    name = 'rnn' + str(i)
                )
            )
            
    model.add(
        layers.Dense(
            units=2,
            name = 'out'
            )
        )  
    
    
    model.add(layers.Softmax())
    
#     model.add(
#         layers.Dense(
#             units=1,
#             activation="sigmoid",
#             name = 'out'
#             )
#         )            
    
    if verbose == True:
        model.summary()
    
    
    # model compile
    model.compile(
            loss = 'binary_crossentropy',
            optimizer = opt,
            metrics = metrics
    )
    
    # model fit
    model.fit(
        x = X_train,
        y = Y_train,
        batch_size = batch,
        epochs = epochs,
        validation_split = 0.2,
        callbacks = [EarlyStopping(
                            monitor ='val_loss',
                            patience = 5,
                            min_delta = 0.01
                            )]
    )
    
    
    return model    
            
    
        

In [703]:
model_rnn1 = model_rnn(sequences_matrix,Y_train_encoded,[64])

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 70, 64)            51200     
_________________________________________________________________
rnn0 (LSTM)                  (None, 64)                33024     
_________________________________________________________________
out (Dense)                  (None, 2)                 130       
_________________________________________________________________
softmax_6 (Softmax)          (None, 2)                 0         
Total params: 84,354
Trainable params: 84,354
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


In [704]:
# model evaluation results
res = model_rnn1.evaluate(test_sequences_matrix,Y_test_encoded, verbose=0)
print('Test set\n  Loss: {:0.2f}\n  AUC: {:0.2f}'.format(res[0],res[1]))

Test set
  Loss: 0.05
  AUC: 1.00


In [705]:
# classification report
y_pred = model_rnn1.predict(test_sequences_matrix).argmax(axis = 1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       948
           1       0.98      0.94      0.96       139

    accuracy                           0.99      1087
   macro avg       0.98      0.97      0.97      1087
weighted avg       0.99      0.99      0.99      1087



How we can see our model, have almost perfect match, it makes only some mistakes, and not spam messeges classify as a spam.
Let's go to optimize recall_score for 'spam' class.

In [715]:
#precision score for class 'spam'
precision_score(Y_test, y_pred,pos_label=1)

0.9774436090225563

In [60]:
parameters = { "units_rnn" : [[64,64],[128],[128,128],[32],[32,32], [64,64,64],[128,128,128],[32,32,32]]}    

results = pd.DataFrame({"units_rnn" : [], "precision" : []})

lenght = len(parameters["units_rnn"])

for i in range(lenght):
    param = parameters.get("units_rnn")[i]
    model = model_rnn(sequences_matrix,Y_train_encoded,param, verbose = False)
    y_pred = model.predict(test_sequences_matrix).argmax(axis = 1)
    print('model' + str(i+1), "\n")
    
    precision = precision_score(Y_test, y_pred,pos_label=1)
    results = results.append({"units_rnn": param,"precision" : precision}, ignore_index = True)
    
print(results)
    







Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
model1 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
model2 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
model3 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
model4 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
model5 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
model6 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


model7 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
model8 

         units_rnn  precision
0         [64, 64]   0.920000
1            [128]   0.958333
2       [128, 128]   0.926667
3             [32]   0.958621
4         [32, 32]   0.938776
5     [64, 64, 64]   0.931507
6  [128, 128, 128]   0.926667
7     [32, 32, 32]   0.912752


In [700]:
# moze jakis hyperopt na kerasie?, albo zwykla petla i daj slowniki w srodku w liscie, aby po nich chodzil jako parametry, moze tak sie da

In [None]:
#poszukaj sposobu na unbalanced in textowych danych, wez wykorzystaj transfer learning i kilka roznych modeli nie tylko DL