In [1]:
# Upgrade pandas to the exact pre-release version 1.0.0rc0 before anything imports it.
# NOTE(review): `%pip install` is the magic that targets the running kernel's environment;
# consider it over `!pip` — confirm it is available on this platform.
!pip install --upgrade pandas==1.0.0rc0

Collecting pandas==1.0.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/73/2f/91d5e0fa358e2568acea1941d09547ce5cff32f46973cefe499261b0c3c9/pandas-1.0.0rc0-cp36-cp36m-manylinux1_x86_64.whl (10.0MB)
[K     |████████████████████████████████| 10.0MB 2.4MB/s 
[31mERROR: tpot 0.11.1 has requirement scikit-learn>=0.22.0, but you'll have scikit-learn 0.21.3 which is incompatible.[0m
[31mERROR: mizani 0.6.0 has requirement matplotlib>=3.1.1, but you'll have matplotlib 3.0.3 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 0.25.3
    Uninstalling pandas-0.25.3:
      Successfully uninstalled pandas-0.25.3
Successfully installed pandas-1.0.0rc0


In [2]:
import operator
import re

import numpy as np
import pandas as pd
# Raise pandas' display limits (rows, columns, line width) used when frames are rendered.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Print the version of pandas that the kernel actually imported.
print('Pandas Version {}'.format(pd.__version__))

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

from nltk.tokenize import word_tokenize

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

# Passed as random_state to train_test_split and StratifiedKFold further down.
# NOTE(review): no NumPy/TensorFlow seeding is visible in this notebook, so weight
# initialisation and dropout are presumably not reproducible run-to-run — confirm.
SEED = 42

Pandas Version 1.0.0rc0


In [3]:
# Read the spam text message CSV, then report its shape and in-memory size in MB.
df = pd.read_csv('../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv')

print(f'Data Set Shape = {df.shape}')
print(f'Data Set Memory Usage = {df.memory_usage().sum() / 1024**2:.2f} MB')

Data Set Shape = (5572, 2)
Data Set Memory Usage = 0.09 MB


## **GloVe Embeddings**

In [4]:
%%time

# Build a {token: vector} lookup from the GloVe text file. Every line is split on
# whitespace: the first field is the token, the remaining fields are its components.
glove_embeddings = {}

# The encoding is given explicitly: without it open() falls back to the platform's
# locale encoding, which can fail or mis-decode non-ASCII tokens on some systems.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no token field; skip it instead of raising IndexError.
            continue
        word = values[0]
        glove_embeddings[word] = np.asarray(values[1:], dtype='float32')

CPU times: user 21.1 s, sys: 530 ms, total: 21.6 s
Wall time: 21.5 s


In [5]:
def build_vocab(df):
    """Count how often each whitespace-separated token occurs in df['Message'].

    Args:
        df: DataFrame with a 'Message' column of strings.

    Returns:
        dict mapping each token to its number of occurrences, keyed in order of
        first appearance.
    """
    counts = {}

    for tokens in df['Message'].apply(lambda s: s.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    return counts


def check_embedding_coverage(df, embeddings):
    """Report how much of the message vocabulary has an embedding vector.

    Prints two ratios: the share of distinct tokens found in `embeddings`
    ("vocab" coverage) and the share of token occurrences found ("text" coverage).

    Args:
        df: DataFrame with a 'Message' column; tokenised via build_vocab().
        embeddings: mapping from token to vector; a missing token must raise
            KeyError on lookup (as a dict does).

    Returns:
        List of (token, count) pairs for the out-of-vocabulary tokens, ordered
        from most to least frequent.
    """
    vocab = build_vocab(df)

    covered = {}
    oov = {}
    n_covered = 0
    n_oov = 0

    for word, count in vocab.items():
        try:
            covered[word] = embeddings[word]
        except KeyError:
            # Only a genuine lookup miss counts as out-of-vocabulary; any other
            # error is a real problem and is allowed to propagate.
            oov[word] = count
            n_oov += count
        else:
            n_covered += count

    # Guard both ratios so an empty frame reports 0% instead of dividing by zero.
    n_tokens = n_covered + n_oov
    vocab_coverage = len(covered) / len(vocab) if vocab else 0.0
    text_coverage = n_covered / n_tokens if n_tokens else 0.0
    print('Embeddings cover {:.2%} of vocab'.format(vocab_coverage))
    print('Embeddings cover {:.2%} of text'.format(text_coverage))

    # Ascending sort followed by a reversal, kept as-is so ties retain their order.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_oov


# Baseline: GloVe coverage of the messages exactly as loaded, before clean() is applied.
oov = check_embedding_coverage(df, glove_embeddings)

Embeddings cover 31.44% of vocab
Embeddings cover 64.08% of text


## **Text Pre-processing**

In [6]:
# Ordered (regex pattern, replacement) pairs. clean() applies them one after another
# with re.sub, so the order below is significant and must be preserved.
_CONTRACTION_RULES = (
    (r"Its", "It is"),
    (r"i'm", "I am"),
    (r"i'll", "I will"),
    (r"That's", "That is"),
    (r"i've", "I have"),
    (r"he's", "he is"),
    (r"won't", "will not"),
    (r"I.ll", "I will"),
    (r"haven't", "have not"),
    (r"there's", "there is"),
    (r"we're", "we are"),
    (r"we'll", "we will"),
    (r"how's", "how is"),
    (r"What's", "What is"),
    (r"what's", "what is"),
    (r"How's", "How is"),
    (r"wasn't", "was not"),
    (r"i.ll", "I will"),
    (r"isn't", "is not"),
    (r"You're", "You are"),
    (r"you've", "you have"),
    (r"You've", "You have"),
    (r"i'd", "I would"),
    (r"We're", "We are"),
    (r"you'd", "you would"),
    (r"Haven't", "Have not"),
    (r"She.s", "She is"),
    (r"did'nt", "did not"),
    (r"Wat's", "What is"),
    (r"she.s", "she is"),
    (r"couldn't", "could not"),
    (r"u're", "you are"),
    (r"Can't", "cannot"),
    (r"did't", "did not"),
    (r"he'll", "he will"),
    (r"We'll", "We will"),
    (r"aren't", "are not"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"Who's", "Who is"),
    (r"don‘t", "do not"),
    (r"let's", "let us"),
    (r"ain't", "am not"),
    (r"where's", "where is"),
    (r"You'll", "You will"),
    (r"I‘m", "I am"),
    (r"dsn't", "does not"),
    (r"THERE'S", "THERE IS"),
    (r"cann't", "cannot"),
    (r"who's", "who is"),
    (r"There's", "There is"),
    (r"There's", "There is"),
    (r"He's", "He is"),
    (r"they're", "they are"),
    (r"U've", "You have"),
    (r"He's", "He is"),
    (r"u'll", "you will"),
    (r"Where's", "Where is"),
    (r"DON\x92T", "DO NOT"),
    (r"hasn't", "has not"),
    (r"i\x92m", "I am"),
    (r"We‘re", "We are"),
    (r"We'd", "We would"),
    (r"It‘s", "It is"),
    (r"THAT\x92S", "THAT IS"),
    (r"They're", "They are"),
    (r"we've", "we have"),
    (r"THAT\x92S", "THAT IS"),
    (r"that‘s", "that is"),
    (r"dat's", "that is"),
    (r"didn‘t", "did not"),
    (r"I\x92m", "I am"),
    (r"doesn\\", "does not"),
    (r"i\x92d", "I would"),
)

# Character entity references.
_ENTITY_RULES = (
    (r"&amp;", "and"),
    (r"&gt;", ">"),
    (r"&lt;", "<"),
)

# Slang, typos and abbreviations.
_SLANG_RULES = (
    (r"MobileUpd8", "Mobile Update"),
    (r"Aight,", "all right,"),
    (r"aight,", "all right,"),
    (r"Max10mins", "maximum 10 minutes"),
    (r"b'day", "birthday"),
    (r"Thanx", "Thanks"),
    (r"un-redeemed", "unredeemed"),
    (r"Bstfrnd", "best friend"),
    (r"Swtheart", "sweetheart"),
    (r"Belovd", "beloved"),
    (r"Lifpartnr", "life partner"),
    (r"Cutefrnd", "cute friend"),
    (r"Jstfrnd", "just friend"),
    (r"Lvblefrnd", "lovable friend"),
    (r"Suite342", "Suite 342"),
    (r"FreeMsg", "Free Message"),
    (r"call2optout", "call to opt out"),
    (r"toClaim", "to claim"),
    (r"girlfrnd", "girlfriend"),
    (r"AfterNoon", "Afternoon"),
    (r"SkillGame", "skill game"),
    (r"ringtoneking", "ringtone king"),
    (r"invnted", "invented"),
    (r"Grahmbell", "Graham Bell"),
    (r"Whenevr", "Whenever"),
    (r"Valid12hrs", "Valid 12 hours"),
    (r"Age16", "Age 16"),
    (r"StarWars3", "Star Wars 3"),
    (r"Suprman", "Superman"),
    (r"Call2OptOut", "call to opt out"),
    (r"iscoming", "is coming"),
    (r"GOODFRIEND", "good friend"),
    (r"age23", "age 23"),
    (r"age16", "age 16"),
    (r"b\x92day", "birthday"),
    (r"Alwys", "always"),
    (r"LookAtMe", "Look at me"),
    (r"EURO2004", "Euro 2004"),
    (r"transfr", "transfer"),
    (r"movietrivia", "movie trivia"),
    (r"FREE2DAY", "free today"),
    (r"2optout", "to opt out"),
    (r"Callertune", "caller tune"),
    (r"callertune", "caller tune"),
    (r"urgnt", "urgent"),
    (r"PICSFREE1", "pictures free 1"),
    (r"SkilGme", "Skill Game"),
    (r"bcums", "becomes"),
    (r"DeliveredTomorrow", "Delivered Tomorrow"),
    (r"2MORO", "Tomorrow"),
    (r"linerental", "line rental"),
    (r"MobilesDirect", "Mobiles Direct"),
    (r"125gift", "125 gift"),
    (r"gr8prizes", "great prizes"),
    (r"msgs", "messages"),
    (r"12hrs", "12 hours"),
    (r"frnd", "friend"),
)

# Translation table that surrounds each listed punctuation character with spaces.
_SPACED_PUNCTUATION = {ord(p): f' {p} ' for p in '#!,£?+&*<>()"@%-=;' + "'"}


def clean(text):
    """Normalise one message: expand rules, space out punctuation, lower-case.

    Steps, in order:
      1. Apply the contraction, character-entity and slang substitutions.
      2. Surround each character of the punctuation set with spaces.
      3. Space out ellipses ('...'; or '..' when the text has no '...').
      4. Space out '.', ':' and '/' unless the text contains all of 'www',
         'http', '...' and '..'.
      5. Lower-case the result.

    Args:
        text: the raw message string.

    Returns:
        The cleaned, lower-cased string.
    """
    for rules in (_CONTRACTION_RULES, _ENTITY_RULES, _SLANG_RULES):
        for pattern, replacement in rules:
            text = re.sub(pattern, replacement, text)

    # Words with punctuations and special characters
    text = text.translate(_SPACED_PUNCTUATION)

    if '...' in text:
        text = text.replace('...', ' ... ')
    else:
        text = text.replace('..', ' ... ')

    # NOTE(review): this guard only skips texts that contain 'www', 'http' and an
    # ellipsis all at once, so for nearly every message the '.' replacement below
    # also splits the ' ... ' inserted above. It looks like an `and` between the
    # "not in" checks may have been intended — confirm before changing, since the
    # reported coverage and model scores were produced with this behaviour.
    has_every_marker = all(marker in text for marker in ('www', 'http', '...', '..'))
    if not has_every_marker:
        for separator in ('.', ':', '/'):
            text = text.replace(separator, f' {separator} ')

    return text.lower()
    
# Clean every message in place, then measure GloVe coverage again on the cleaned text.
df['Message'] = df['Message'].apply(clean)
oov = check_embedding_coverage(df, glove_embeddings)

Embeddings cover 75.19% of vocab
Embeddings cover 96.56% of text


## **Tokenization**

In [7]:
# Length that every id sequence is padded or truncated to by pad_sequences later in this cell.
MAX_LEN = 50
# Keras Tokenizer with default settings; it is fitted on the corpus later in this cell.
tokenizer = Tokenizer()

def create_corpus(df):
    """Tokenise each message with word_tokenize and lower-case every token.

    Args:
        df: DataFrame with a 'Message' column of strings.

    Returns:
        List with one list of lower-cased tokens per message, in row order.
    """
    return [
        [token.lower() for token in word_tokenize(message)]
        for message in df['Message']
    ]

# Tokenise the messages, fit the tokenizer's vocabulary on them, and map tokens to integer ids.
# NOTE(review): the tokenizer is fitted on every row, including rows that later end up in
# the test split — confirm this is acceptable for the evaluation.
corpus = create_corpus(df)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
# Pad and truncate at the end ('post') so that each row has exactly MAX_LEN entries.
sequences2d = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')
# Bare last expression: displays the padded id matrix as the cell output.
sequences2d

array([[  68,  488, 4333, ...,    0,    0,    0],
       [  63,  360,    1, ...,    0,    0,    0],
       [  61,  506,   14, ...,    0,    0,    0],
       ...,
       [8778,    5,   58, ...,    0,    0,    0],
       [   9,  514,  123, ...,    0,    0,    0],
       [2614,    1,   15, ...,    0,    0,    0]], dtype=int32)

## **Train/Test Split**

In [8]:
# Append the padded token ids as extra columns and encode the label as 0 (ham) / 1 (spam).
# Both statements overwrite `df`, so this cell is meant to be run once per kernel session.
df = pd.concat([df, pd.DataFrame(sequences2d)], axis=1)
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Everything from the third column onward is used as the feature matrix; the split is
# stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[2:]],
    df['Category'],
    test_size=0.2,
    stratify=df['Category'],
    random_state=SEED,
)

print(f'X_train Shape = {X_train.shape} (Class ham = {(y_train == 0).sum()} - Class spam = {(y_train == 1).sum()})')
print(f'X_test Shape = {X_test.shape} (Class ham = {(y_test == 0).sum()} - Class spam = {(y_test == 1).sum()})')

X_train Shape = (4457, 50) (Class ham = 3859 - Class spam = 598)
X_test Shape = (1115, 50) (Class ham = 966 - Class spam = 149)


## **Model**

In [9]:
def get_embeddings(embeddings, embedding_dim=100, word_index=None):
    """Build the embedding matrix for a tokenizer vocabulary.

    Row i holds the vector of the word whose index is i. Row 0 and the rows of
    words without a vector stay all-zero.

    Args:
        embeddings: mapping from word to vector; a missing word must raise
            KeyError on lookup (as a dict does).
        embedding_dim: width of each vector. Defaults to 100, the value that
            was previously hard-coded.
        word_index: mapping from word to row index, with indices running from
            1 to len(word_index). Defaults to the module-level
            `tokenizer.word_index`, which was previously the only option.

    Returns:
        Tuple (embedding_matrix, unknown_words): a float array of shape
        (len(word_index) + 1, embedding_dim) and the list of words for which
        neither the word nor its lower-cased form had a vector.
    """
    if word_index is None:
        word_index = tokenizer.word_index

    # +1 because row 0 is left unused by the word indices.
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))
    unknown_words = []

    for word, i in word_index.items():
        # Try the word as-is first, then its lower-cased form.
        for candidate in (word, word.lower()):
            try:
                embedding_matrix[i] = embeddings[candidate]
            except KeyError:
                continue
            break
        else:
            unknown_words.append(word)

    return embedding_matrix, unknown_words
                
# Build the GloVe matrix for the tokenizer vocabulary; the list of unknown words is discarded.
glove_embeddings_matrix, _ = get_embeddings(glove_embeddings)            

In [10]:
def build_model(num_words, embedding_matrix):
    """Create and compile the LSTM binary classifier.

    Architecture: frozen embedding layer initialised from `embedding_matrix`,
    spatial dropout, one 128-unit LSTM, and a single sigmoid output unit.

    Args:
        num_words: number of rows of the embedding layer (vocabulary size).
        embedding_matrix: 2-D array of pretrained vectors, one row per word index.

    Returns:
        A compiled Keras Sequential model (Adam, binary cross-entropy, accuracy).
    """
    # Take the vector width from the matrix itself instead of hard-coding 100, so
    # the layer cannot silently disagree with the embeddings it is initialised from.
    embedding_dim = np.shape(embedding_matrix)[1]

    model = Sequential([
        Embedding(num_words, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_LEN, trainable=False),
        # Original author's question (translated): "Can it drop the same embedding
        # dimension for every word?" Per the Keras documentation, SpatialDropout1D drops
        # entire 1D feature maps rather than individual elements.
        SpatialDropout1D(0.2),
        LSTM(2 ** 7, activation='tanh', recurrent_activation='sigmoid', dropout=0.1, recurrent_dropout=0.1),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

    return model


def train(X, y, num_words, embedding_matrix, X_holdout=None):
    """Train one model per stratified fold; collect OOF and hold-out predictions.

    Args:
        X: DataFrame of training features, indexed positionally via .iloc.
        y: Series of training labels aligned with X.
        num_words: vocabulary size forwarded to build_model().
        embedding_matrix: pretrained embedding matrix forwarded to build_model().
        X_holdout: features to predict with every fold's model. Defaults to the
            module-level `X_test`, which was previously the only option.

    Returns:
        Tuple (oof, y_pred):
          oof    - array of shape (len(X), 1) holding, for each training row, the
                   prediction of the model that did not train on that row.
          y_pred - array of shape (len(X_holdout), 1) holding the mean of the
                   fold models' predictions on the hold-out set.
    """
    N = 5
    if X_holdout is None:
        X_holdout = X_test

    skf = StratifiedKFold(n_splits=N, random_state=SEED, shuffle=True)

    # Size the OOF buffer from the X argument (it previously used the global
    # X_train, which only worked when X happened to be that same frame).
    oof = np.zeros((len(X), 1))
    y_pred = np.zeros((len(X_holdout), 1))

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):

        print('\nFold {}\n'.format(fold))

        # A fresh model per fold so that no weights carry over between folds.
        model = build_model(num_words, embedding_matrix)
        model.fit(X.iloc[trn_idx], y.iloc[trn_idx], batch_size=32, epochs=10, validation_data=(X.iloc[val_idx], y.iloc[val_idx]))

        predictions = model.predict(X.iloc[val_idx])
        oof[val_idx] = predictions

        # Accumulate the average of the N fold models on the hold-out set.
        y_pred += model.predict(X_holdout) / N

    return oof, y_pred
    

In [11]:
# Vocabulary size: one more than the number of entries in the tokenizer's word index.
num_words = len(tokenizer.word_index) + 1
# Run the cross-validated training; returns the out-of-fold and the averaged test predictions.
oof, y_pred = train(X_train, y_train, num_words, glove_embeddings_matrix)


Fold 0

Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold 1

Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold 2

Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold 3

Train on 3566 samples, validate on 891 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold 4

Train on 3567 samples, validate on 890 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## **Evaluation**

In [12]:
# Round the out-of-fold predictions to hard 0/1 labels and score them against the
# training labels with macro-averaged precision, recall and F1.
oof_preds = np.round(oof)

oof_precision = precision_score(y_train, oof_preds, average='macro')
oof_recall = recall_score(y_train, oof_preds, average='macro')
oof_f1 = f1_score(y_train, oof_preds, average='macro')

print(f'OOF Precision: {oof_precision:.6} - OOF Recall: {oof_recall:.6} - OOF F1: {oof_f1:.6}')

OOF Precision: 0.964993 - OOF Recall: 0.937714 - OOF F1: 0.950763


In [13]:
# Round the fold-averaged test predictions to hard 0/1 labels and score them against
# the test labels with macro-averaged precision, recall and F1.
test_preds = np.round(y_pred)

test_precision = precision_score(y_test, test_preds, average='macro')
test_recall = recall_score(y_test, test_preds, average='macro')
test_f1 = f1_score(y_test, test_preds, average='macro')

print(f'Test Precision: {test_precision:.6} - Test Recall: {test_recall:.6} - Test F1: {test_f1:.6}')

Test Precision: 0.959358 - Test Recall: 0.938295 - Test F1: 0.948471
