# Purpose
This dataset is hosted on Kaggle, and some promising results were posted there, but they weren't properly verified. This notebook checks what results the popular [deep learning notebook](https://www.kaggle.com/jacklinggu/keras-mlp-cnn-test-for-text-classification) gets when it is scored with a balanced metric (the Matthews correlation coefficient) on a held-out 70-30 train_test_split.

In [1]:
% matplotlib inline
import pandas as pd
from pathlib import Path
import sys
import seaborn as sns
import re
from pprint import pprint
from itertools import zip_longest
import numpy as np
from functools import partial

# The notebook lives one level below the project root.
project_dir = Path.cwd().parent
# Make the helper modules under src/ importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
src_dir = str(project_dir/'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# These are utilities that I created to reduce notebook clutter
from make_dataframe import make_dataframe, master_data_handler
import utilities as ut
import deep_learning as dl

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Project helpers from src/make_dataframe.py; presumably the first one prepares
# the raw files that the second one reads -- TODO confirm against that module.
master_data_handler()
df = make_dataframe()

# Encode the target as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from keras import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from keras.wrappers.scikit_learn import KerasClassifier

In [5]:
# Padded sequence length and tokenizer vocabulary cap, shared by every model below.
max_len = 100
num_max = 1000

# NOTE(review): the vocabulary is fit on every row, i.e. before the train/test
# split performed in the following cell.
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(df['text'])

# Per-document word-count matrix: the input of the dense (simple) model.
mat_texts = tok.texts_to_matrix(df['text'], mode='count')
print(df['label'].shape, mat_texts.shape)

# Word-index sequences brought to a common length of max_len: the input of the CNN models.
cnn_texts_seq = tok.texts_to_sequences(df['text'])
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq, maxlen=max_len)
print(cnn_texts_mat.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 3. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
(5572,) (5572, 1000)
(5572, 1000)
[49, 471, 842, 755, 658, 64, 8, 88, 123, 351, 148, 67, 58, 144]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  49 471 842 755
 658  64   8  88 123 351 148  67  58 144]
(5572, 100)


In [6]:
# Both feature matrices are split with the same test fraction and seed.
split_opts = dict(test_size=0.3, random_state=0)
# Padded sequences for the CNN models.
X_train, X_test, y_train, y_test = train_test_split(cnn_texts_mat, df['label'], **split_opts)
# Count matrix for the dense model (the `_s` suffix marks the simple model's data).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(mat_texts, df['label'], **split_opts)

In [7]:
# Dense baseline trained on the word-count matrix, wrapped in the scikit-learn API.
#
# The former `validation_split=.2` argument was removed: Keras ignores it whenever
# `validation_data` is supplied, so it never had any effect (the training log
# reports 3900 train / 1672 validation samples, i.e. the explicit split).
# The held-out set is only monitored here; no callback reacts to it.
pipeline_dl_simple = Pipeline([
    ('clf', KerasClassifier(build_fn=partial(dl.get_simple_model, num_max),
                            batch_size=32, epochs=10, verbose=0,
                            validation_data=(X_test_s, y_test_s))),
])
pipeline_dl_simple.fit(X_train_s, y_train_s)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 644,097
Trainable params: 644,097
Non-trainable params: 0
_________________________________________________________________
compile done
Train on 3900 samples, validate on 1672 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/1

Pipeline(memory=None,
     steps=[('clf', <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001F4442FA7B8>)])

In [8]:
# Score the dense baseline on the held-out rows with the Matthews correlation coefficient.
y_true_s = y_test_s
y_pred_s = pipeline_dl_simple.predict(X_test_s)
mcc_simp = matthews_corrcoef(y_true_s, y_pred_s)
mcc_simp



In [9]:
def pipeline_dl(model_in, model_text, X_train, y_train, X_test, y_test, verbose=0,
                batch_size=32, epochs=10):
    """Train one Keras model inside a scikit-learn Pipeline and report its test MCC.

    Parameters
    ----------
    model_in : callable
        Model builder; it is called as ``model_in(num_max, max_len)`` using the
        notebook-level ``num_max`` and ``max_len`` settings.
    model_text : str
        Label used in the printed result line.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Held-out data, passed to Keras as ``validation_data`` and then used to
        compute the reported score.
    verbose : int, default 0
        Verbosity forwarded to the Keras classifier.
    batch_size : int, default 32
        Mini-batch size forwarded to the Keras classifier.
    epochs : int, default 10
        Number of training epochs forwarded to the Keras classifier.

    Returns
    -------
    float
        Matthews correlation coefficient of the predictions on ``X_test``.
    """
    # `validation_split` is intentionally not passed: Keras ignores it whenever
    # `validation_data` is supplied, so the old `validation_split=.2` was dead.
    classifier = KerasClassifier(build_fn=partial(model_in, num_max, max_len),
                                 batch_size=batch_size, epochs=epochs, verbose=verbose,
                                 validation_data=(X_test, y_test))
    pipeline_dl_v1 = Pipeline([('clf', classifier)])
    pipeline_dl_v1.fit(X_train, y_train)

    y_pred = pipeline_dl_v1.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    print('------------------------------------------------------------')
    print(f'{model_text} got an MCC of: {mcc}')
    print('------------------------------------------------------------')
    return mcc

In [10]:
# Train and score the three CNN variants in order, all on the same sequence split.
cnn_variants = [
    (dl.get_cnn_model_v1, 'Model V1'),
    (dl.get_cnn_model_v2, 'Model V2'),
    (dl.get_cnn_model_v3, 'Model V3'),
]
mcc_v1, mcc_v2, mcc_v3 = [
    pipeline_dl(builder, title, X_train, y_train, X_test, y_test)
    for builder, title in cnn_variants
]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 20)           20000     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 20)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 64)            3904      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
__________

# Final Results

In [12]:
# One line per model: the display name followed by its test-set MCC.
final_scores = (
    ('Simple', mcc_simp),
    ('MCC V1', mcc_v1),
    ('MCC V2', mcc_v2),
    ('MCC V3', mcc_v3),
)
for model_name, model_mcc in final_scores:
    print(f'{model_name}: {model_mcc}')

Simple: 0.9417253825867283
MCC V1: 0.8949619135716199
MCC V2: 0.9141356743368744
MCC V3: 0.8815328957915707
