In [64]:

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

import Stemmer
stemmer = Stemmer.Stemmer('english')

In [65]:
import ult
import vec_model_helper as helper
import re

In [66]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM

In [67]:
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import keras.backend as K

In [None]:
from keras.utils.np_utils import to_categorical
labels = to_categorical(np.asarray(labels))

In [68]:
def stem(s):
    ws = s.split(' ')
    ws = stemmer.stemWords(ws)
    return " ".join(ws)

In [69]:
#Cleaning 
contractions = ult.get_contractions()
#print(contractions)
def expandShort(sent):
    for word in sent.split():
        if word.lower() in contractions:
            sent = sent.replace(word, contractions[word.lower()])
    return sent

def cleanText(sent):
    sent = sent.replace("\\n","")            
    sent = sent.replace("\\xa0","") #magic space lol
    sent = sent.replace("\\xc2","") #space
    sent = re.sub(r"(@[A-Za-z]+)|([\t])", "",sent)
    sent = expandShort(sent.strip().lower())
    sent = re.sub(r'[^\w]', ' ', sent)
    sent = re.sub(r"(@[A-Za-z]+)|([^A-Za-z \t])", " ", sent)
    ws = [w for w in sent.strip().split(' ') if w is not ''] # remove double space
    return " ".join(ws)

In [70]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 1024
DIM = 100

In [71]:
df = pd.read_csv("./data/TRAIN_SMS.csv",encoding='iso-8859-1')
df.head()

Unnamed: 0,Label,Message
0,ham,oh how abt 2 days before Christmas
1,info,"Welcome to OVATION HOLD R.No. 184, 114, 395, 3..."
2,info,Thank you for using your ICICI bank CREDITcard...
3,ham,schedule a meeting with the entire team in the...
4,ham,Tommy is my brother


In [72]:
df.Message.apply(lambda s: cleanText(s))
stemmed_df = df.Message.apply(lambda s: stem(s))

In [73]:
texts = stemmed_df.tolist()
lables = df.Label
lables = helper.encode_labels(lables)

In [74]:
len(texts)

30000

In [75]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)

In [76]:
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

In [77]:
import os
def build_embedding_index():
    embeddings_index = {}
    f = open(os.path.join('./data/glove.6B/', 'glove.6B.100d.txt'), encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

In [78]:
def f1_score(y_true, y_pred):
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    if c3 == 0:
        return 0
    # How many selected items are relevant?
    precision = c1 / c2
    # How many relevant items are selected?
    recall = c1 / c3
    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score 

In [79]:
word_vectors_index = build_embedding_index()

Found 400000 word vectors.


In [80]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [81]:
vocab_size_num = min(len(word_index), MAX_NB_WORDS)
vocab_size_num

38708

In [82]:
embeddimg_matrix = np.zeros((vocab_size_num+1, DIM))
embeddimg_matrix.shape
for word, i in word_index.items():
    word_vec = word_vectors_index.get(word) 
    if word_vec is not None:
        embeddimg_matrix[i-1] = word_vec

In [83]:
embeddimg_matrix.shape

(38709, 100)

In [97]:
model = Sequential()

model.add(
Embedding(vocab_size_num+1,
                    DIM,
                    weights=[embeddimg_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False)
)
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1024, 100)         3870900   
_________________________________________________________________
dropout_13 (Dropout)         (None, 1024, 100)         0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_14 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 153       
Total params: 3,901,253
Trainable params: 30,353
Non-trainable params: 3,870,900
_________________________________________________________________
None


In [98]:
tbCallBack = TensorBoard(log_dir='board', histogram_freq=2, write_graph=True, write_images=True)

In [99]:
x_train, x_val, y_train, y_val = train_test_split(data,lables, shuffle=True, test_size=0.20)

In [100]:
x_train.shape

(24000, 1024)

In [None]:
model.fit(x_train, y_train, epochs=6, batch_size=256, validation_data=(x_val,y_val), callbacks=[tbCallBack])

Train on 24000 samples, validate on 6000 samples
Epoch 1/6

In [55]:

model.save_weights("./model/ecpoc20_.h5")
model.save("./model/ecpoc20_.h5")

In [56]:
print("Score",model.evaluate(x_val, y_val))

preds = model.predict(x_val)

print("F1 on model", f1_score(y_val, preds))

Score [0.14276138186454773, 0.94899999999999995, 0.94892242050170894]


AttributeError: 'numpy.dtype' object has no attribute 'base_dtype'

In [61]:
preds[0]

array([ 0.9831751 ,  0.00373961,  0.01308537], dtype=float32)