In [5]:
import tensorflow as tf
from tensorflow                              import keras
from tensorflow.keras                        import layers
from tensorflow.keras.preprocessing.text     import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics         import confusion_matrix

import pandas as pd
import numpy  as np

import matplotlib.pyplot as plt

import os
import time

import nltk
from nltk.corpus import stopwords

In [2]:
# https://www.kaggle.com/c/nlp-getting-started: NLP Disaster Tweets dataset
df = pd.read_csv("train.csv")

In [3]:
df.shape

(7613, 5)

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
print((df.target == 1).sum()) # Disaster
print((df.target == 0).sum()) # No Disaster

3271
4342


In [7]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")

for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
t
@bbcmtd Wholesale Markets ablaze 


In [9]:
df["text"] = df.text.map(remove_URL)
df["text"] = df.text.map(remove_punct)

In [10]:
# remove stopwords
nltk.download('stopwords')


# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to /Users/Andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [12]:
df["text"] = df.text.map(remove_stopwords)

In [13]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [14]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(df.text)

In [15]:
len(counter)

17971

In [16]:
counter

diculous': 4,
         'london': 15,
         'cool': 31,
         'skiing': 1,
         'wonderful': 5,
         'day': 109,
         'looooool': 1,
         'wayi': 1,
         'cant': 102,
         'eat': 7,
         'shit': 56,
         'nyc': 12,
         'last': 83,
         'week': 37,
         'girlfriend': 6,
         'cooool': 1,
         'like': 345,
         'pasta': 2,
         'end': 42,
         'bbcmtd': 1,
         'wholesale': 4,
         'markets': 7,
         'ablaze': 28,
         'always': 46,
         'try': 19,
         'bring': 17,
         'metal': 13,
         'rt': 107,
         'africanbaze': 1,
         'newsnigeria': 1,
         'flag': 21,
         'set': 48,
         'aba': 14,
         'crying': 9,
         'plus': 8,
         'side': 23,
         'look': 73,
         'sky': 15,
         'night': 50,
         'phdsquares': 1,
         'mufc': 2,
         'theyve': 5,
         'built': 6,
         'much': 64,
         'hype': 3,
         'around': 39,
 

In [17]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [19]:
num_unique_words = len(counter)
print(num_unique_words)

17971


### Creating X, y data

In [20]:
X = df.text.to_numpy()
y = df.target.to_numpy()

### Splitting the data into Training and testing

In [21]:
# TODO: Create a test train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
print(X_train)

['photo postapocalypticflimflam prodding around rubble' 'end'
 'man crush everyday cristianinspire' ...
 'omron hem712c automatic blood pressure monitor standard large bp cuffs'
 'officials say quarantine place alabama home possible ebola case developing symptoms'
 'moved england five years ago today whirlwind time']


In [23]:
# Split dataset into training and validation set
train_size = int(X_train.shape[0] * 0.8)

train_df = X_train[:train_size]
val_df   = X_train[train_size:]

print(train_df)

['photo postapocalypticflimflam prodding around rubble' 'end'
 'man crush everyday cristianinspire' ...
 'uabstephenlong courtlizcamp total tweet fail beautiful inside blaze'
 'road closures remain effect due hazard trees falling tree torching uphill runs fire forest service road 1 remains close'
 'socialmedia news new facebook page features seek help personalize customer experience']


In [25]:
# split text and labels
train_sentences = train_df
train_labels    = y_train[:train_size]
val_sentences   = val_df
val_labels      = y_train[train_size:]

In [26]:
train_sentences.shape, val_sentences.shape

((4080,), (1020,))

In [27]:
test_sentences = X_test
test_labels    = y_test

In [28]:
test_sentences.shape

(2513,)

In [29]:
# Tokenize
# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [30]:
# each word has unique index
word_index = tokenizer.word_index

In [31]:
word_index

{'like': 1,
 'im': 2,
 'amp': 3,
 'fire': 4,
 'new': 5,
 'via': 6,
 'get': 7,
 'people': 8,
 'news': 9,
 'one': 10,
 'dont': 11,
 'us': 12,
 'video': 13,
 '2': 14,
 'emergency': 15,
 'disaster': 16,
 'still': 17,
 'police': 18,
 'would': 19,
 'california': 20,
 'crash': 21,
 'world': 22,
 'man': 23,
 'suicide': 24,
 'know': 25,
 'train': 26,
 'time': 27,
 '3': 28,
 'day': 29,
 'body': 30,
 'go': 31,
 'burning': 32,
 'see': 33,
 'got': 34,
 'buildings': 35,
 'first': 36,
 'nuclear': 37,
 'attack': 38,
 'car': 39,
 'youtube': 40,
 'fires': 41,
 'rt': 42,
 'back': 43,
 'cant': 44,
 'families': 45,
 'storm': 46,
 'today': 47,
 'war': 48,
 'watch': 49,
 'life': 50,
 'hiroshima': 51,
 'good': 52,
 'bomb': 53,
 'going': 54,
 'look': 55,
 'two': 56,
 'u': 57,
 'may': 58,
 'full': 59,
 'love': 60,
 'dead': 61,
 'accident': 62,
 'think': 63,
 'way': 64,
 'say': 65,
 'mass': 66,
 'army': 67,
 'wildfire': 68,
 'many': 69,
 'years': 70,
 'black': 71,
 'last': 72,
 'best': 73,
 'death': 74,
 'want':

In [32]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences  = tokenizer.texts_to_sequences(test_sentences)
val_sequences   = tokenizer.texts_to_sequences(val_sentences)

In [33]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['onufrance 7475 bioterrorism rockefellerchirockefellerunivheiress 2 evade lgl efforts 2 prosecute blks 4 harvardu kidnapgafp'
 'noahanyname utopian impulse inevitably ends gulags mass murder'
 'dreaming peacefully loud ass thunder wanted scare'
 'swellyjetevo disneyland tacos bomb'
 'blood pressure roof dont need extra shit']
[[4325, 4326, 493, 4327, 14, 4328, 4329, 2085, 14, 4330, 4331, 107, 4332, 4333], [2782, 4334, 4335, 4336, 1399, 4337, 66, 153], [4338, 4339, 218, 167, 154, 833, 2783], [4340, 4341, 4342, 53], [265, 1207, 1665, 11, 91, 2784, 132]]


In [34]:
# Pad the sequences to have the same length
# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded   = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")

In [35]:
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [36]:
train_padded.shape, test_padded.shape, val_padded.shape

((4080, 20), (2513, 20), (1020, 20))

In [37]:
train_padded[10]

array([4325, 4326,  493, 4327,   14, 4328, 4329, 2085,   14, 4330, 4331,
        107, 4332, 4333,    0,    0,    0,    0,    0,    0], dtype=int32)

In [38]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

onufrance 7475 bioterrorism rockefellerchirockefellerunivheiress 2 evade lgl efforts 2 prosecute blks 4 harvardu kidnapgafp
[4325, 4326, 493, 4327, 14, 4328, 4329, 2085, 14, 4330, 4331, 107, 4332, 4333]
[4325 4326  493 4327   14 4328 4329 2085   14 4330 4331  107 4332 4333
    0    0    0    0    0    0]


In [39]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [40]:
reverse_word_index

{1: 'like',
 2: 'im',
 3: 'amp',
 4: 'fire',
 5: 'new',
 6: 'via',
 7: 'get',
 8: 'people',
 9: 'news',
 10: 'one',
 11: 'dont',
 12: 'us',
 13: 'video',
 14: '2',
 15: 'emergency',
 16: 'disaster',
 17: 'still',
 18: 'police',
 19: 'would',
 20: 'california',
 21: 'crash',
 22: 'world',
 23: 'man',
 24: 'suicide',
 25: 'know',
 26: 'train',
 27: 'time',
 28: '3',
 29: 'day',
 30: 'body',
 31: 'go',
 32: 'burning',
 33: 'see',
 34: 'got',
 35: 'buildings',
 36: 'first',
 37: 'nuclear',
 38: 'attack',
 39: 'car',
 40: 'youtube',
 41: 'fires',
 42: 'rt',
 43: 'back',
 44: 'cant',
 45: 'families',
 46: 'storm',
 47: 'today',
 48: 'war',
 49: 'watch',
 50: 'life',
 51: 'hiroshima',
 52: 'good',
 53: 'bomb',
 54: 'going',
 55: 'look',
 56: 'two',
 57: 'u',
 58: 'may',
 59: 'full',
 60: 'love',
 61: 'dead',
 62: 'accident',
 63: 'think',
 64: 'way',
 65: 'say',
 66: 'mass',
 67: 'army',
 68: 'wildfire',
 69: 'many',
 70: 'years',
 71: 'black',
 72: 'last',
 73: 'best',
 74: 'death',
 75: 'wa

In [41]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [42]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[4325, 4326, 493, 4327, 14, 4328, 4329, 2085, 14, 4330, 4331, 107, 4332, 4333]
onufrance 7475 bioterrorism rockefellerchirockefellerunivheiress 2 evade lgl efforts 2 prosecute blks 4 harvardu kidnapgafp


## Shared Metrics

In [43]:
loss    = keras.losses.BinaryCrossentropy(from_logits=False)
optim   = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

## Creating the LSTM

In [76]:
LSTM_model = keras.models.Sequential()
LSTM_model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.

LSTM_model.add(layers.LSTM(32, dropout=0.1))
LSTM_model.add(layers.Dense(1, activation="sigmoid"))

LSTM_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 32)            575072    
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 583,425
Trainable params: 583,425
Non-trainable params: 0
_________________________________________________________________


In [77]:
LSTM_model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [78]:
start_time = time.time()

LSTM_model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

running_time = time.time() - start_time

Epoch 1/20
128/128 - 2s - loss: 0.5361 - accuracy: 0.7343 - val_loss: 0.4722 - val_accuracy: 0.7873
Epoch 2/20
128/128 - 1s - loss: 0.2406 - accuracy: 0.9174 - val_loss: 0.5694 - val_accuracy: 0.7873
Epoch 3/20
128/128 - 2s - loss: 0.1300 - accuracy: 0.9605 - val_loss: 0.6398 - val_accuracy: 0.7824
Epoch 4/20
128/128 - 1s - loss: 0.0804 - accuracy: 0.9789 - val_loss: 0.6832 - val_accuracy: 0.7647
Epoch 5/20
128/128 - 1s - loss: 0.0615 - accuracy: 0.9794 - val_loss: 0.8112 - val_accuracy: 0.7490
Epoch 6/20
128/128 - 2s - loss: 0.0472 - accuracy: 0.9806 - val_loss: 1.0172 - val_accuracy: 0.7686
Epoch 7/20
128/128 - 1s - loss: 0.0384 - accuracy: 0.9831 - val_loss: 1.1954 - val_accuracy: 0.7657
Epoch 8/20
128/128 - 1s - loss: 0.0325 - accuracy: 0.9858 - val_loss: 1.2093 - val_accuracy: 0.7637
Epoch 9/20
128/128 - 1s - loss: 0.0269 - accuracy: 0.9850 - val_loss: 1.3222 - val_accuracy: 0.7716
Epoch 10/20
128/128 - 1s - loss: 0.0239 - accuracy: 0.9870 - val_loss: 1.3206 - val_accuracy: 0.7637

In [79]:
print("The training took: {:.2f} seconds.".format(running_time))

The training took: 29.08 seconds.


# Testing LSTM

In [80]:
# Predictting with the training data
LSTM_predictions = LSTM_model.predict(test_padded)
LSTM_predictions = [1 if p > 0.5 else 0 for p in LSTM_predictions]

In [49]:
print(test_sentences[10:20])

print(test_labels[10:20])
print(LSTM_predictions[10:20])

['akilah world news cop pulls man car avoid' 'walk plank sinking ship'
 'zakbagans pets r like part family love animals last 2 pets rescued breaks heart animals mistreated'
 'use perforated metal shear panel sfor seismicresistant applications'
 'answer friend yelling windmy latest article read share thanks'
 'woman electrocuted red redblood videoclip'
 'bringing tornadoes floods bringing climate change god america plaguing farrakhan quote'
 'savages leaked thomas brady gangstermail account wonder quick fatality samsung mobile b real son'
 'malaysia airlines flight 370 disappeared 17months ago debris found south indian ocean'
 'people finally panicking cable tv']
[1 0 0 0 0 0 0 0 1 0]
[0, 0, 0, 0, 0, 0, 1, 0, 1, 0]


In [81]:
confusion_matrix(LSTM_predictions, test_labels)

array([[1223,  334],
       [ 223,  733]])

## Creating the GRU

In [70]:
GRU_model = keras.models.Sequential()
GRU_model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

GRU_model.add(layers.GRU(32, dropout=0.1))
GRU_model.add(layers.Dense(1, activation="sigmoid"))

GRU_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 32)            575072    
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                6336      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 581,441
Trainable params: 581,441
Non-trainable params: 0
_________________________________________________________________


In [71]:
GRU_model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [72]:
start_time = time.time()
GRU_model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=1)
running_time = time.time() - start_time

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [73]:
print("The training took: {:.2f} seconds.".format(running_time))

The training took: 36.75 seconds.


In [74]:
GRU_predictions = GRU_model.predict(test_padded)
GRU_predictions = [1 if p > 0.5 else 0 for p in GRU_predictions]

In [56]:
print(train_sentences[10:20])

print(test_labels[10:20])
print(GRU_predictions[10:20])

['onufrance 7475 bioterrorism rockefellerchirockefellerunivheiress 2 evade lgl efforts 2 prosecute blks 4 harvardu kidnapgafp'
 'noahanyname utopian impulse inevitably ends gulags mass murder'
 'dreaming peacefully loud ass thunder wanted scare'
 'swellyjetevo disneyland tacos bomb'
 'blood pressure roof dont need extra shit'
 'reddit quarantine offensive content reddit cofounder ceo steve huffman unveiled specif'
 'ronda rousey would close making floyd mayweathers money 50 fights bloody elbow boxing'
 'camilla33 craykain hate shatter delusions hatchet deadly weapon justifying lethal force gunsense'
 'harder conflict glorious triumph thomas paine'
 'wbcshirl2 yes god doessnt change says rejoice fall people calamities like wild fires ect wanna punished']
[1 0 0 0 0 0 0 0 1 0]
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]


In [75]:
confusion_matrix(GRU_predictions, test_labels)

array([[1204,  322],
       [ 242,  745]])