In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('nlp_data/train.csv')
test_df = pd.read_csv('nlp_data/test.csv')
train_df.shape, test_df.shape

((7613, 5), (3263, 4))

In [9]:
train_df = train_df.sample(frac=1, random_state=2021)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
185,264,ambulance,,@fouseyTUBE you ok? Need a ambulance. Hahahah ...,0
4616,6561,injury,"ÌÏT: 35.223347,-80.827834",#PFT Barkevious Mingo missed Browns practice w...,0
7093,10159,violent%20storm,UK,Terrifying POV footage captures violent landin...,1
5726,8171,rescuers,,http://t.co/XlFi7ovhFJ VIDEO: 'We're picking u...,1
7020,10061,typhoon,Whole World,Global precipitation measurement satellite cap...,1


In [11]:
train_data, val_data, train_labels, val_labels = train_test_split(train_df['text'], train_df['target'], test_size=0.1, random_state=2021)
train_data.shape, val_data.shape, train_labels.shape, val_labels.shape

((6851,), (762,), (6851,), (762,))

## Text to embeddings

In [31]:
from tensorflow.keras.layers import TextVectorization, Embedding

In [32]:
max_vocab_size = 100000
max_seq_length = 15
text_vectorization = TextVectorization(max_tokens=max_vocab_size, output_sequence_length=max_seq_length)

In [33]:
text_vectorization.adapt(train_data)

In [None]:
vocab = text_vectorization.get_vocabulary()
print('Vocab length: {}, top 5: {}, tail 5: {}'.format(len(vocab), vocab[:5], vocab[-5:]))

In [34]:
embedding = Embedding(input_dim=len(vocab), output_dim=128, input_length=max_seq_length, name="embedding_1")

In [35]:
sample = train_data[0]
sample, text_vectorization(sample), embedding(text_vectorization(sample))

('Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 <tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([ 104, 5819,   22,    2,  834,    6,   19,  262,  142, 1720, 5592,
          72,   41,    0,    0])>,
 <tf.Tensor: shape=(15, 128), dtype=float32, numpy=
 array([[-0.0012822 , -0.010647  ,  0.02720002, ...,  0.02467877,
         -0.0018568 , -0.00418786],
        [-0.00366858, -0.03127545, -0.0245946 , ..., -0.01952111,
         -0.0229091 , -0.00373935],
        [-0.02928547, -0.04111477,  0.00608927, ...,  0.00919198,
         -0.03175006,  0.02720414],
        ...,
        [ 0.02962365, -0.04755999, -0.00920203, ...,  0.00208379,
         -0.03838278,  0.03805104],
        [-0.00832572,  0.02414176, -0.04526794, ...,  0.03990587,
         -0.02619268,  0.03846988],
        [-0.00832572,  0.02414176, -0.04526794, ...,  0.03990587,
         -0.02619268,  0.03846988]], dtype=float32)>)

## Naive Bayes + Tf-idf

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from helper_functions import calculate_results

#### Normal

In [50]:
tf_vectorizer = TfidfVectorizer()
train_tf = tf_vectorizer.fit_transform(train_data)

print("n_samples: %d, n_features: %d" % train_tf.shape)

n_samples: 6851, n_features: 20137


In [51]:
val_tf = tf_vectorizer.transform(val_data)
print("n_samples: %d, n_features: %d" % val_tf.shape)

n_samples: 762, n_features: 20137


In [52]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(train_tf, train_labels)

MultinomialNB()

In [64]:
y_pred = naive_bayes_classifier.predict(val_tf)
calculate_results(val_labels, y_pred)

{'accuracy': 78.87139107611549,
 'precision': 0.8053461730226711,
 'recall': 0.7887139107611548,
 'f1': 0.7808015277723382}

#### Use sklearn Pipeline

In [59]:
model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()), ('cls', MultinomialNB())
])
model_0.fit(train_data, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('cls', MultinomialNB())])

In [65]:
y_pred = model_0.predict(val_data)
calculate_results(val_labels, y_pred)

{'accuracy': 78.87139107611549,
 'precision': 0.8053461730226711,
 'recall': 0.7887139107611548,
 'f1': 0.7808015277723382}

## FCN

In [68]:
from tensorflow.keras import layers
from helper_functions import create_tensorboard_callback

In [74]:
tb_callback1 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model1_dense')
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_1 = tf.keras.Model(inputs, outputs, name='model_1')

Saving TensorBoard log files to: model_logs/model1_dense/20220627-142709


In [75]:
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_1_history = model_1.fit(train_data, train_labels, epochs=5,
                              validation_data=(val_data, val_labels), callbacks=[tb_callback1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [76]:
model1_pred_probs = model_1.predict(val_data)
model1_pred = tf.squeeze(tf.round(model1_pred_probs))
model1_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [77]:
calculate_results(val_labels, model1_pred)

{'accuracy': 79.52755905511812,
 'precision': 0.7961262183381259,
 'recall': 0.7952755905511811,
 'f1': 0.7931953806655787}

## LSTM

In [79]:
tb_callback2 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model2_lstm')
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs, outputs, name='model_2')

Saving TensorBoard log files to: nlp_model_logs/model2_lstm/20220627-150608


In [80]:
model_2.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_2_history = model_2.fit(train_data, train_labels, epochs=5, validation_data=(val_data, val_labels), callbacks=[tb_callback2])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [81]:
model2_pred_probs = model_2.predict(val_data)
model2_pred = tf.squeeze(tf.round(model2_pred_probs))
model2_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [82]:
calculate_results(val_labels, model2_pred)

{'accuracy': 75.7217847769029,
 'precision': 0.7580773946289202,
 'recall': 0.7572178477690289,
 'f1': 0.7575417441632131}

## GRU

In [83]:
tb_callback3 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model3_gru')
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs, name='model_3')

Saving TensorBoard log files to: nlp_model_logs/model3_gru/20220627-151113


In [84]:
model_3.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_3_history = model_3.fit(train_data, train_labels, epochs=5, validation_data=(val_data, val_labels), callbacks=[tb_callback3])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [85]:
model3_pred_probs = model_3.predict(val_data)
model3_pred = tf.squeeze(tf.round(model3_pred_probs))
model3_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [86]:
calculate_results(val_labels, model3_pred)

{'accuracy': 74.1469816272966,
 'precision': 0.7405001863867785,
 'recall': 0.7414698162729659,
 'f1': 0.7406114198035412}

## Bidirectional

In [87]:
tb_callback4 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model4_biRNN')
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs, name='model_4')

Saving TensorBoard log files to: nlp_model_logs/model4_biRNN/20220627-162619


In [88]:
model_4.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_4_history = model_4.fit(train_data, train_labels, epochs=5, validation_data=(val_data, val_labels), callbacks=[tb_callback4])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
model4_pred_probs = model_4.predict(val_data)
model4_pred = tf.squeeze(tf.round(model4_pred_probs))
model4_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [90]:
calculate_results(val_labels, model4_pred)

{'accuracy': 74.80314960629921,
 'precision': 0.7480314960629921,
 'recall': 0.7480314960629921,
 'f1': 0.7480314960629921}

## Conv1D

In [99]:
tb_callback5 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model5_conv1D')
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = layers.Conv1D(64, 5, activation='tanh')(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_5 = tf.keras.Model(inputs, outputs, name='model_5')

Saving TensorBoard log files to: nlp_model_logs/model5_conv1D/20220627-164518


In [100]:
model_5.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_5_history = model_5.fit(train_data, train_labels, epochs=5, validation_data=(val_data, val_labels), callbacks=[tb_callback5])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [101]:
model5_pred_probs = model_5.predict(val_data)
model5_pred = tf.squeeze(tf.round(model5_pred_probs))
model5_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [102]:
calculate_results(val_labels, model5_pred)

{'accuracy': 74.2782152230971,
 'precision': 0.7417080098668083,
 'recall': 0.7427821522309711,
 'f1': 0.7413648839066457}

## Pretrained USE model

In [103]:
import tensorflow_hub as tfhub

In [104]:
embed = tfhub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
embed_sample = embed([sample, 'This is another sample that not come from dataset'])

tensorflow.python.framework.ops.EagerTensor

In [105]:
embed_sample.shape

TensorShape([2, 512])

In [109]:
tb_callback6 = create_tensorboard_callback(dir_name='nlp_model_logs', experiment_name='model6_USE_pretrained')
sentence_encoder_layer = tfhub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4', input_shape=[], dtype=tf.string, trainable=False, name='USE')
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
], name='model_6')
model_6.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model_6.fit(train_data, train_labels, epochs=5, validation_data=(val_data, val_labels), callbacks=[tb_callback6])

Saving TensorBoard log files to: nlp_model_logs/model6_USE_pretrained/20220627-173149
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff4b529ccd0>

In [110]:
model6_pred_probs = model_6.predict(val_data)
model6_pred = tf.squeeze(tf.round(model6_pred_probs))
model6_pred.shape, val_labels.shape



(TensorShape([762]), (762,))

In [111]:
calculate_results(val_labels, model6_pred)

{'accuracy': 80.18372703412074,
 'precision': 0.8041247411259352,
 'recall': 0.8018372703412073,
 'f1': 0.799218679146436}

## Tensorboard.dev

In [None]:
!tensorboard dev upload --logdir ./nlp_model_logs/ --name "NLP model experiments" --description "NLP model for disaster classification tweet" --one_shot


***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

./nlp_model_logs/

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) 

In [None]:
!tensorboard dev list

In [None]:
!tensorboard dev delete --experiment_id <exp_id>