In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys
from tensorflow.keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### 1. Data preparation

In [2]:
### Load the training and test splits from CSV files
### (the paths are relative, so they resolve against the current working directory)

train_df = pd.read_csv("data/nlp/train.csv")
test_df = pd.read_csv("data/nlp/test.csv")

In [3]:
### Shuffle the training dataframe
### sample(frac = 1) returns every row in random order; random_state = 42 fixes that order

train_df_shuffled = train_df.sample(frac = 1, random_state = 42)

### Preview the first rows of the shuffled frame
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
### How many samples are there in each split, and in total?

print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [5]:
### Visualize 5 consecutive rows of the shuffled training data, starting at a random position

### randint is inclusive on both ends, so the highest start still leaves 5 rows to show
random_index = random.randint(0, len(train_df) - 5)

sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]

for row in sample_rows.itertuples():
    ### The first field of each tuple is the dataframe index, which is not printed
    _, text, target = row
    verdict = "(real disaster)" if target > 0 else "(not real disaster)"
    print(f"Target: {target}", verdict)
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 (real disaster)
Text:
Still can't get over the thunderstorm/tornado we were woken up to yesterday. Half the street is still in the dark! http://t.co/Y8h5v1j2y7

---

Target: 1 (real disaster)
Text:
Wreck with road blockage Woodward Avenue Northbound at Davison in M.S. #shoalstraffic

---

Target: 0 (not real disaster)
Text:
Like it affects every level of life you're expecting me to buy everything and still survive with my limited pocket money

---

Target: 1 (real disaster)
Text:
#WorldNews Fallen powerlines on G:link tram: UPDATE: FIRE crews have evacuated up to 30 passengers who were tr... http://t.co/EYSVvzA7Qm

---

Target: 1 (real disaster)
Text:
Has gun law ever dissuaded a potential mass murderer?

---



In [6]:
### Split the shuffled training data into training and validation sets
### test_size = 0.1 holds out 10% of the rows for validation; random_state = 42 fixes the split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size = 0.1,
                                                                            random_state = 42)

In [7]:
### Check the sizes of the training and validation sets
### (sentences and labels of the same split must have equal lengths)

len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [8]:
### View the first 10 training sentences and their labels

train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### 2. Converting text into numbers
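Tokenization - maps each token to an integer; it can work at word level, character level or sub-word level <br>
Embeddings - map each token to a dense vector; they can be learned from scratch (custom embedding) or loaded pre-trained (pre-learned embedding)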

In [9]:
### Example of text vectorization with every argument written out explicitly
### NOTE: this instance is only illustrative; text_vectorizer is reassigned in a later cell

text_vectorizer = TextVectorization(max_tokens = None,  # None: no cap requested on vocabulary size
                                    standardize = "lower_and_strip_punctuation",
                                    split = "whitespace",
                                    ngrams = None,
                                    output_mode = "int",  # emit integer token ids
                                    output_sequence_length = None)  # None: no fixed output length requested

In [10]:
### What is the average number of tokens (whitespace-separated words) per training sentence?

token_counts = [len(sentence.split()) for sentence in train_sentences]
round(sum(token_counts) / len(token_counts))

15

In [11]:
### Configure the text vectorizer that the models below use
### max_vocab_length: maximum number of words to keep in the vocabulary
### max_length: fixed length of every output sequence
###             (15 matches the rounded average token count computed in the previous cell)

max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = "int",
                                    output_sequence_length = max_length)

In [12]:
### Fit the text vectorizer on the training sentences only
### (validation sentences are not passed to adapt here)

text_vectorizer.adapt(train_sentences)

In [13]:
### Create a sample sentence and vectorize it
### The saved output has shape (1, 15): six token ids followed by zeros up to max_length

sample_sentence = "There's a flood in my street!"

text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [14]:
### Choose a random sentence from the training set and vectorize it
### (random is not seeded in this notebook, so the chosen sentence differs between runs)

random_sentence = random.choice(train_sentences)

print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")

text_vectorizer([random_sentence])

Original text:
Love waking up to my dad screaming at me ??????      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 110, 4291,   27,    5,   13, 1419,  311,   17,   31,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [15]:
### Get the vocabulary held by the vectorizer and look at both ends of the list
### In the saved output the list starts with '' and '[UNK]' ahead of the real words;
### presumably these are the padding and out-of-vocabulary entries - confirm against the Keras docs

words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### 3. Creating embedding layer

In [16]:
### Create the embedding layer used by the simple dense model
### input_dim matches the vectorizer's vocabulary cap; output_dim = 128 values per token
### NOTE(review): input_length is reported as deprecated by newer Keras releases - confirm before upgrading

embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 128,
                             embeddings_initializer = "uniform",
                             input_length = max_length,
                             name = "embedding_layer") 

In [17]:
### Get a random sentence from the training set, vectorize it, then embed it
### The saved output has shape (1, 15, 128): 1 sentence, 15 token positions, 128 values each

random_sentence = random.choice(train_sentences)

print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
I just wanted to watch Paper Towns but the buildings on fire ?????      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04301525, -0.04443644, -0.01748439, ...,  0.04449774,
         -0.00153724, -0.04439235],
        [ 0.04555226,  0.02510707, -0.0017382 , ...,  0.04944488,
          0.02425803,  0.00947589],
        [-0.03437819,  0.04130817, -0.02357488, ...,  0.03838101,
          0.04014463,  0.00493345],
        ...,
        [ 0.03371259, -0.00171655,  0.02867376, ..., -0.04695725,
         -0.03288045, -0.03945033],
        [ 0.03371259, -0.00171655,  0.02867376, ..., -0.04695725,
         -0.03288045, -0.03945033],
        [ 0.03371259, -0.00171655,  0.02867376, ..., -0.04695725,
         -0.03288045, -0.03945033]]], dtype=float32)>

In [18]:
### Embedding of a single token: first (and only) sentence, first token position

sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.04301525, -0.04443644, -0.01748439, -0.03508688,  0.00029314,
        0.03428726, -0.04057056, -0.04228323,  0.0082969 , -0.01319723,
       -0.04065651,  0.01285983,  0.01793614,  0.01988634, -0.00381547,
       -0.00659677, -0.04319429,  0.0002957 ,  0.01494164,  0.01893128,
       -0.02990079, -0.02787136,  0.02569074,  0.04917817,  0.00226707,
        0.00238109, -0.00410699, -0.02816108, -0.04933527,  0.01428999,
       -0.00102276, -0.04656769,  0.01690919,  0.0202193 , -0.00054177,
        0.00970261, -0.0277022 , -0.04200481,  0.03645274,  0.0424467 ,
        0.00378089, -0.02008352, -0.03885863,  0.03527704,  0.02983529,
        0.04967414,  0.01167498,  0.01129816, -0.03097321,  0.02864044,
        0.03663311, -0.00881679, -0.02291065,  0.01550363, -0.03079291,
       -0.04998752,  0.00915421,  0.03727027,  0.04121624, -0.03007622,
       -0.03852018, -0.0250762 ,  0.02410323, -0.00825997,  0.04990783,
       -0.014980

### 4. Model building

Model 1 - Naive Bayes (baseline)

In [45]:
### Directory name passed to create_tensorboard_callback by the model-fitting cells below
SAVE_DIR = "model_logs"

### Function for performance metrics

def performance_metrics(y_true, y_pred):
    """
    Summarise classification quality as a dictionary of four scores.

    Parameters:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned element-by-element with y_true.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Accuracy is multiplied by 100, while the other three values are passed
        through unchanged from precision_recall_fscore_support called with
        average = "weighted" - so the scales differ between the entries.
    """
    accuracy_percent = accuracy_score(y_true, y_pred) * 100

    ### The call returns four values; the last one is not needed here
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average = "weighted")

    return {
        "accuracy": accuracy_percent,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

### Function for comparing new and old result

def compare_baseline_with_new_result(baseline_result, new_result):
    """
    Print one line per metric with the baseline score, the new score and their difference.

    Parameters:
        baseline_result: dict mapping metric name to the baseline score.
        new_result: dict mapping metric name to the new score; it must contain
            every key of baseline_result (a missing key raises KeyError).

    Returns:
        None. The difference is computed as new minus baseline, so a negative
        number means the new model scored lower. Scores are shown with two
        decimals; the difference is printed unrounded.
    """
    for key in baseline_result:
        baseline_value = baseline_result[key]
        new_value = new_result[key]
        difference = new_value - baseline_value
        print(f"Baseline {key}: {baseline_value:.2f}, New {key}: {new_value:.2f}, Difference: {difference}")

In [20]:
### Baseline: convert the text to TF-IDF features, then classify with multinomial Naive Bayes
### Both steps live in one Pipeline, so fit/predict accept the raw sentences directly

first_model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

first_model.fit(train_sentences, train_labels)

In [21]:
### Predict on the validation set and compute the baseline's metrics
### baseline_result is the reference that every later model is compared against

baseline_pred = first_model.predict(val_sentences)

baseline_result = performance_metrics(y_true = val_labels, y_pred = baseline_pred)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

Model 2 - Simple dense model

In [22]:
### Inputs are raw strings, one string per sample (shape = (1,))
input_layer = layers.Input(shape = (1,), dtype = "string")

### Turn the input text into integer token ids
x = text_vectorizer(input_layer)

### Look up an embedding vector for every token id
x = embedding(x)

### Pool over the token positions so each sample becomes a single vector
### (the summary shows (None, 15, 128) going in and (None, 128) coming out)
x = layers.GlobalAveragePooling1D()(x)

### Single sigmoid unit for the binary output
output_layer = layers.Dense(1, activation = "sigmoid")(x)

### Construct the model
second_model = tf.keras.Model(input_layer, output_layer)

### Binary cross-entropy pairs with the sigmoid output; accuracy is tracked as the metric
second_model.compile(loss = "binary_crossentropy", optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

second_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_layer (Embedding  (None, 15, 128)           1280000   
 )                                                               
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                             

In [23]:
### Train the dense model for 5 epochs, validating on the held-out set after each epoch
### The saved output shows the callback logging under SAVE_DIR/experiment_name/<timestamp>
second_model_history = second_model.fit(train_sentences, train_labels, epochs = 5,
    validation_data = (val_sentences, val_labels), 
    callbacks = [create_tensorboard_callback(dir_name = SAVE_DIR, experiment_name = "simple_dense_model")])

Saving TensorBoard log files to: model_logs/simple_dense_model/20230922-225243
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
### Check validation results
### The saved output is a two-element list: the loss, then the accuracy metric set in compile()

second_model.evaluate(val_sentences, val_labels)



[0.4826074540615082, 0.787401556968689]

In [25]:
### Check the embedding weights after training
### The saved output shows a single variable of shape (10000, 128)

embedding.weights

[<tf.Variable 'embedding_layer/embeddings:0' shape=(10000, 128) dtype=float32, numpy=
 array([[ 0.01463184, -0.01991755,  0.04797755, ..., -0.06716058,
         -0.05348415, -0.01951163],
        [ 0.03035338, -0.01621791, -0.03138665, ..., -0.03135433,
          0.03413351,  0.04508649],
        [-0.0019709 ,  0.02034102,  0.007183  , ...,  0.00484237,
         -0.01946999, -0.01748167],
        ...,
        [ 0.04960455, -0.0209007 , -0.02314503, ..., -0.02859908,
          0.04579038,  0.03502407],
        [ 0.00269462, -0.06377188, -0.01078972, ..., -0.05997188,
         -0.05074562,  0.05960172],
        [-0.04528678, -0.0967545 ,  0.01804337, ..., -0.03746699,
         -0.06067383,  0.03093776]], dtype=float32)>]

In [26]:
### Another way to read the embedding weights: fetch the layer from the model by name
### get_weights() returns a list, so [0] selects the embedding matrix itself

embed_weights = second_model.get_layer("embedding_layer").get_weights()[0]

embed_weights.shape

(10000, 128)

In [27]:
# !tensorboard dev upload --logdir ./model_logs \
#   --name "First deep model on text data" \
#   --description "Trying a dense model with an embedding layer" \
#   --one_shot

In [28]:
# !tensorboard dev delete --experiment_id EXPERIMENT_ID_TO_DELETE

In [47]:
### Predicted probabilities from the sigmoid output, one per validation sentence
second_model_pred_prob = second_model.predict(val_sentences)

### Round each probability to 0. or 1., then squeeze out the size-1 axis
### so the predictions form a one-dimensional float tensor
second_model_pred = tf.squeeze(tf.round(second_model_pred_prob))

second_model_result = performance_metrics(y_true = val_labels, y_pred = second_model_pred)

second_model_result



{'accuracy': 78.74015748031496,
 'precision': 0.7937136229340627,
 'recall': 0.7874015748031497,
 'f1': 0.7839588199365206}

In [46]:
### Compare the dense model's metrics with the Naive Bayes baseline (negative difference = worse than baseline)
compare_baseline_with_new_result(baseline_result = baseline_result, new_result = second_model_result)

Baseline accuracy: 79.27, New accuracy: 78.74, Difference: -0.5249343832020941
Baseline precision: 0.81, New precision: 0.79, Difference: -0.01742537748725459
Baseline recall: 0.79, New recall: 0.79, Difference: -0.005249343832020914
Baseline f1: 0.79, New f1: 0.78, Difference: -0.0022601558684343104


Model 3 - LSTM

In [31]:
### A separate embedding layer for this model, so it does not share weights with the dense model's embedding
third_model_embedding = layers.Embedding(input_dim = max_vocab_length, output_dim = 128,
    embeddings_initializer = "uniform", input_length = max_length, name = "third_embedding_layer")

input_layer = layers.Input(shape = (1,), dtype = "string")

x = text_vectorizer(input_layer)
x = third_model_embedding(x)
### Shape after the embedding, per the summary: (None, 15, 128)
x = layers.LSTM(64)(x)
### Shape after the LSTM, per the summary: (None, 64)

output_layer = layers.Dense(1, activation = "sigmoid")(x)

### NOTE(review): the saved summary is titled "third_model_LSTM", so that output predates the current name - re-run to refresh
third_model = tf.keras.Model(input_layer, output_layer, name = "third_model_lstm")

third_model.compile(loss = "binary_crossentropy", optimizer = tf.keras.optimizers.Adam(),
    metrics = ["accuracy"])

third_model.summary()

Model: "third_model_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 third_embedding_layer (Emb  (None, 15, 128)           1280000   
 edding)                                                         
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 

In [32]:
### Train the LSTM model for 5 epochs, validating on the held-out set after each epoch
### NOTE(review): the saved output logs to model_logs/LSTM/..., so it predates the current "lstm" experiment name - re-run to refresh
third_model_history = third_model.fit(train_sentences, train_labels, epochs = 5, 
    validation_data = (val_sentences, val_labels), callbacks = [create_tensorboard_callback(SAVE_DIR, "lstm")])

Saving TensorBoard log files to: model_logs/LSTM/20230922-225309
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
# !tensorboard dev upload --logdir ./model_logs \
#   --name "First deep model on text data" \
#   --description "Trying a dense model with an embedding layer" \
#   --one_shot

In [39]:
### Predicted probabilities, rounded to 0./1. and squeezed to one dimension, then scored
third_model_pred_prob = third_model.predict(val_sentences)
third_model_pred = tf.squeeze(tf.round(third_model_pred_prob))

third_model_result = performance_metrics(y_true = val_labels, y_pred = third_model_pred)
third_model_result



{'accuracy': 75.98425196850394,
 'precision': 0.7598710707718088,
 'recall': 0.7598425196850394,
 'f1': 0.758578322737536}

In [48]:
### Compare the LSTM model's metrics with the Naive Bayes baseline (negative difference = worse than baseline)
compare_baseline_with_new_result(baseline_result, third_model_result)

Baseline accuracy: 79.27, New accuracy: 75.98, Difference: -3.2808398950131163
Baseline precision: 0.81, New precision: 0.76, Difference: -0.05126792964950844
Baseline recall: 0.79, New recall: 0.76, Difference: -0.03280839895013121
Baseline f1: 0.79, New f1: 0.76, Difference: -0.02764065306741892


Model 4 - Bidirectional RNN

In [50]:
### A separate embedding layer for this model, so it does not share weights with the earlier models' embeddings
forth_model_embedding = layers.Embedding(input_dim = max_vocab_length, output_dim = 128,
    embeddings_initializer = "uniform", input_length = max_length, name = "forth_embedding")

input_layer = layers.Input(shape = (1,), dtype = "string")

x = text_vectorizer(input_layer)
x = forth_model_embedding(x)
### Wrap a 64-unit LSTM in Bidirectional; the summary shows an output of (None, 128), twice the LSTM's 64 units
x = layers.Bidirectional(layers.LSTM(64))(x)

output_layer = layers.Dense(1, activation = "sigmoid")(x)

forth_model = tf.keras.Model(input_layer, output_layer, name = "forth_model_bidirectional_rnn")

forth_model.compile(loss = "binary_crossentropy", optimizer = tf.keras.optimizers.Adam(),
    metrics = ["accuracy"])

forth_model.summary()

Model: "forth_model_bidirectional_rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 forth_embedding (Embedding  (None, 15, 128)           1280000   
 )                                                               
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                     

In [51]:
### Train the bidirectional model for 5 epochs, validating on the held-out set after each epoch
forth_model_history = forth_model.fit(train_sentences, train_labels,
    epochs = 5, validation_data = (val_sentences, val_labels), 
    callbacks = [create_tensorboard_callback(SAVE_DIR, "bidirectional_rnn")])

Saving TensorBoard log files to: model_logs/bidirectional_rnn/20230922-231811
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
### Predicted probabilities, rounded to 0./1. and squeezed to one dimension, then scored
### NOTE: this variable is named forth_model_results (plural), unlike second_model_result / third_model_result
forth_model_pred_prob = forth_model.predict(val_sentences)
forth_model_pred = tf.squeeze(tf.round(forth_model_pred_prob))

forth_model_results = performance_metrics(val_labels, forth_model_pred)
forth_model_results



{'accuracy': 74.93438320209974,
 'precision': 0.7490502271995707,
 'recall': 0.7493438320209974,
 'f1': 0.7482434768818874}

In [None]:
### Compare the bidirectional model's metrics with the Naive Bayes baseline (negative difference = worse than baseline)
### The helper and both result variables are referenced by the names they were given earlier in this notebook
compare_baseline_with_new_result(baseline_result, forth_model_results)