## Natural Language Processing with TensorFlow

In [97]:
from Library.helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

### 1. Visualizing the NLP dataset

In [98]:
import pandas as pd

# Read the train and test CSV files in one statement; the preview below shows
# the first rows of the training split.
train_df, test_df = (
    pd.read_csv("Data/nlp_getting_started/train.csv"),
    pd.read_csv("Data/nlp_getting_started/test.csv"),
)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Let's shuffle the training dataset.

In [99]:
# frac=1 samples 100% of the rows, i.e. returns the whole frame in shuffled order;
# random_state=42 fixes the seed so the shuffle is repeatable.
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


Check how many samples are labelled 0 (not disaster) and 1 (is disaster) in the training dataset.

In [100]:
# Count how many rows carry each label (0 = not disaster, 1 = is disaster).
train_df_shuffled["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

Since there are only two target values, we are dealing with <b>binary classification</b>.

Let's check how many total samples we have.

In [101]:
# Compute the sizes once so both percentages are taken against the same total.
n_train, n_test = len(train_df), len(test_df)
n_total = n_train + n_test

print(f"Total samples: {n_total}")
print(f"Total training samples: {n_train}, {round(n_train / n_total * 100)}%")
print(f"Total test samples: {n_test}, {round(n_test / n_total * 100)}%")

Total samples: 10876
Total training samples: 7613, 70%
Total test samples: 3263, 30%


It seems we have a decent amount of training and test data (70% vs 30%). Usually a split of 90/10 (90% training, 10% testing) or 80/20 is sufficient.

Let's visualize a few randomly chosen training examples to gain a better understanding of the dataset.

In [102]:
import random

# Pick a random start position such that a window of 5 consecutive rows always fits:
# randint is inclusive on both ends, so the largest possible start is len(train_df) - 5.
random_index = random.randint(0, len(train_df) - 5)

# Take the 5-row window positionally from the shuffled frame and print each tweet with its label.
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]
for text, target in sample_rows.itertuples(index=False):
    label = "(is disaster)" if target > 0 else "(not disaster)"
    print(f"Target: {target}", label)
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 (is disaster)
Text:
Traffic Collision - No Injury: I5 S at I5 S 43rd Ave offramp South Sac http://t.co/cT9ejXoLpu

---

Target: 1 (is disaster)
Text:
Eyewitness accounts of survivors of Hiroshima gleaned from a
number of oral history projects https://t.co/yRQGNbLKaC

---

Target: 1 (is disaster)
Text:
U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... http://t.co/SB5R7ShcCJ via @Change

---

Target: 0 (not disaster)
Text:
The chick I work with chews chewing gum so loud ?? feel to bang her

---

Target: 0 (not disaster)
Text:
@widda16 ... He's gone. You can relax. I thought the wife who wrecked her cake was a goner mind lol #whoops

---



### 2. Split training dataset into training and validation sets

We'll split the training dataset into 90% training and 10% validation.

In [103]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled samples for validation; the fixed seed keeps the split repeatable.
all_sentences = train_df_shuffled["text"].to_numpy()
all_labels = train_df_shuffled["target"].to_numpy()

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    random_state=42,
)

Let's check the lengths after splitting.

In [104]:
# Sentences and labels of each split should have matching lengths.
tuple(len(split) for split in (train_sentences, train_labels, val_sentences, val_labels))

(6851, 6851, 762, 762)

Let's view the first 10 training sentences and their labels.

In [105]:
# Peek at the first 10 training sentences alongside their labels.
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### 3. Converting the text into numbers (Tokenization and Embeddings)

Let's start with text vectorization (tokenization).

Find the average number of words per sentence in the training set.

In [146]:
# Mean number of whitespace-separated words per training sentence, rounded to an integer.
total_words = sum(len(sentence.split()) for sentence in train_sentences)
round(total_words / len(train_sentences))

15

Create a `TextVectorization` object and set some parameters.

In [147]:
import tensorflow as tf
# NOTE(review): this is the "experimental" import path; newer TensorFlow releases may expose
# the layer elsewhere (e.g. under tf.keras.layers) — confirm against the installed version.
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_vocab_length = 10000  # upper bound on the vocabulary size (passed as max_tokens)
max_length = 15  # tokens per output sequence; matches the rounded average sentence length (15)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",  # emit integer token indices
                                    output_sequence_length=max_length)  # fixed-length output per sentence

In [148]:
# Fit the text vectorizer on the training sentences only, so the vocabulary
# is built without looking at the validation text.

text_vectorizer.adapt(train_sentences)

In [149]:
# Create a sample sentence and tokenize it; the vectorizer expects a batch,
# hence the single-element list.

sample_sentence = "Test test, there is flood in penang!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[1246, 1246,   74,    9,  232,    4,    1,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [150]:
# Pick a random sentence from the training data and show it next to its tokenized form.

random_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([random_sentence])
print(f"Original text:\n{random_sentence}\n\nVectorized version:\n{vectorized_sentence}")

Original text:
Tube strike = absolute pandemonium

Vectorized version:
[[1938 1251 6283  502    0    0    0    0    0    0    0    0    0    0
     0]]


In [151]:
# Inspect the vocabulary the vectorizer built during adapt().

words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]  # first entries; in the output these start with '' and '[UNK]'
bottom_5_words = words_in_vocab[-5:]  # last entries of the vocabulary list

print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### 4. Creating Embedding by using an Embedding Layer

In [157]:
from tensorflow.keras import layers

tf.random.set_seed(42)  # seed TensorFlow's RNG before the layer is created
embedding = layers.Embedding(input_dim=max_vocab_length,  # number of rows = vocabulary size cap
                             output_dim=128,  # length of each token's embedding vector
                             embeddings_initializer="uniform",
                             input_length=max_length,  # sequence length produced by text_vectorizer
                             name="embedding_1") 

embedding

<keras.layers.embeddings.Embedding at 0x2239b24dd60>

Let's get a random sentence and test the embedding layer.

In [158]:
# Tokenize a random training sentence, then look up the embedding of every token.
random_sentence = random.choice(train_sentences)
sample_tokens = text_vectorizer([random_sentence])
sample_embed = embedding(sample_tokens)

print(f"Original text:\n{random_sentence}\n\nEmbedded version:\n{sample_embed}")

Original text:
Hi yall this poem is called is the one about the snowstorm when we meet in space and that one time it rained. Thx. Ur watching disney chann

Embedded version:
[[[-0.03455232 -0.04213167  0.00866807 ...  0.01231498  0.00271455
    0.04221163]
  [ 0.03883579 -0.04218531  0.0122561  ... -0.04528418  0.03804381
    0.02471515]
  [ 0.02271518 -0.03204429  0.00166398 ... -0.00896945 -0.02778459
   -0.04290668]
  ...
  [ 0.03493195  0.00647281 -0.00628376 ...  0.03327657  0.03445746
    0.02336425]
  [-0.01238489 -0.01569571  0.04614357 ...  0.00714378 -0.04799243
   -0.04700608]
  [ 0.02528647 -0.0374978   0.03416723 ... -0.02124472 -0.0327303
   -0.02983623]]]


Each token in the sentence gets turned into a length-128 feature vector. Let's check it out.

In [162]:
# Embedding vector of the first token of the first (only) sentence in the batch.
sample_embed[0, 0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.03455232, -0.04213167,  0.00866807, -0.01631409,  0.03413432,
        0.02733302,  0.04049318, -0.02776693, -0.0237524 ,  0.02595422,
        0.01845378,  0.00974258,  0.01647302,  0.00846023,  0.01980538,
       -0.03098391,  0.02313794,  0.01934112, -0.04288423,  0.02086648,
        0.01022495, -0.02217864,  0.03475136,  0.01083792, -0.00564416,
       -0.01541098, -0.00291831, -0.0042215 , -0.0456175 , -0.03705328,
        0.02297748,  0.04238257, -0.01124246,  0.01248912, -0.02637945,
        0.0205338 ,  0.03695256,  0.01771487,  0.03142862,  0.02634046,
       -0.0005298 , -0.03343501, -0.0301125 ,  0.02121196,  0.00588139,
       -0.02367045,  0.04463694,  0.02741059, -0.01686595, -0.00685756,
       -0.02562699,  0.0367169 , -0.04397279, -0.00650481, -0.00319433,
        0.02268673,  0.03502214, -0.03258608,  0.04395398, -0.03344826,
       -0.03235445, -0.03280754,  0.02930889, -0.0094143 , -0.04937797,
        0.003256

### 5. Modelling the training dataset

Let's build the following models:
* **Model 1**: Naive Bayes (baseline)
* **Model 2**: Feed-forward neural network (dense model)
* **Model 3**: LSTM model
* **Model 4**: GRU model
* **Model 5**: Bidirectional-LSTM model
* **Model 6**: 1D Convolutional Neural Network
* **Model 7**: TensorFlow Hub Pretrained Feature Extractor
* **Model 8**: model 7 with 10% of training data

Model 1 is the baseline that we'll expect other models to beat.

### Model 1: Naive Bayes (baseline)

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_1 = Pipeline([
                    ("tfidf", TfidfVectorizer()),  # turn raw text into TF-IDF features
                    ("clf", MultinomialNB())  # classify the TF-IDF features
])

# The pipeline takes the raw sentences directly; vectorization happens inside it.
model_1.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

Let's check the accuracy of model 1.

In [214]:
# score() returns the accuracy on the validation set as a fraction,
# hence the multiplication by 100 when printing.
baseline_score = model_1.score(val_sentences, val_labels)

print(f"The baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

The baseline model achieves an accuracy of: 79.27%


Let's make some predictions with the baseline model.

In [215]:
# Predict a class label (0 or 1) for every validation sentence and show the first 20.
baseline_preds = model_1.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Creating an evaluation function for model experiments (Accuracy, Precision, Recall, F1-score)

In [216]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, in the same order as `y_true`.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1" (in that order).
        Note the mixed scales: accuracy is a percentage (score * 100), whereas
        precision, recall and F1 are the weighted-average values as returned by
        `precision_recall_fscore_support`.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth element (support) is not needed for the summary.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

Let's evaluate model 1.

In [217]:
# Metrics of the baseline on the validation set; kept as the reference for later comparisons.
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

### Model 2: Feed-forward neural network (dense model)

In [236]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string") # each sample is a single raw string
x = text_vectorizer(inputs) # turn the input text into integer token indices
x = embedding(x) # look up an embedding vector for every token index
x = layers.GlobalAveragePooling1D()(x) # average over the token axis so each sample becomes one 128-long vector
outputs = layers.Dense(1, activation="sigmoid")(x) # single sigmoid unit, since the target is binary

# NOTE(review): this reuses the shared `embedding` layer object, so fitting model_2
# updates that layer's weights in place — relevant if `embedding` is reused elsewhere.
model_2 = tf.keras.Model(inputs, outputs, name="model_2_dense")

In [237]:
# Binary cross-entropy loss for the 0/1 target, Adam with its default settings,
# and accuracy tracked as the reported metric.
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [238]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model_2.summary()

Model: "model_2_dense"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_7 (TextVe (None, 15)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 15, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_5 ( (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [239]:
# Train on the raw sentences for 5 epochs (vectorization happens inside the model),
# evaluating on the validation split passed as validation_data.
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [240]:
# Evaluate on the validation set; the result is [loss, accuracy],
# matching the loss and metric given to compile().

model_2.evaluate(val_sentences, val_labels)



[0.5629136562347412, 0.7795275449752808]

In [241]:
# Inspect the weights of the shared embedding layer after model_2 has been trained.

embedding.weights

[<tf.Variable 'embedding_1/embeddings:0' shape=(10000, 128) dtype=float32, numpy=
 array([[-1.15883224e-01, -2.54726484e-02,  1.17380701e-01, ...,
          5.77725209e-02,  1.40486538e-01,  1.75039843e-01],
        [-1.88060123e-02, -4.11261916e-02,  2.04582456e-02, ...,
          4.24626730e-02,  1.13175273e-01,  1.14244565e-01],
        [-7.04165325e-02,  2.72392179e-04,  1.19271688e-01, ...,
          2.51390859e-02,  4.85049635e-02,  1.14741907e-01],
        ...,
        [-3.30144390e-02, -5.24929911e-03, -4.20972481e-02, ...,
          2.02876367e-02,  3.08806822e-03,  2.21579187e-02],
        [-7.13161975e-02, -1.02247156e-01,  1.43731639e-01, ...,
         -1.32509843e-01, -1.25343502e-01,  1.20486632e-01],
        [-2.71486223e-01, -2.88029164e-01,  2.99228698e-01, ...,
         -2.58708656e-01, -3.59207958e-01,  2.50938654e-01]], dtype=float32)>]

In [242]:
# Fetch the embedding matrix through the model by layer name and confirm its shape;
# get_weights() returns a list, of which the first entry is taken.

embed_weights = model_2.get_layer("embedding_1").get_weights()[0]

print(embed_weights.shape)

(10000, 128)


In [243]:
# Make predictions: one sigmoid output (a value between 0 and 1) per validation sentence.

model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

array([[0.3722633 ],
       [0.7826859 ],
       [0.99941486],
       [0.06555247],
       [0.01956049],
       [0.9655756 ],
       [0.9168829 ],
       [0.9992175 ],
       [0.99584955],
       [0.35966796]], dtype=float32)

In [244]:
# Round each probability to 0. or 1. to get a class label, then drop the
# size-1 axis so the predictions form a flat vector like the labels.

model_2_preds = tf.squeeze(tf.round(model_2_pred_probs)) # squeeze removes single dimensions
model_2_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [245]:
# Calculate model_2 metrics on the validation set with the same helper used for the baseline.

model_2_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_2_preds)
model_2_results

{'accuracy': 77.95275590551181,
 'precision': 0.7822644211580037,
 'recall': 0.7795275590551181,
 'f1': 0.7771404562571971}

In [246]:
# Is model 2 better than the baseline on each metric?
# Both dicts come from calculate_results, so their values line up position by position.
import numpy as np

model_2_scores = np.array(list(model_2_results.values()))
baseline_scores = np.array(list(baseline_results.values()))
model_2_scores > baseline_scores

array([False, False, False, False])

In [247]:
# Compare a new model's metrics against the baseline model's metrics.

def compare_baseline_to_new_results(baseline_results, new_model_results):
    """Print baseline value, new value and their difference for every baseline metric.

    Args:
        baseline_results: dict mapping metric name to the baseline's numeric value.
        new_model_results: dict with (at least) the same keys for the new model.

    Raises:
        KeyError: if `new_model_results` lacks a key present in `baseline_results`.
    """
    for metric_name in baseline_results:
        baseline_value = baseline_results[metric_name]
        new_value = new_model_results[metric_name]
        delta = new_value - baseline_value  # positive means the new model scores higher
        print(f"Baseline {metric_name}: {baseline_value:.2f}, New {metric_name}: {new_value:.2f}, Difference: {delta:.2f}")

# Print the metric-by-metric comparison of model 2 against the baseline.
compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_2_results)

Baseline accuracy: 79.27, New accuracy: 77.95, Difference: -1.31
Baseline precision: 0.81, New precision: 0.78, Difference: -0.03
Baseline recall: 0.79, New recall: 0.78, Difference: -0.01
Baseline f1: 0.79, New f1: 0.78, Difference: -0.01
