## Natural Language Processing with TensorFlow

In [89]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from helper_function import performance_metrics, compare_baseline_with_new_result, create_tensorboard_callback
from tensorflow.keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import tensorflow_hub as hub
import numpy as np

### 1. Data preparation

In [2]:
### Read the training and test CSV files from the local data folder

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/nlp/train.csv", "data/nlp/test.csv")
)

In [3]:
### Shuffle every training row (frac=1 keeps all rows); the fixed seed makes the order repeatable

train_df_shuffled = train_df.sample(
    frac=1,
    random_state=42,
)

train_df_shuffled.head(5)

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
### Report the number of rows in each split and in total

print(
    f"Total training samples: {len(train_df)}",
    f"Total test samples: {len(test_df)}",
    f"Total samples: {len(train_df) + len(test_df)}",
    sep="\n",
)

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [5]:
### Visualize five consecutive rows of the shuffled training frame, starting at a random offset

### The upper bound comes from the frame that is actually sliced and is clamped at 0,
### so randint never receives an empty range (ValueError) when fewer than five rows exist
random_index = random.randint(0, max(0, len(train_df_shuffled) - 5))

### index = False / name = None yield plain (text, target) tuples, so nothing is unpacked
### into `_`, which IPython uses for the previous cell output
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]

for text, target in sample_rows.itertuples(index = False, name = None):
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 0 (not real disaster)
Text:
Plane Panic What kind of douchebag. Bubble Gum

---

Target: 1 (real disaster)
Text:
@kunalkapoor Photo of the Day: Storm
Chaser
http://t.co/4WJy7seHmw
#photography #pod

---

Target: 1 (real disaster)
Text:
@KapoKekito on northgate by the taco truck that's fire.

---

Target: 1 (real disaster)
Text:
DLH issues Hazardous Weather Outlook (HWO) http://t.co/WOzuBXRi2p

---

Target: 1 (real disaster)
Text:
#reuters Twelve feared killed in Pakistani air ambulance helicopter crash http://t.co/ShzPyIQok5

---



In [6]:
### Hold out 10% of the shuffled training rows as a validation set

train_sentence, val_sentence, train_label, val_label = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [7]:
### Sanity check: sentences and labels must have matching lengths in each split

print(
    f"Length of train sentence: {len(train_sentence)}, length of train label: {len(train_label)}",
    f"Length of val sentence: {len(val_sentence)}, length of val label: {len(val_label)}",
    sep="\n",
)

Length of train sentence: 6851, length of train label: 6851
Length of val sentence: 762, length of val label: 762


In [8]:
# Peek at the first 10 training sentences together with their labels

(train_sentence[:10], train_label[:10])

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

### 2. Converting text into numbers
Tokenization - word-level tokenization, character-level tokenization, sub-word tokenization <br>
Embeddings - own/custom embedding, pre-trained embedding

In [9]:
### Example TextVectorization layer with every option written out explicitly

text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
)

In [10]:
### Average number of whitespace-separated tokens per training sentence (rounded)

print(f"The average of tokens is {round(sum(map(len, map(str.split, train_sentence))) / len(train_sentence))}")

The average of tokens is 15


In [11]:
### Text vectorizer configured with custom limits:
###   max_vocab_length - maximum number of words kept in the vocabulary
###   max_length       - length every output sequence is padded / truncated to

max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [12]:
### Build the vectorizer's vocabulary from the training sentences only

text_vectorizer.adapt(train_sentence)

In [13]:
### Tokenize a hand-written sentence and display the resulting integer sequence

sample_sentence = "There's a flood in my street!"

text_vectorizer(
    [sample_sentence]
)

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [14]:
### Pick one training sentence at random, show it, then show its tokenized form

random_sentence = random.choice(train_sentence)

print(f"Original text:\n{random_sentence}      \n\nVectorized version:")

text_vectorizer(
    [random_sentence]
)

Original text:
Drones Under Fire: Officials Offer $75000 Reward Leading To Pilots Who Flew Over Wildfire http://t.co/d2vEppeh8S #photography #arts      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[2487,  204,   42,  503, 2367,    1, 2743, 1508,    5, 2021,   65,
        2996,   60,  146,    1]], dtype=int64)>

In [15]:
### Inspect the learned vocabulary: its size plus the first and last five entries

words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]

print(
    f"Number of words in vocab: {len(words_in_vocab)}",
    f"Top 5 most common words: {top_5_words}",
    f"Bottom 5 least common words: {bottom_5_words}",
    sep="\n",
)

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### 3. Creating embedding layer

In [16]:
### Trainable embedding layer: one 128-dimensional vector per vocabulary index

embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_layer",
)

In [17]:
### Pick a random training sentence, tokenize it and pass the tokens through the embedding layer

random_sentence = random.choice(train_sentence)

print(f"Original text:\n{random_sentence}      \n\nEmbedded version:")

sample_tokens = text_vectorizer([random_sentence])
sample_embed = embedding(sample_tokens)
sample_embed

Original text:
Kai Forbath just demolished a weather station set up on a drill field with a missed field goal. Thing just exploded into metal bits.      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00811068,  0.03491307, -0.02548101, ..., -0.03320241,
          0.02853728,  0.02550546],
        [-0.00811068,  0.03491307, -0.02548101, ..., -0.03320241,
          0.02853728,  0.02550546],
        [-0.00519258,  0.03926153,  0.0130669 , ..., -0.01611619,
         -0.00551615, -0.01114497],
        ...,
        [-0.01655617,  0.01649188,  0.04291462, ..., -0.01350659,
          0.03089793,  0.04934624],
        [ 0.03377423, -0.01212207, -0.03183881, ...,  0.01489768,
         -0.04352232, -0.02405475],
        [-0.03596265, -0.00616892,  0.02295513, ...,  0.01972618,
         -0.04185466, -0.00833704]]], dtype=float32)>

In [83]:
### Embedding vector of the first token of the first (only) sentence in the batch

sample_embed[0, 0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.00811068,  0.03491307, -0.02548101,  0.04781481,  0.04361737,
        0.03199318,  0.01940601, -0.0483266 , -0.005664  , -0.03757665,
       -0.03235463,  0.04903879, -0.02028735,  0.02265917, -0.04359738,
       -0.02064624, -0.00183594,  0.03911011, -0.00225415, -0.03699765,
        0.02796933,  0.00077952,  0.03411959,  0.04131328, -0.01507081,
       -0.02272201,  0.03807184,  0.01373016, -0.03292575,  0.00670286,
        0.02325675, -0.03902855,  0.02357253, -0.03854655, -0.01321665,
        0.03578268, -0.04914347,  0.0247624 , -0.02353262, -0.02438924,
        0.0293134 , -0.04433916,  0.0002748 , -0.00192436, -0.03667526,
       -0.02790846, -0.01435464, -0.01166171, -0.00105019,  0.00022941,
        0.01524897, -0.00659596,  0.00104121, -0.03893106,  0.01025798,
       -0.04031094,  0.03773138, -0.01118631,  0.03802359,  0.01509837,
       -0.0435953 ,  0.0068249 ,  0.00203154, -0.01919363,  0.02477253,
        0.012711

### 4. Model building

Model 1 - Naive Bayes (baseline)

In [19]:
### Root directory passed to create_tensorboard_callback for every experiment's logs

saved_dir_loc = 'model_log'

In [20]:
### Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier

first_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

first_model.fit(train_sentence, train_label)

In [21]:
### Score the baseline on the validation split

baseline_pred = first_model.predict(val_sentence)

baseline_result = performance_metrics(
    y_true=val_label,
    y_pred=baseline_pred,
)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

Model 2 - Simple dense model

In [22]:
### Model input: a batch of raw strings, one string per example
input_layer = layers.Input(shape=(1,), dtype="string")

### Raw text -> integer token ids
x = text_vectorizer(input_layer)

### Token ids -> learned embedding vectors
x = embedding(x)

### Average over the token axis so each example becomes one fixed-size vector
x = layers.GlobalAveragePooling1D()(x)

### Single sigmoid unit for the binary target
output_layer = layers.Dense(1, activation="sigmoid")(x)

### Assemble and compile the model
second_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

second_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

second_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_layer (Embedding  (None, 15, 128)           1280000   
 )                                                               
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                             

In [23]:
### Train the dense model for 5 epochs, logging to TensorBoard under "simple_dense_model"
second_model_history = second_model.fit(
    x=train_sentence,
    y=train_label,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[create_tensorboard_callback(saved_dir_loc, "simple_dense_model")],
)

Saving TensorBoard log files to: model_log/simple_dense_model/20230924-025729
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Loss and accuracy of the dense model on the validation split

second_model.evaluate(x=val_sentence, y=val_label)



[0.47811052203178406, 0.778215229511261]

In [26]:
### Inspect the embedding matrix through the layer object itself

embedding.weights

[<tf.Variable 'embedding_layer/embeddings:0' shape=(10000, 128) dtype=float32, numpy=
 array([[-3.7057016e-02,  4.8624083e-02,  1.4477805e-02, ...,
          3.3055462e-02, -2.1986498e-02, -4.1537173e-02],
        [-6.5777977e-03,  3.9635230e-02, -2.7613837e-02, ...,
         -3.4749243e-02,  3.0362485e-02,  2.2442650e-02],
        [ 3.9865156e-03,  6.0793847e-02,  3.5185479e-02, ...,
         -6.2198386e-02,  4.2470168e-02, -2.3823939e-02],
        ...,
        [ 4.7637112e-03,  3.2985210e-04, -1.1808313e-02, ...,
         -4.2228986e-02,  3.0479696e-02,  3.5024334e-02],
        [ 8.9700632e-03,  5.1463611e-02,  6.8334718e-03, ...,
         -4.5752746e-05,  4.5254495e-02, -1.4551225e-03],
        [ 3.7684176e-02,  8.7184988e-02, -6.3494042e-02, ...,
         -1.6940210e-02,  7.0671938e-02, -3.0954029e-02]], dtype=float32)>]

In [27]:
### Alternative: look the layer up by name on the model and read its weight array

embedding_layer_in_model = second_model.get_layer(name="embedding_layer")
embed_weights = embedding_layer_in_model.get_weights()[0]

embed_weights.shape

(10000, 128)

In [None]:
# Optional (left disabled): upload the TensorBoard logs under ./model_log so the
# training curves can be shared.
# NOTE(review): TensorBoard.dev uploads have reportedly been discontinued - confirm the
# service is still available, or view the logs locally with `tensorboard --logdir ./model_log`.
# !tensorboard dev upload --logdir ./model_log \
#   --name "First deep model on text data" \
#   --description "Trying a dense model with an embedding layer" \
#   --one_shot

In [None]:
# Optional (left disabled): delete a previously uploaded experiment; replace the
# placeholder below with the real experiment id before running.
# !tensorboard dev delete --experiment_id EXPERIMENT_ID_TO_DELETE

In [29]:
### Predicted probabilities for every validation sentence
second_model_pred_prob = second_model.predict(val_sentence)

### Round the probabilities to hard 0/1 labels and squeeze out the size-1 dimensions
second_model_pred = tf.squeeze(
    tf.round(second_model_pred_prob)
)

second_model_result = performance_metrics(val_label, second_model_pred)

second_model_result



{'accuracy': 77.82152230971128,
 'precision': 0.7814103276314137,
 'recall': 0.7782152230971129,
 'f1': 0.7756075024838144}

In [30]:
### Metric-by-metric difference between the Naive Bayes baseline and the dense model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=second_model_result,
)

Baseline accuracy: 79.27, New accuracy: 77.82, Difference: -1.443569553805773
Baseline precision: 0.81, New precision: 0.78, Difference: -0.029728672789903543
Baseline recall: 0.79, New recall: 0.78, Difference: -0.01443569553805768
Baseline f1: 0.79, New f1: 0.78, Difference: -0.010611473321140541


Model 3 - LSTM

In [31]:
### Fresh embedding layer so the LSTM model does not reuse weights trained by another model
third_model_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="third_embedding_layer",
)

input_layer = layers.Input(shape=(1,), dtype="string")

### Raw text -> token ids -> embeddings
x = text_vectorizer(input_layer)
x = third_model_embedding(x)

### LSTM with 64 units; only its final output is passed on
x = layers.LSTM(64)(x)

output_layer = layers.Dense(1, activation="sigmoid")(x)

third_model = tf.keras.Model(
    inputs=input_layer,
    outputs=output_layer,
    name="third_model_lstm",
)

third_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

third_model.summary()

Model: "third_model_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 third_embedding_layer (Emb  (None, 15, 128)           1280000   
 edding)                                                         
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 

In [34]:
### Train the LSTM model for 5 epochs, logging to TensorBoard under "lstm"
third_model_history = third_model.fit(
    x=train_sentence,
    y=train_label,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[
        create_tensorboard_callback(dir_name=saved_dir_loc, experiment_name="lstm"),
    ],
)

Saving TensorBoard log files to: model_log/lstm/20230924-030216
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Optional (left disabled): upload the TensorBoard logs. The log directory is ./model_log,
# matching saved_dir_loc = "model_log" used by every create_tensorboard_callback call.
# NOTE(review): TensorBoard.dev uploads have reportedly been discontinued - confirm the
# service is still available, or view the logs locally with `tensorboard --logdir ./model_log`.
# !tensorboard dev upload --logdir ./model_log \
#   --name "First deep model on text data" \
#   --description "Trying a dense model with an embedding layer" \
#   --one_shot

In [35]:
### Validation probabilities -> rounded 0/1 labels with size-1 dimensions squeezed out
third_model_pred_prob = third_model.predict(val_sentence)
third_model_pred = tf.squeeze(
    tf.round(third_model_pred_prob)
)

third_model_result = performance_metrics(
    y_true=val_label,
    y_pred=third_model_pred,
)
third_model_result



{'accuracy': 77.03412073490814,
 'precision': 0.7729377683268125,
 'recall': 0.7703412073490814,
 'f1': 0.7677842762403819}

In [36]:
### Metric-by-metric difference between the Naive Bayes baseline and the LSTM model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=third_model_result,
)

Baseline accuracy: 79.27, New accuracy: 77.03, Difference: -2.230971128608914
Baseline precision: 0.81, New precision: 0.77, Difference: -0.03820123209450477
Baseline recall: 0.79, New recall: 0.77, Difference: -0.022309711286089162
Baseline f1: 0.79, New f1: 0.77, Difference: -0.018434699564573


Model 4 - Bidirectional RNN

In [37]:
### Dedicated embedding layer for the bidirectional model
forth_model_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="forth_embedding",
)

input_layer = layers.Input(shape=(1,), dtype="string")

### Raw text -> token ids -> embeddings
x = text_vectorizer(input_layer)
x = forth_model_embedding(x)

### LSTM with 64 units wrapped so the sequence is read in both directions
x = layers.Bidirectional(layers.LSTM(64))(x)

output_layer = layers.Dense(1, activation="sigmoid")(x)

forth_model = tf.keras.Model(
    inputs=input_layer,
    outputs=output_layer,
    name="forth_model_bidirectional_rnn",
)

forth_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

forth_model.summary()

Model: "forth_model_bidirectional_rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 forth_embedding (Embedding  (None, 15, 128)           1280000   
 )                                                               
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                     

In [39]:
### Train the bidirectional model for 5 epochs, logging to TensorBoard under "bidirectional_rnn"
forth_model_history = forth_model.fit(
    x=train_sentence,
    y=train_label,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[
        create_tensorboard_callback(dir_name=saved_dir_loc, experiment_name="bidirectional_rnn"),
    ],
)

Saving TensorBoard log files to: model_log/bidirectional_rnn/20230924-030319
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
### Validation probabilities -> rounded 0/1 labels with size-1 dimensions squeezed out
forth_model_pred_prob = forth_model.predict(val_sentence)
forth_model_pred = tf.squeeze(
    tf.round(forth_model_pred_prob)
)

forth_model_result = performance_metrics(
    y_true=val_label,
    y_pred=forth_model_pred,
)
forth_model_result



{'accuracy': 76.77165354330708,
 'precision': 0.7691343474319641,
 'recall': 0.7677165354330708,
 'f1': 0.7656749923220023}

In [44]:
### Metric-by-metric difference between the Naive Bayes baseline and the bidirectional model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=forth_model_result,
)

Baseline accuracy: 79.27, New accuracy: 76.77, Difference: -2.4934383202099752
Baseline precision: 0.81, New precision: 0.77, Difference: -0.042004652989353186
Baseline recall: 0.79, New recall: 0.77, Difference: -0.02493438320209973
Baseline f1: 0.79, New f1: 0.77, Difference: -0.02054398348295261


Model 5 - One-dimensional CNN

In [45]:
### Dedicated embedding layer for the 1D convolutional model
fifth_model_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="fifth_embedding",
)

input_layer = layers.Input(shape=(1,), dtype="string")

### Raw text -> token ids -> embeddings
x = text_vectorizer(input_layer)
x = fifth_model_embedding(x)

### 32 filters sliding over windows of 5 tokens, then keep each filter's maximum response
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)

output_layer = layers.Dense(1, activation="sigmoid")(x)

fifth_model = tf.keras.Model(
    inputs=input_layer,
    outputs=output_layer,
    name="fifth_model_cnn_1d",
)

fifth_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

fifth_model.summary()

Model: "fifth_model_cnn_1d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 fifth_embedding (Embedding  (None, 15, 128)           1280000   
 )                                                               
                                                                 
 conv1d (Conv1D)             (None, 11, 32)            20512     
                                                                 
 global_max_pooling1d (Glob  (None, 32)                0         
 alMaxPooling1D)                                                 
                                                

In [46]:
### Train the Conv1D model for 5 epochs, logging to TensorBoard under "cnn_1d"
fifth_model_history = fifth_model.fit(
    x=train_sentence,
    y=train_label,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[
        create_tensorboard_callback(dir_name=saved_dir_loc, experiment_name="cnn_1d"),
    ],
)

Saving TensorBoard log files to: model_log/cnn_1d/20230924-030406
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
### Validation probabilities -> rounded 0/1 labels with size-1 dimensions squeezed out
fifth_model_pred_prob = fifth_model.predict(val_sentence)
fifth_model_pred = tf.squeeze(
    tf.round(fifth_model_pred_prob)
)

fifth_model_result = performance_metrics(
    y_true=val_label,
    y_pred=fifth_model_pred,
)
fifth_model_result



{'accuracy': 76.37795275590551,
 'precision': 0.7638046232237288,
 'recall': 0.7637795275590551,
 'f1': 0.7625889751874003}

In [49]:
### Metric-by-metric difference between the Naive Bayes baseline and the Conv1D model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=fifth_model_result,
)

Baseline accuracy: 79.27, New accuracy: 76.38, Difference: -2.887139107611546
Baseline precision: 0.81, New precision: 0.76, Difference: -0.047334377197588484
Baseline recall: 0.79, New recall: 0.76, Difference: -0.02887139107611547
Baseline f1: 0.79, New f1: 0.76, Difference: -0.023630000617554603


Model 6 - Pretrained sentence encoder <br><br>
The difference between the custom embedding layer and the Universal Sentence Encoder is that, rather than creating a word-level embedding, the Universal Sentence Encoder creates a <b>whole sentence-level embedding</b>. The custom embedding layer outputs a 128-dimensional vector for each word, while the Universal Sentence Encoder outputs a 512-dimensional vector for each sentence.

In [74]:
### Load the Universal Sentence Encoder from TensorFlow Hub and embed the sample sentence

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embed_sample = embed([sample_sentence])

### Show only the first 50 values of the sentence embedding
print(embed_sample[0, :50])

tf.Tensor(
[-0.01157023  0.02485909  0.02878049 -0.012715    0.0397154   0.08827759
  0.02680983  0.05589837 -0.01068731 -0.00597293  0.00639322 -0.01819516
  0.00030816  0.09105889  0.05874643 -0.03180629  0.01512472 -0.05162928
  0.00991366 -0.06865345 -0.04209306  0.02678979  0.03011009  0.00321065
 -0.00337969 -0.04787359  0.0226672  -0.00985925 -0.04063614 -0.01292091
 -0.04666385  0.05630299 -0.03949255  0.00517684  0.02495827 -0.0701444
  0.0287151   0.04947682 -0.00633976 -0.08960192  0.02807119 -0.00808363
 -0.01360601  0.05998649 -0.10361788 -0.05195374  0.00232955 -0.0233253
 -0.03758107  0.0332773 ], shape=(50,), dtype=float32)


In [75]:
### Shape of the embedding produced for the single sample sentence

first_sentence_embedding = embed_sample[0]
first_sentence_embedding.shape

TensorShape([512])

In [76]:
### Wrap the pretrained encoder as a Keras layer; trainable=False keeps its weights frozen
sentence_encoder_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    input_shape=[],
    dtype=tf.string,
    trainable=False,
    name="USE",
)

### Frozen encoder followed by a small trainable classification head
sixth_model = tf.keras.Sequential(
    [
        sentence_encoder_layer,
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ],
    name="sixth_model_use",
)

sixth_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

sixth_model.summary()

Model: "sixth_model_use"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_6 (Dense)             (None, 64)                32832     
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


In [77]:
### Train the sentence-encoder model for 5 epochs, logging under "tf_hub_sentence_encoder"
sixth_model_history = sixth_model.fit(
    x=train_sentence,
    y=train_label,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[
        create_tensorboard_callback(dir_name=saved_dir_loc, experiment_name="tf_hub_sentence_encoder"),
    ],
)

Saving TensorBoard log files to: model_log/tf_hub_sentence_encoder/20230924-032159
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
### Validation probabilities -> rounded 0/1 labels with size-1 dimensions squeezed out
sixth_model_pred_prob = sixth_model.predict(val_sentence)
sixth_model_pred = tf.squeeze(
    tf.round(sixth_model_pred_prob)
)

sixth_model_result = performance_metrics(
    y_true=val_label,
    y_pred=sixth_model_pred,
)
sixth_model_result



{'accuracy': 80.83989501312337,
 'precision': 0.8124533883813974,
 'recall': 0.8083989501312336,
 'f1': 0.8062063003139296}

In [79]:
### Metric-by-metric difference between the Naive Bayes baseline and the sentence-encoder model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=sixth_model_result,
)

Baseline accuracy: 79.27, New accuracy: 80.84, Difference: 1.5748031496063106
Baseline precision: 0.81, New precision: 0.81, Difference: 0.001314387960080099
Baseline recall: 0.79, New recall: 0.81, Difference: 0.015748031496063075
Baseline f1: 0.79, New f1: 0.81, Difference: 0.01998732450897467


Model 7 - Pretrained sentence encoder with 10% data

In [91]:
### Carve a 10% subset out of the training split only, so no validation sentence ends up in it
(
    train_sentence_90_percent,
    train_sentence_10_percent,
    train_label_90_percent,
    train_label_10_percent,
) = train_test_split(
    np.array(train_sentence),
    train_label,
    test_size=0.1,
    random_state=42,
)

In [96]:
# Size of the 10% subset compared with the full training split

print(
    f"Total training: {len(train_sentence)}",
    f"Length of 10% training: {len(train_sentence_10_percent)}",
    sep="\n",
)

# Class balance inside the 10% subset
label_count = pd.Series(train_label_10_percent).value_counts()

print(
    f"Total label '0': {label_count[0]}",
    f"Total label '1': {label_count[1]}",
    sep="\n",
)

Total training: 6851
Length of 10% training: 686
Total label '0': 415
Total label '1': 271


In [98]:
### Seventh model: a clone of the sixth model's architecture, compiled the same way
### (the printed summary still shows the name "sixth_model_use")

seventh_model = tf.keras.models.clone_model(sixth_model)

seventh_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

seventh_model.summary()

Model: "sixth_model_use"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_6 (Dense)             (None, 64)                32832     
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


In [99]:
### Train the cloned model on the 10% subset only; validation still uses the full validation split
seventh_model_history = seventh_model.fit(
    x=train_sentence_10_percent,
    y=train_label_10_percent,
    epochs=5,
    validation_data=(val_sentence, val_label),
    callbacks=[
        create_tensorboard_callback(
            dir_name=saved_dir_loc,
            experiment_name="10_percent_tf_hub_sentence_encoder",
        ),
    ],
)

Saving TensorBoard log files to: model_log/10_percent_tf_hub_sentence_encoder/20230924-034041
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [100]:
### Validation probabilities -> rounded 0/1 labels with size-1 dimensions squeezed out
seventh_model_pred_prob = seventh_model.predict(val_sentence)
seventh_model_pred = tf.squeeze(
    tf.round(seventh_model_pred_prob)
)

seventh_model_result = performance_metrics(
    y_true=val_label,
    y_pred=seventh_model_pred,
)
seventh_model_result



{'accuracy': 77.82152230971128,
 'precision': 0.7838273596953228,
 'recall': 0.7782152230971129,
 'f1': 0.7747045976528383}

In [103]:
### Metric-by-metric difference between the Naive Bayes baseline and the 10%-data model
compare_baseline_with_new_result(
    baseline_result=baseline_result,
    new_result=seventh_model_result,
)

Baseline accuracy: 79.27, New accuracy: 77.82, Difference: -1.443569553805773
Baseline precision: 0.81, New precision: 0.78, Difference: -0.027311640725994457
Baseline recall: 0.79, New recall: 0.78, Difference: -0.01443569553805768
Baseline f1: 0.79, New f1: 0.77, Difference: -0.011514378152116644
