Copyright (c) 2023, Troy Phat Tran (Mr. Troy).

Question:

Build and train a binary classifier for the language classification dataset. The dataset is typically a JSON array<br>
of 500 JSON objects. Each object has 3 keys: sentence, language_code, and is_english.<br>
We want our model to be able to determine whether a piece of text is "English or not".

Your task is to fill in the missing parts of the code block (where commented as "ADD CODE HERE").

Note: the dataset is imbalanced as there are more non-English sentences than English ones. To keep things simple, <br>
you don't need to handle data imbalance in this coding challenge.

In [1]:
import json
import os
from urllib.request import urlretrieve

In [3]:
import numpy as np
from keras import Sequential
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import tensorflow as tf

In [4]:
def nlp_binary_model():
    """Train a binary "English or not" sentence classifier and return it.

    Steps:
      1. Download the language-classification JSON file unless a local copy
         already exists in the working directory.
      2. Randomly split the samples 80/20 into training and validation sets
         (the split is not seeded, so it differs between runs).
      3. Fit the tokenizer on the training sentences only, then convert every
         sentence to a padded/truncated integer sequence.
      4. Train an Embedding -> Bidirectional LSTM -> Dense network with
         early stopping on the validation loss.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    # Download the dataset (skipped when the file is already on disk)
    json_file = 'language-classification.json'
    if not os.path.exists(json_file):
        url = 'https://trientran.github.io/tf-practice-exams/language-classification.json'
        urlretrieve(url=url, filename=json_file)

    # Parse the JSON file
    with open(file=json_file, mode='r', encoding='utf-8') as f:
        datastore = json.load(f)

    # Extract texts and labels from JSON data
    # (replace the keys with the text / label fields of the real test JSON file)
    texts = [item['sentence'] for item in datastore]
    labels = np.array([item['is_english'] for item in datastore])

    # Predefined constants
    max_length = 25
    trunc_type = 'pre'  # Can be replaced with 'post'
    vocab_size = 500
    padding_type = 'pre'  # Can be replaced with 'post'
    embedding_dim = 32
    oov_tok = "<OOV>"

    # Split the dataset into training and validation sets
    num_samples = len(texts)
    num_train_samples = int(0.8 * num_samples)
    indices = np.random.permutation(num_samples)
    train_indices = indices[:num_train_samples]
    val_indices = indices[num_train_samples:]

    # Tokenize the texts. The vocabulary is learned from the training
    # sentences only, so nothing about the validation set leaks into the
    # word index; unseen validation words are mapped to the OOV token.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts([texts[i] for i in train_indices])
    sequences = tokenizer.texts_to_sequences(texts)

    # Pad the sequences
    padded_sequences = pad_sequences(
        sequences=sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type
    )
    padded_training_set = padded_sequences[train_indices]
    padded_validation_set = padded_sequences[val_indices]

    # Select the labels that belong to each split
    training_labels = labels[train_indices]
    validation_labels = labels[val_indices]

    # Define the model architecture
    model = Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop once val_loss has not improved for 5 epochs. restore_best_weights
    # rolls the model back to the best epoch when training is stopped early,
    # instead of keeping the weights from 5 epochs past the best one.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True
    )

    # Train the model
    model.fit(
        padded_training_set,
        training_labels,
        validation_data=(padded_validation_set, validation_labels),
        epochs=50,
        callbacks=[early_stopping]
    )
    return model

===============DO NOT EDIT THIS PART================================

In [5]:
if __name__ == '__main__':
    # Script entry point: train the classifier, write it to disk, then load
    # the written file back to confirm that it can be read again.

    # Run and save your model
    my_model = nlp_binary_model()
    filepath = "nlp_binary_model.h5"
    my_model.save(filepath)

    # Reload the saved model from the file that was just written
    saved_model = load_model(filepath)

    # Show the model architecture (layers, output shapes, parameter counts)
    saved_model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 32)            16000     
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 34225 (133.69 KB)
Trainable params: 34225 (133.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
