<a href="https://colab.research.google.com/github/firdaaacy/Tensorflow-Exercise/blob/main/NLP-TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================================================================
# PROBLEM A4
#
# Build and train a binary classifier for the IMDB review dataset.
# The classifier should have a final layer with 1 neuron activated by sigmoid.
# Do not use lambda layers in your model.
#
# The dataset used in this problem is originally published in http://ai.stanford.edu/~amaas/data/sentiment/
#
# Desired accuracy and validation_accuracy > 83%
# ===========================================================================================================

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _collect_reviews(dataset):
    """Materialise a supervised ``(text, label)`` dataset into two lists.

    Each text element is converted with ``.numpy()`` and decoded as UTF-8;
    each label element is converted with ``.numpy()``.

    Args:
        dataset: An iterable yielding ``(text, label)`` pairs.

    Returns:
        A ``(sentences, labels)`` tuple of parallel Python lists.
    """
    sentences = []
    labels = []
    for sentence, label in dataset:
        sentences.append(sentence.numpy().decode('utf8'))
        labels.append(label.numpy())
    return sentences, labels


def solution_A4():
    """Build and train a binary classifier for the IMDB review dataset.

    The reviews are loaded through ``tfds``, tokenized with a vocabulary of
    ``vocab_size`` words (out-of-vocabulary words map to ``oov_tok``), and
    padded/truncated to ``max_length`` tokens. The model is an embedding
    followed by global average pooling, a small L2-regularised dense layer
    and a single sigmoid output neuron, with dropout between the stages.

    Returns:
        The ``tf.keras.Sequential`` model after 10 epochs of training, with
        the ``test`` split used as validation data.
    """
    imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

    training_sentences, training_labels = _collect_reviews(imdb['train'])
    testing_sentences, testing_labels = _collect_reviews(imdb['test'])

    # Keras expects array-like labels rather than lists of scalars.
    training_labels = np.array(training_labels)
    testing_labels = np.array(testing_labels)

    vocab_size = 10000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    oov_tok = "<OOV>"

    # The vocabulary is learned from the training split only, so the test
    # split never influences the word index.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(training_sentences)

    training_sequences = tokenizer.texts_to_sequences(training_sentences)
    training_padded = pad_sequences(training_sequences, truncating=trunc_type, maxlen=max_length)

    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences, truncating=trunc_type, maxlen=max_length)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(10, activation="relu", kernel_regularizer="l2"),
        tf.keras.layers.Dropout(0.3),
        # The task requires a final layer of 1 neuron activated by sigmoid.
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(training_padded, training_labels, epochs=10,
              validation_data=(testing_padded, testing_labels))

    return model


# Script entry point: train the A4 model and save it as a .h5 file.
# "model_A4.h5" is a relative path, so the file is written to the current
# working directory (the Submission folder when the script is run from there).
if __name__ == '__main__':
    model = solution_A4()
    model.save("model_A4.h5")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


SUBMISSION B


In [None]:
# ===================================================================================================
# PROBLEM B4
#
# Build and train a classifier for the BBC-text dataset.
# This is a multiclass classification problem.
# Do not use lambda layers in your model.
#
# The dataset used in this problem is originally published in: http://mlg.ucd.ie/datasets/bbc.html.
#
# Desired accuracy and validation_accuracy > 91%
# ===================================================================================================

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import pandas as pd


# class MyCallback(tf.keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs={}):
#         if (logs.get('val_accuracy') > 0.91 and logs.get('accuracy') > 0.91):
#             print("\nValidation Accuracy reached 91%, so cancelled training!")
#             self.model.stop_training = True


def solution_B4():
    """Build and train a multiclass classifier for the BBC-text dataset.

    The CSV is downloaded with pandas and split in file order: the first
    ``training_portion`` of the rows is used for training and the remainder
    for validation. The ``category`` column is one-hot encoded, the ``text``
    column is tokenized and padded/truncated to ``max_length`` tokens, and an
    embedding + global-average-pooling network with a 5-way softmax output is
    trained on the result.

    Returns:
        The ``tf.keras.Sequential`` model after 20 epochs of training.
    """
    bbc = pd.read_csv('https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/bbc-text.csv')

    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_portion = .8

    # Positional split: leading rows train, trailing rows validate.
    train_size = int(bbc.shape[0] * training_portion)
    train = bbc[:train_size]
    test = bbc[train_size:]

    train_data = train['text'].to_numpy()
    test_data = test['text'].to_numpy()

    # The encoder is left at its default (sparse) output and densified with
    # .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
    # newer scikit-learn releases, so passing either name ties the code to a
    # version range, while this form works on all of them.
    onehot_encoder = OneHotEncoder()
    train_onehot = onehot_encoder.fit_transform(
        train['category'].to_numpy().reshape(-1, 1)).toarray()
    test_onehot = onehot_encoder.transform(
        test['category'].to_numpy().reshape(-1, 1)).toarray()

    # The vocabulary is learned from the training rows only.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_data)

    # padding_type is now actually applied. The embeddings are averaged over
    # all positions right away, so the side on which padding is placed should
    # not change the pooled features.
    training_sequences = tokenizer.texts_to_sequences(train_data)
    training_padded = pad_sequences(training_sequences, padding=padding_type,
                                    truncating=trunc_type, maxlen=max_length)

    testing_sequences = tokenizer.texts_to_sequences(test_data)
    testing_padded = pad_sequences(testing_sequences, padding=padding_type,
                                   truncating=trunc_type, maxlen=max_length)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        # One output unit per news category.
        tf.keras.layers.Dense(5, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(training_padded, train_onehot, epochs=20,
              validation_data=(testing_padded, test_onehot)
              )

    return model

    # The code below is to save your model as a .h5 file.
    # It will be saved automatically in your Submission folder.


# Script entry point: train the B4 model and save it as "model_B4.h5"
# (relative path, so it is written to the current working directory).
if __name__ == '__main__':
    model = solution_B4()
    model.save("model_B4.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


SUBMISSION C


In [5]:
# =====================================================================================================
# PROBLEM C4
#
# Build and train a classifier for the sarcasm dataset.
# The classifier should have a final layer with 1 neuron activated by sigmoid.
#
# Do not use lambda layers in your model.
#
# Dataset used in this problem is built by Rishabh Misra (https://rishabhmisra.github.io/publications).
#
# Desired accuracy and validation_accuracy > 75%
# =======================================================================================================

import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# class MyCallback(tf.keras.callbacks.Callback):
#   def on_epoch_end(self, epoch, logs={}):
#     if(logs.get('val_accuracy')>0.8 and logs.get('accuracy')>0.8):
#       print("\nAccuracy and Validation Accuracy reached 80%, so cancelled training!")
#       self.model.stop_training = True


def solution_C4():
    """Build and train a binary classifier for the sarcasm headline dataset.

    The JSON dataset is downloaded to ``sarcasm.json`` in the current working
    directory, split in file order into the first ``training_size`` records
    for training and the remainder for validation, tokenized, and
    padded/truncated to ``max_length`` tokens. An embedding +
    global-average-pooling network with a single sigmoid output is then
    trained on it.

    Returns:
        The ``tf.keras.Sequential`` model after 10 epochs of training.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so the download does not depend on another package having
    # imported it first.
    import urllib.request

    data_url = 'https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/sarcasm.json'
    urllib.request.urlretrieve(data_url, 'sarcasm.json')

    # DO NOT CHANGE THIS CODE
    # Make sure you used all of these parameters or test may fail
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_size = 20000

    # Explicit encoding so parsing does not depend on the platform default.
    with open("./sarcasm.json", 'r', encoding='utf-8') as f:
        datastore = json.load(f)

    sentences = [item['headline'] for item in datastore]
    labels = [item['is_sarcastic'] for item in datastore]

    # Positional split of the sentences and of their labels.
    train_text = sentences[:training_size]
    test_text = sentences[training_size:]
    train_label = labels[:training_size]
    test_label = labels[training_size:]

    # Fit the tokenizer on the training data only.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_text)

    # padding_type is passed through so that every required parameter is
    # used. The embeddings are averaged over all positions right away, so the
    # padding side should not change the pooled features.
    training_sequences = tokenizer.texts_to_sequences(train_text)
    training_padded = pad_sequences(training_sequences, padding=padding_type,
                                    truncating=trunc_type, maxlen=max_length)

    testing_sequences = tokenizer.texts_to_sequences(test_text)
    testing_padded = pad_sequences(testing_sequences, padding=padding_type,
                                   truncating=trunc_type, maxlen=max_length)

    training_labels = np.array(train_label)
    testing_labels = np.array(test_label)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        # Do not change the last layer or test may fail.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(training_padded, training_labels, epochs=10,
              validation_data=(testing_padded, testing_labels))
    return model


# Script entry point: train the C4 model and save it as a .h5 file.
# "model_C4.h5" is a relative path, so the file is written to the current
# working directory (the Submission folder when the script is run from there).
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    model = solution_C4()
    model.save("model_C4.h5")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
