<a href="https://colab.research.google.com/github/https-deeplearning-ai/tensorflow-1-public/blob/master/C3/W3/ungraded_labs/C3_W3_Lab_6_sarcasm_with_1D_convolutional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/doantronghieu/DEEP-LEARNING/main/helper_DL.py
!pip install colorama
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':15})
import seaborn           as sns
sns.set()
import helper_DL as helper

# Ungraded Lab: Training a Sarcasm Detection Model using a Convolution Layer

You will follow the same steps as in the previous lab, but this time the model will use a convolution layer instead. As usual, try tweaking the parameters and observe how they affect the results.


## Download the Dataset

In [None]:
# Download the dataset
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

In [None]:
import json

# Load the JSON file. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('./sarcasm.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Collect the headline text and the sarcasm label of every record into two
# parallel lists (index i of `labels` belongs to index i of `sentences`).
sentences = [item['headline'] for item in datastore]
labels    = [item['is_sarcastic'] for item in datastore]

## Split the Dataset

In [None]:
# Number of leading examples used for training; everything from this
# index onwards is held out for testing.
training_size = 20000

# Sentences: head of the list for training, tail for testing
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]

# Labels: cut at the same index so they stay aligned with the sentences
training_labels, testing_labels = labels[:training_size], labels[training_size:]

## Data Preprocessing

In [None]:
import numpy as np
import tensorflow.keras.preprocessing as tfkp

# Tokenizer and padding settings
vocab_size    = 10000    # Vocabulary size of the tokenizer
max_length    = 120      # Length every sequence is padded / truncated to
trunc_type   = 'post'    # Passed to pad_sequences as `truncating`
padding_type = 'post'    # Passed to pad_sequences as `padding`
oov_tok      = '<OOV>'   # Token used for out-of-vocabulary words

# Build the tokenizer and fit its vocabulary on the training sentences only
tokenizer = tfkp.text.Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding options shared by the training and the testing split
pad_options = dict(maxlen = max_length,
                   padding = padding_type,
                   truncating = trunc_type)

# Training split: text -> integer sequences -> fixed-length array
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded    = tfkp.sequence.pad_sequences(training_sequences, **pad_options)

# Testing split: same tokenizer, same padding options
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded    = tfkp.sequence.pad_sequences(testing_sequences, **pad_options)

# Turn the label lists into numpy arrays
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

## Build and Compile the Model

In [None]:
import tensorflow as tf
import tensorflow.keras as tfk
from tensorflow import nn
from tensorflow.keras import layers, losses, optimizers, models, Model
import numpy as np

In [None]:
# Hyperparameters
EMBEDDING_DIM = 16
FILTERS       = 128
KERNEL_SIZE   = 5
DENSE_DIM     = 6

# Build the model.
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument: that argument is deprecated (and ignored)
# in newer Keras releases, which would leave the model unbuilt when
# summary() is called below.
model_conv = models.Sequential([
    layers.Input(shape = (max_length,)),
    layers.Embedding(vocab_size, EMBEDDING_DIM),
    layers.Conv1D(filters = FILTERS, kernel_size = KERNEL_SIZE, activation = nn.relu),
    layers.GlobalMaxPooling1D(),
    layers.Dense(DENSE_DIM, activation = nn.relu),
    layers.Dense(1, activation = nn.sigmoid)
])

# Print the layer-by-layer summary of the model
model_conv.summary()

# Set the training parameters: binary cross-entropy loss to match the single
# sigmoid output unit, Adam with its default settings, accuracy as the metric
model_conv.compile(loss = losses.binary_crossentropy,
                   optimizer = optimizers.Adam(),
                   metrics = ['accuracy'])

## Train the Model

In [None]:
NUM_EPOCHS = 10

# The testing split is passed to fit() as the validation data
validation_set = (testing_padded, testing_labels)

# Fit the model on the padded training sequences and keep the returned history
history_conv = model_conv.fit(
    x = training_padded,
    y = training_labels,
    epochs = NUM_EPOCHS,
    validation_data = validation_set,
)

# Hand the training history to the plotting helper from helper_DL
helper.plot_history_curves(history_conv)