**Sarcasm prediction**

dataset from: https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json

NLP practice: classifying headlines as sarcastic or not sarcastic.

In [9]:
import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import io

In [2]:
# parameters

# Tokenizer: vocabulary cap and the placeholder for out-of-vocabulary words.
vocab_size = 1_000
oov_tok = "<OOV>"

# Sequence shaping: every headline is padded/truncated to max_length tokens,
# with both padding and truncation applied at the end ('post').
max_length = 120
padding_type = 'post'
trunc_type = 'post'

# Model: size of each learned word vector in the Embedding layer.
embedding_dim = 16

# Split: the first training_size examples train the model, the rest test it.
training_size = 20_000

In [3]:
# Empty containers for the headline texts and their 0/1 sarcasm labels;
# a later cell fills them from the loaded dataset.
sentences, labels = [], []

In [13]:
# Upload the dataset to google colab
# NOTE(review): this step only works inside Google Colab; `files.upload()`
# presumably prompts for a local file and returns the uploaded content -- confirm.
# The recorded output shows this upload was stored as "sarcasm (1).json", while the
# next cell opens 'sarcasm.json' -- presumably a copy left by an earlier upload.
from google.colab import files
uploaded = files.upload()

Saving sarcasm.json to sarcasm (1).json


In [17]:
# Load the dataset into `data2` (the parsed JSON document).
# The context manager closes the file handle as soon as parsing finishes, and the
# encoding is pinned to UTF-8 so the result does not depend on the platform default.
with open('sarcasm.json', encoding='utf-8') as f:
    data2 = json.load(f)

In [25]:
# shuffle the data
# Seed Python's RNG first so the shuffled order -- and therefore the train/test
# split that is sliced from it -- is the same on every fresh run.
# Note: shuffle() reorders data2 in place, so re-running this cell without
# reloading the data shuffles the already-shuffled list again.
from random import seed, shuffle

seed(42)
shuffle(data2)

In [28]:
# Spot-check one record: display the headline text of the item at index 1.
data2[1]['headline']

'bonobo embarrassed after walking in on parents, siblings, cousins, friends, partner having sex'

In [29]:
# Build the parallel lists of headline texts and sarcasm labels.
# Fresh lists are assigned (instead of appending to existing ones) so the cell is
# idempotent: running it a second time no longer duplicates every example.
sentences = [item['headline'] for item in data2]
labels = [item['is_sarcastic'] for item in data2]

In [31]:
# Examine the data
# Print one example (text, then label) followed by the sizes of both lists,
# one value per line; the two sizes should match.
index = 1
print(sentences[index], labels[index], len(sentences), len(labels), sep="\n")

bonobo embarrassed after walking in on parents, siblings, cousins, friends, partner having sex
1
26709
26709


In [32]:
# split into training and testing dataset
# The first `training_size` examples form the training set; everything after
# that index is held out for testing. Texts and labels are cut at the same index
# so they stay aligned.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [33]:
# Fit the tokenizer on Training data
# Only the training split is passed to fit_on_texts, so the testing split does
# not contribute to the vocabulary. The vocabulary cap and the out-of-vocabulary
# placeholder come from the parameters cell (vocab_size, oov_tok).
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

In [34]:
# Keep a handle on the tokenizer's word index for later inspection.
word_index = tokenizer.word_index

# Padding/truncation settings shared by both splits.
_pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Encode each split with the fitted tokenizer...
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# ...then pad/truncate every sequence to the same length.
training_padded = pad_sequences(training_sequences, **_pad_kwargs)
testing_padded = pad_sequences(testing_sequences, **_pad_kwargs)

In [35]:
# Set the Model
# Seed TensorFlow before building so weight initialisation is repeatable.
tf.random.set_seed(42)
model = tf.keras.Sequential([
  # Declare the input shape explicitly: Embedding's `input_length` argument is
  # deprecated in Keras 3, and an Input layer works on older versions as well.
  tf.keras.Input(shape=(max_length,)),
  # Map each token id to a learned embedding_dim-sized vector.
  tf.keras.layers.Embedding(vocab_size, embedding_dim),
  # 16 convolution filters over windows of 3 consecutive tokens.
  tf.keras.layers.Conv1D(16, 3, activation='relu'),
  tf.keras.layers.MaxPooling1D(2),
  # NOTE(review): a Dropout(0.2) layer was tried here and disabled. The recorded
  # run shows val_loss rising after epoch 3 while training loss keeps falling,
  # so re-enabling it may be worth testing.
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(6, activation='relu'),
  # Single sigmoid unit: output in [0, 1] for the binary sarcastic / not label.
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
# Convert inputs and targets to NumPy arrays before handing them to model.fit
# (the label collections are plain Python lists at this point).
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [37]:
# Training the model
# The held-out split is passed as validation data, so each epoch reports loss
# and accuracy on both splits; verbose=2 logs one summary line per epoch.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 3s - loss: 0.5250 - accuracy: 0.7207 - val_loss: 0.3841 - val_accuracy: 0.8225 - 3s/epoch - 6ms/step
Epoch 2/30
625/625 - 3s - loss: 0.3577 - accuracy: 0.8403 - val_loss: 0.3631 - val_accuracy: 0.8323 - 3s/epoch - 4ms/step
Epoch 3/30
625/625 - 3s - loss: 0.3276 - accuracy: 0.8542 - val_loss: 0.3620 - val_accuracy: 0.8329 - 3s/epoch - 4ms/step
Epoch 4/30
625/625 - 3s - loss: 0.3058 - accuracy: 0.8637 - val_loss: 0.3623 - val_accuracy: 0.8325 - 3s/epoch - 5ms/step
Epoch 5/30
625/625 - 3s - loss: 0.2895 - accuracy: 0.8730 - val_loss: 0.3639 - val_accuracy: 0.8335 - 3s/epoch - 5ms/step
Epoch 6/30
625/625 - 3s - loss: 0.2742 - accuracy: 0.8806 - val_loss: 0.3873 - val_accuracy: 0.8284 - 3s/epoch - 4ms/step
Epoch 7/30
625/625 - 3s - loss: 0.2613 - accuracy: 0.8884 - val_loss: 0.3812 - val_accuracy: 0.8326 - 3s/epoch - 4ms/step
Epoch 8/30
625/625 - 3s - loss: 0.2457 - accuracy: 0.8943 - val_loss: 0.3865 - val_accuracy: 0.8325 - 3s/epoch - 4ms/step
Epoch 9/30
625/625 - 3s 

In [38]:
# Persist the trained model to "mymodel.h5" in the current working directory.
# NOTE(review): the .h5 extension presumably selects Keras' legacy HDF5 format;
# newer Keras releases recommend the native ".keras" format -- confirm against the
# installed version before changing the path, since downstream loaders may expect it.
model.save("mymodel.h5")