In [1]:
# Import packages
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Convert JSON file to array of JSON objects
with open('./data/Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")

In [3]:
# Create DataFrame from array of JSON objects
df = pd.DataFrame(data)

In [4]:
# Create predictor and target from DataFrame
X = df['headline']
y = df['is_sarcastic']

In [5]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
# Set tokenizer and padder parameters
num_words = 1000
oov_token = '<OOV>'
pad_type = 'post'
trunc_type = 'post'

In [7]:
# Fit tokenizer on the training set
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [8]:
# Tokenize the training and test sets
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

maxlen = max([len(x) for x in X_train_sequences])

In [9]:
# Pad the training and test sets
X_train_padded = pad_sequences(X_train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

In [10]:
print(X_train[1])

the 'roseanne' revival catches up to our thorny political mood, for better and worse


In [11]:
print(X_train_sequences[1])

[1, 927, 736, 695, 429, 1, 41, 1, 2, 45, 1, 5, 7, 302, 736]


In [12]:
print(X_train_padded[1])

[  1 927 736 695 429   1  41   1   2  45   1   5   7 302 736   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]
