In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf

# load data
onion = pd.read_csv('onionometer/sentences/onion.txt', sep='\n')
real = pd.read_csv('onionometer/sentences/real.txt', sep='\n')

# label data
onion['label'], real['label'] = 1, 0

# concatenate and shuffle data
shuffled_data = pd.concat([onion, real]).sample(frac=1).reset_index(drop=True)
shuffled_data

Unnamed: 0,title,label
0,arctic warming 2 to 3 times faster than anywhe...,0
1,"ministry seeks input on education reform, sex ...",0
2,66-year-old 'washington post' reporter hopes h...,1
3,friends don’t understand how man not depressed,1
4,liberal leader brian gallant prepares for thro...,0
...,...,...
31716,soldier acquitted in sexual assault of subordi...,0
31717,93% of americans admit they occasionally check...,1
31718,inuvik rcmp ask for help in locating man wante...,0
31719,"12 stunning, seasonal accents for under $30! h...",0


In [7]:
# split data into train/test sets
X_train = shuffled_data.loc[:25000, 'title'].values
y_train = shuffled_data.loc[:25000, 'label'].values
X_test = shuffled_data.loc[25001:, 'title'].values
y_test = shuffled_data.loc[25001:, 'label'].values

In [34]:
# create tokenizer vocabulary
tokenizer = tf.keras.preprocessing.text.Tokenizer()
total_headlines = np.concatenate((X_train, X_test))
tokenizer.fit_on_texts(total_headlines)

In [37]:
max_length = max([len(h.split()) for h in total_headlines])
vocab_size = len(tokenizer.word_index) + 1

In [38]:
# encode headlines as vectors of integers
X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

In [40]:
# pad input data
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_tokenized, maxlen=max_length, padding='post')
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_tokenized, maxlen=max_length, padding='post')

In [64]:
# build model.
# 100-dimensional embedding layer followed by bidirectional LSTM
# followed by layer of 32 densely connected neurons, 
# followed by one output neuron
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 100, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(32, activation='sigmoid'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [65]:
optimizers = ['sgd', 'adam', 'RMSprop']
loss_functions = ['binary_crossentropy'] # only use binary crossentropy. hinge & hinge squared requires output set to be {-1, 1}

results = {'sgd': {}, 'adam': {}, 'RMSprop': {}}

In [66]:
for opt in optimizers:
    for loss_fn in loss_functions:
        model.compile(optimizer=opt, loss=loss_fn, metrics=['accuracy'])
        model.fit(X_train_padded, y_train, batch_size=512, epochs=3)
        eval_loss, eval_acc = model.evaluate(X_test_padded, y_test, batch_size=512)
        results[opt][loss_fn] = [eval_acc, eval_loss]

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [67]:
results

{'sgd': {'binary_crossentropy': [0.5303571, 0.6913437494209834]},
 'adam': {'binary_crossentropy': [0.9095238, 0.2337867608737378]},
 'RMSprop': {'binary_crossentropy': [0.9096726, 0.2546553272931349]}}

In [1]:
# With LSTM layer: 91% accuracy
# Without LSTM layer it was 53% accuracy :D