# Modelling

## Import

In [15]:
import tensorflow as tf
from keras import models, losses, optimizers, metrics
from keras.layers import Dense, LSTM, Embedding
from keras.callbacks import EarlyStopping
import gc

# Execute the helper script inside this kernel so that the names it defines become
# available to later cells (presumably this is where `load_data` comes from — TODO confirm).
%run ../scripts/save_utils.py

## Data load

In [18]:
# Read back the six arrays stored in processed_data.pkl (inputs and labels for the
# train, validation and test splits) together with the headline tokenizer object.
# NOTE(review): the .pkl extension suggests pickle under the hood — only load
# files that come from a trusted source.
(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
) = load_data('../save_files/processed_data.pkl')
headline_tokenizer = load_data('../save_files/tokenizer.pkl')

## Baseline model

To start things off, we train and evaluate a baseline model to establish a lower bound on performance against which later models can be compared.

In [19]:
# Number of rows in the embedding table: one per entry in the tokenizer's
# word_index plus one extra slot (presumably for the reserved padding index 0 —
# TODO confirm the tokenizer's indices start at 1).
vocab_size = len(headline_tokenizer.word_index) + 1
# Length of every input sequence, read from the second axis of x_train.
max_len = x_train.shape[1]

# Stop training once val_loss has failed to improve for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the last epoch run, i.e.
# several epochs past its best point, and would be evaluated in that state.
stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Baseline binary classifier: token embedding -> single LSTM -> small dense
# head -> sigmoid output producing one probability per input sequence.
base_classifier = models.Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(units=128),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# `learning_rate` is the supported keyword for the optimizer's step size; the
# old `lr` alias is deprecated in Keras 2.x and rejected by newer releases.
base_classifier.compile(optimizer=optimizers.RMSprop(learning_rate=0.0005),
                        loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])

# Train for at most 50 epochs; the stop_early callback (defined in an earlier
# cell) may end training sooner based on the validation loss.
base_classifier.fit(x_train, y_train, batch_size=64, epochs=50, validation_data=(x_val, y_val), verbose='auto', callbacks=[stop_early])

In [None]:
# Score the trained baseline on the validation split; return_dict=True yields a
# {metric name: value} mapping instead of a positional list.
# NOTE(review): this reports on the validation data also passed to fit() as
# validation_data, not on x_test/y_test — presumably the test split is being
# held back for the final model; confirm.
baseline_results = base_classifier.evaluate(x=x_val, y=y_val, return_dict=True)

# Print each metric next to its (pre-padded) label.
for report_label, metric_key in (
    ('baseline loss:     ', 'loss'),
    ('baseline accuracy: ', 'binary_accuracy'),
):
    print(report_label, baseline_results[metric_key])

In [21]:
# Run a full garbage-collection pass; the value shown below the cell is the
# number of unreachable objects the collector found.
# NOTE(review): nothing is deleted before this call, so objects that are still
# referenced (e.g. the trained model) are presumably not released — confirm intent.
gc.collect()

24410