# Modelling

## Import

In [7]:
import tensorflow as tf
from keras import models, losses, optimizers, metrics
from keras.layers import Dense, LSTM, Embedding
from keras.callbacks import EarlyStopping
import gc

from sklearn.preprocessing import Normalizer

%run ../scripts/save_utils.py

## Data load

In [8]:
x_train, y_train, x_val, y_val, x_test, y_test  = load_data('../save_files/processed_data.pkl')
headline_tokenizer = load_data('../save_files/tokenizer.pkl')

In [9]:
norm = Normalizer()
x_train_norm = norm.fit_transform(x_train)
x_val_norm = norm.transform(x_val)

## Baseline model

To start things off, we need to train and evaluate a baseline model to have a bottom threshold of performance.

In [10]:
vocab_size = len(headline_tokenizer.word_index) + 1
max_len = x_train.shape[1]

stop_early = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

In [11]:
base_classifier = models.Sequential()
base_classifier.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len))
base_classifier.add(LSTM(units=128))
base_classifier.add(Dense(64, activation='relu'))
base_classifier.add(Dense(16, activation='relu'))
base_classifier.add(Dense(1, activation='sigmoid'))
base_classifier.compile(optimizer=optimizers.RMSprop(lr=0.002),
                        loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])

base_classifier.fit(x_train_norm, y_train, batch_size=64, epochs=50, validation_data=(x_val_norm, y_val), verbose=2, callbacks=[stop_early])

Epoch 1/50


  super().__init__(name, **kwargs)


269/269 - 18s - loss: 0.6928 - binary_accuracy: 0.5208 - val_loss: 0.6922 - val_binary_accuracy: 0.5236 - 18s/epoch - 67ms/step
Epoch 2/50
269/269 - 6s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6921 - val_binary_accuracy: 0.5236 - 6s/epoch - 23ms/step
Epoch 3/50
269/269 - 7s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6920 - val_binary_accuracy: 0.5236 - 7s/epoch - 24ms/step
Epoch 4/50
269/269 - 6s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6920 - val_binary_accuracy: 0.5236 - 6s/epoch - 23ms/step
Epoch 5/50
269/269 - 6s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6920 - val_binary_accuracy: 0.5236 - 6s/epoch - 23ms/step
Epoch 6/50
269/269 - 6s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6921 - val_binary_accuracy: 0.5236 - 6s/epoch - 23ms/step
Epoch 7/50
269/269 - 6s - loss: 0.6921 - binary_accuracy: 0.5236 - val_loss: 0.6920 - val_binary_accuracy: 0.5236 - 6s/epoch - 23ms/step
Epoch 8/50
269/269 - 6s - loss: 0.6921 - binary_ac

<keras.callbacks.History at 0x270086e7cd0>

In [12]:
baseline_results = base_classifier.evaluate(x_val, y_val, return_dict=True)
print('baseline loss:     ', baseline_results['loss'])
print('baseline accuracy: ', baseline_results['binary_accuracy'])

baseline loss:      0.6920647025108337
baseline accuracy:  0.5235849022865295


In [13]:
gc.collect()

5506