# Clickbait Generator
Give me those clicks!!!
Data taken from https://github.com/bhargaviparanjape/clickbait

## Data Processing
1. Extract archived data
2. Add end_of_headline terminator word
3. Tokenize titles
4. Split into X (windows of `sample_length` consecutive words) and Y (the word that follows each window)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, get_file

import gzip
import numpy as np

# Vocabulary cap: passed to the Tokenizer as num_words and used as the width of
# the one-hot targets and of the model's softmax output.
vocab_size = 7000
# Number of consecutive tokens in each training window (the model's input length).
sample_length = 20
# Stride between the start positions of consecutive training windows.
step = 1
# Terminator word that load_text() puts in place of the blank line between headlines.
end_of_headline = 'eoh'

In [None]:
def load_text():
    """Download the clickbait headline archive and return it as one string.

    The gzip file is fetched through Keras' ``get_file`` and read in text
    mode. Every blank line (two consecutive newlines) is replaced by the
    ``end_of_headline`` word surrounded by spaces, so that the terminator
    becomes an ordinary token of the text.

    Returns:
        str: the whole corpus with ``end_of_headline`` markers inserted.
    """
    data_file = get_file("clicks", "https://raw.githubusercontent.com/bhargaviparanjape/clickbait/master/dataset/clickbait_data.gz")
    # Decode explicitly as UTF-8: without `encoding`, text-mode gzip.open uses
    # the locale's preferred encoding, which differs between platforms.
    with gzip.open(data_file, 'rt', encoding='utf-8') as f:
        return f.read().replace('\n\n', f" {end_of_headline} ")

def split_into_samples(text):
    """Cut `text` into shuffled (window, next item) training pairs.

    A window of `sample_length` items is taken every `step` positions, and the
    item directly after each window is its target. Both module-level settings
    are read at call time.

    Args:
        text: an indexable sequence (here: the list of word ids).

    Returns:
        tuple: two NumPy arrays, the windows and their targets, reordered by
        the same random permutation drawn from NumPy's global random state.
    """
    starts = range(0, len(text) - sample_length, step)
    windows = [text[begin:begin + sample_length] for begin in starts]
    targets = [text[begin + sample_length] for begin in starts]

    # One permutation applied to both arrays keeps every window aligned with
    # its target.
    order = np.random.permutation(len(windows))
    return np.asarray(windows)[order], np.asarray(targets)[order]

In [None]:
# Seed NumPy's global generator so the shuffle inside split_into_samples (and
# any later NumPy-based sampling) is repeatable.
np.random.seed(0)

text = load_text()

# Fit the word index on the whole corpus; num_words caps the vocabulary that
# texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([text])
# From here on `text` is no longer a string but the list of word ids of the corpus.
text = tokenizer.texts_to_sequences([text])[0]
X, Y = split_into_samples(text)

# word_index holds every distinct word seen, which may exceed vocab_size.
print(f"{len(tokenizer.word_index)} unique words")
print(f"{len(X)} samples")
# One-hot encode the next-word ids to match the categorical_crossentropy loss;
# dtype='bool' is presumably chosen to keep this large matrix compact.
Y = to_categorical(Y, num_classes=vocab_size, dtype='bool')


## LSTM Model
Uses word embeddings that are trained together with the model (not pre-trained)

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, EarlyStopping

# Ask TensorFlow which GPUs it can see and report how many were found.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

In [None]:
def create_model(sample_length, vocab_size, embedding_dim=10, lstm_units=256,
                 dense_units=300, dropout=0.2):
    """Build the (uncompiled) next-word prediction network.

    Architecture: Embedding -> LSTM (returning sequences) -> LSTM -> Dense
    (ReLU) -> Dropout -> Dense (softmax over the vocabulary).

    Args:
        sample_length: number of word ids in each input window.
        vocab_size: size of the embedding table and of the softmax output.
        embedding_dim: width of the learned word embeddings.
        lstm_units: number of units in each of the two LSTM layers.
        dense_units: number of units in the hidden Dense layer.
        dropout: rate used for the LSTM input dropout and the Dropout layer.

    Returns:
        A Keras ``Model`` mapping a window of word ids to a probability
        distribution over the next word.
    """
    input_layer = Input(shape=(sample_length,))

    # The sequence length is already fixed by the Input layer, so the
    # Embedding's `input_length` argument (deprecated in Keras 3) is omitted.
    m = Embedding(vocab_size, embedding_dim)(input_layer)
    m = LSTM(lstm_units, dropout=dropout, return_sequences=True)(m)
    m = LSTM(lstm_units, dropout=dropout)(m)
    m = Dense(dense_units, activation='relu')(m)
    m = Dropout(dropout)(m)
    m = Dense(vocab_size, activation='softmax')(m)

    return Model(inputs=[input_layer], outputs=m)

def make_plots(history):
    """Plot training vs. validation accuracy and loss per epoch.

    Args:
        history: object whose ``history`` attribute is a mapping with the
            keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss' (as
            returned by ``model.fit``).

    Accuracy is drawn on the current figure and loss on a new one; both are
    then displayed with ``plt.show()``.
    """
    train_acc = history.history['accuracy']
    valid_acc = history.history['val_accuracy']
    train_loss = history.history['loss']
    valid_loss = history.history['val_loss']

    # Epochs are numbered from 1 on the x axis.
    x_axis = range(1, len(train_acc) + 1)

    def draw(train_values, valid_values, train_label, valid_label, title):
        # Dots for the training curve, a solid line for the validation curve.
        plt.plot(x_axis, train_values, 'bo', label=train_label)
        plt.plot(x_axis, valid_values, 'b', label=valid_label)
        plt.title(title)
        plt.legend()

    draw(train_acc, valid_acc, 'Training accuracy', 'Validation accuracy', 'Accuracy')

    plt.figure()
    draw(train_loss, valid_loss, 'Training loss', 'Validation loss', 'Loss')

    plt.show()

In [None]:
# Build the network for the configured window length and vocabulary size.
model = create_model(sample_length, vocab_size)
# categorical_crossentropy matches the one-hot encoded targets in Y.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

## Prediction

In [None]:
# temperature sampling based on Deep Learning book
def sample(preds, temperature):
    """Draw one index from `preds` after rescaling it with `temperature`.

    The distribution is re-weighted as softmax(log(preds) / temperature):
    values below 1 sharpen it towards the most likely entry, values above 1
    flatten it.

    Args:
        preds: 1-D sequence of probabilities (e.g. one softmax output row).
        temperature: scaling factor; a value <= 0 selects the most likely
            entry deterministically instead of dividing by zero.

    Returns:
        The sampled index into `preds`.
    """
    preds = np.asarray(preds, dtype='float64')
    if temperature <= 0:
        # Limit of the re-weighting as the temperature approaches zero.
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf; silence the log warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtracting the maximum leaves the softmax unchanged but prevents every
    # term from underflowing to 0 (which would give 0/0 = NaN) at low
    # temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def generate_seq(model, tokenizer, seq_length, seed_text, n_words, temperature = 0.5):
    """Generate a headline word by word, starting from `seed_text`.

    At each step the text produced so far is encoded, padded/truncated to
    `seq_length` ids, fed to `model`, and the next word is sampled from the
    predicted distribution. Generation stops after `n_words` words or as soon
    as the `end_of_headline` word is sampled.

    Args:
        model: object with a Keras-style ``predict`` returning one
            probability row per input.
        tokenizer: fitted Keras ``Tokenizer`` used for encoding and decoding.
        seq_length: number of ids the model expects as input.
        seed_text: text the generation is conditioned on.
        n_words: maximum number of words to generate.
        temperature: sampling temperature; the first word is sampled with
            `temperature + 0.3`.

    Returns:
        str: the seed words (without terminator words) followed by the
        generated words, joined by single spaces.
    """
    # Remove the terminator as a whole word only; a substring replace would
    # also mangle ordinary words that happen to contain it.
    result = [word for word in seed_text.split() if word != end_of_headline]
    in_text = seed_text

    for position in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        preds = model.predict(encoded, verbose=0)

        # sample first word with higher temperature
        step_temperature = temperature + 0.3 if position == 0 else temperature
        yhat = sample(preds[0], step_temperature)

        # Direct id -> word lookup; ids without a word (such as the padding
        # id 0) fall back to the placeholder 'X'.
        out_word = tokenizer.index_word.get(int(yhat), 'X')
        if out_word == end_of_headline:
            break

        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# Callback that generates some text after each epoch
class GenerationCallback(Callback):
    """Print a few generated headlines at the end of every training epoch.

    Args:
        seed_text: text passed to ``generate_seq`` as the generation seed.
        num_examples: number of headlines printed per epoch.
    """

    def __init__(self, seed_text="", num_examples=5):
        # Let the Keras base class set up its own state first.
        super().__init__()
        self.seed_text = seed_text
        self.num_examples = num_examples

    def on_epoch_end(self, epoch, logs=None):
        print("\n\nExample headlines")
        for i in range(self.num_examples):
            # self.model is the model Keras attached to this callback, i.e.
            # the one being trained, rather than whatever the global `model`
            # currently refers to.
            output = generate_seq(self.model, tokenizer, sample_length, self.seed_text, 20, .5)
            print(f"{i} {output}")
        print("")

In [None]:
callbacks = [
    # Print 3 sample headlines, seeded with the terminator word, after each epoch.
    GenerationCallback("eoh", 3),
    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
]

# 5% of the samples are held out for validation.
history = model.fit(X, Y, batch_size=100, epochs=50, validation_split=0.05, callbacks=callbacks)

In [None]:
# Persist the trained model in HDF5 format.
# NOTE(review): the file name says vocab2000 while vocab_size is set to 7000
# earlier in this notebook - confirm the intended name.
model.save('vocab2000_lstm256.h5')
print(f"Best validation accuracy {np.amax(history.history['val_accuracy'])}")
make_plots(history)

In [None]:
# compare models
def test_model(model):
    """Print ten numbered headlines generated by `model`.

    Each headline is seeded with the `end_of_headline` word, limited to 20
    generated words and sampled at temperature 0.5, using the notebook's
    `tokenizer` and `sample_length`.
    """
    for number in range(10):
        headline = generate_seq(
            model,
            tokenizer,
            sample_length,
            end_of_headline,
            20,
            .5,
        )
        print(f"{number} {headline}")

# Headlines from the model trained in this session.
print("Latest:")
test_model(model)

# Headlines from two previously saved models, loaded from the working directory.
# NOTE(review): their output is decoded with the current `tokenizer`; presumably
# these checkpoints were trained on the same corpus - confirm.
print("\n\n512-512 with pre-trained embeddings:")
big_model = tf.keras.models.load_model("512-512.h5")
test_model(big_model)

print("\n\n256 with vocab 1000 and trained embeddings:")
small_model = tf.keras.models.load_model("vocab1000_lstm256.h5")
test_model(small_model)



## Results
My favorite results so far
- we know your zodiac sign based on your zodiac sign
- are you more like more dog or a dog
- the 17 most important canadian celebrity moments of 2015
- here's how to make a vampire
- can you guess your favorite '90s movie based on your favorite kitten
- are you more a canadian or taylor swift or oprah
- 17 insanely delicious ways to eat your family
- we know your favorite pop thing based on your zodiac sign