In [2]:
import pandas as pd
import os
import collections 
import spacy
import pickle
import numpy as np
import pickle
import re
import math
import unidecode
import concurrent.futures
import tensorflow as tf
import h5py

In [3]:
# Show (rows, columns) of the articles frame.
# NOTE(review): `data` is first assigned by the CSV-loading cell that appears
# after this one, so on a fresh kernel this cell has to run after that cell.
data.shape

(142568, 9)

In [2]:
# spaCy English pipeline; used further down for tokenization.
nlp = spacy.load('en')

# Read the article dump (path resolved against the current working directory)
# and keep only rows that have a title, body text and publication.
filename = "../full_articles.csv"
filepath = os.path.join(os.getcwd(), filename)
data = pd.read_csv(filepath).dropna(
    how="any",
    subset=["title", "content", "publication"],
)

# Directory name for pickled artifacts.
SAVE_DIR = "pickles"

In [3]:
# Restrict the corpus to NPR articles.
data = data.loc[data.publication == "NPR"]
# Transliterate each article body to ASCII and lowercase it.
contents = [unidecode.unidecode(content).lower() for content in data.content.tolist()]
# The frame is not needed once the text has been extracted.
del data

In [5]:
def process_content(contents_data, sequence_length=128, batch_size=128, decode_unicode=False):
    """Tokenize documents, build a word vocabulary and cut each document into chunks.

    The module-level spaCy pipeline `nlp` must be defined before calling this.

    Parameters
    ----------
    contents_data : list of str
        Raw document texts. These are what gets processed; earlier versions
        read the notebook-global `contents` instead and ignored this argument.
    sequence_length : int
        Number of context tokens a model sees per prediction.
    batch_size : int
        Stride between chunk starts; each chunk holds up to
        `sequence_length + batch_size` token ids.
    decode_unicode : bool
        If True, transliterate the text with `unidecode` before lowercasing.

    Returns
    -------
    (batch_chunks, words_dict, words)
        batch_chunks : list of lists of int token ids, one list per chunk.
        words_dict   : dict mapping word -> integer id.
        words        : alphabetically sorted vocabulary; `words[id]` is the word.

    Notes
    -----
    Documents with `batch_size` tokens or fewer produce no chunks, and the
    last chunks of a document can be shorter than `sequence_length + batch_size`.
    """
    if decode_unicode:
        contents_data = [unidecode.unidecode(content).lower() for content in contents_data]
    else:
        contents_data = [content.lower() for content in contents_data]

    # batch_size here is spaCy's internal buffering; it is unrelated to the chunk stride.
    tokenized = [[word.text for word in doc] for doc in nlp.pipe(contents_data, batch_size=128)]
    print("done tokenizing...")

    # Ids are assigned in alphabetical order of the distinct tokens.
    words = sorted({word for doc in tokenized for word in doc})
    words_dict = {x: i for i, x in enumerate(words)}
    contents_translated = [[words_dict[word] for word in doc] for doc in tokenized]

    batch_chunks = []
    for doc in contents_translated:
        # Overlapping windows: each starts `batch_size` tokens after the previous one.
        batch_chunks.extend(
            doc[i:i + sequence_length + batch_size]
            for i in range(0, len(doc) - batch_size, batch_size)
        )

    return batch_chunks, words_dict, words

content_batches, content_words_dict, content_words = process_content(contents)
# save translators
name = "content_word_rnn"
PKL_SAVE = f"{name}.pkl"
# Create the output directory first; open() raises FileNotFoundError if it is missing.
os.makedirs("pickles", exist_ok=True)
with open(os.path.join("pickles", PKL_SAVE), "wb") as pkl_file:
    # Stored as one tuple: (chunked token ids, word -> id dict, id -> word list).
    pickle.dump((content_batches, content_words_dict, content_words), pkl_file)

done tokenizing...


# load

In [3]:
# Reload the chunked token ids and the vocabulary lookups from the same
# pickles/content_word_rnn.pkl path the preprocessing cell writes to, so the
# cells below can run without re-tokenizing.
name = "content_word_rnn"
PKL_SAVE = f"{name}.pkl"
with open(os.path.join("pickles",PKL_SAVE), "rb") as pkl_file:
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    content_batches, content_words_dict, content_words = pickle.load(pkl_file)

# convert to HDF5 file for dynamic loading

In [5]:
import h5py

seq_length = 128
# Layout of "rnn_data":
#   batches/batch0, batches/batch1, ...  one 1-D dataset of token ids per chunk
#   max_batch                            index of the last batch written (count - 1)
# The context manager closes the file even if writing a dataset fails.
with h5py.File("rnn_data", "w") as f:
    predictor_grp = f.create_group("batches")
    for i, batch in enumerate(content_batches):
        predictor_grp.create_dataset(name=f"batch{i}", shape=(len(batch),), data=np.array(batch))
    # Derived from the length rather than the loop variable so that an empty
    # `content_batches` does not raise NameError on `i`.
    f.create_dataset("max_batch", shape=(1,), data=np.array([len(content_batches) - 1]))

In [4]:
from keras.utils import Sequence
from keras.utils import to_categorical

class WordRNNSequence(Sequence):
    """Keras `Sequence` that turns stored token-id chunks into one-hot training batches.

    Parameters
    ----------
    batch_idx : sequence of int
        Indices of the chunks this sequence is allowed to serve.
    batch_data : mapping
        Maps names of the form "batch{i}" to 1-D arrays of token ids
        (the "batches" HDF5 group in this notebook).
    seq_length : int
        Number of context tokens per sample.
    n_vocab : int
        Width of the one-hot encoding.
    validation : bool
        When True, `on_epoch_end` re-splits the union of this sequence's and
        `training_sequence`'s indices into a new train/validation partition.
    training_sequence : Sequence or None
        The paired training sequence; required when `validation` is True.

    Raises
    ------
    FileNotFoundError
        If `validation` is True and no `training_sequence` is given
        (exception type kept for backward compatibility).
    """

    def __init__(self, batch_idx, batch_data, seq_length, n_vocab, validation=False, training_sequence=None):
        self.batch_idx = batch_idx
        self.batch_data = batch_data
        self.seq_length = seq_length
        self.n_vocab = n_vocab
        # Must be stored: on_epoch_end reads it on every instance.
        self.validation = validation
        self.training_sequence = None
        # `is None` rather than truthiness: an empty Sequence has len() == 0 and is falsy.
        if validation and training_sequence is None:
            raise FileNotFoundError("need non-null training keras.utils.Sequence")
        if validation:
            assert isinstance(training_sequence, Sequence)
            self.training_sequence = training_sequence

    def __len__(self):
        """Number of batches served per epoch."""
        return len(self.batch_idx)

    def __getitem__(self, key):
        """Return `(batch_x, batch_y)` for the `key`-th batch of this sequence.

        batch_x has shape (len_batch, seq_length, n_vocab) and batch_y has shape
        (len_batch, n_vocab), both boolean one-hot; sample i is the window
        data[i:i+seq_length] and its target is data[i+seq_length].
        """
        # Translate the position within this sequence into the stored chunk index,
        # so that train and validation sequences actually read different chunks.
        true_idx = self.batch_idx[key]
        data = self.batch_data[f"batch{true_idx}"][:]
        len_batch = data.shape[0] - self.seq_length
        batch_x = np.zeros((len_batch, self.seq_length, self.n_vocab), dtype=bool)
        for i in range(0, len_batch):
            seq = data[i:i+self.seq_length]
            batch_x[i,] = to_categorical(seq, num_classes=self.n_vocab)
        batch_y = data[self.seq_length:]
        batch_y = to_categorical(batch_y, num_classes=self.n_vocab)
        batch_y = batch_y.astype(bool)
        return batch_x, batch_y

    def on_epoch_end(self):
        """For a validation sequence, draw a fresh 80/20 train/validation split."""
        if self.validation:
            # Concatenate explicitly: `+` on NumPy index arrays would add element-wise.
            all_batch_idx = np.concatenate([
                np.asarray(self.batch_idx),
                np.asarray(self.training_sequence.batch_idx),
            ])
            self.training_sequence.batch_idx, self.batch_idx = train_test_split(all_batch_idx, test_size=0.2)

    def _batch_len(self, true_idx):
        """Number of samples contained in stored chunk `true_idx`."""
        # Chunks are keyed by name ("batch{i}"), the same way __getitem__ reads them.
        data = self.batch_data[f"batch{true_idx}"]
        return len(data) - int(self.seq_length)

In [18]:
from keras.utils import to_categorical

def encode_sequences(sequences, word_dict, seq_length, n_vocab):
    """One-hot encode word sequences into a boolean array.

    Parameters
    ----------
    sequences : list of list of str
        Word sequences; each is truncated to its first `seq_length` words.
    word_dict : dict
        Maps lowercase word -> integer id.
    seq_length : int
        Number of words encoded per sequence.
    n_vocab : int
        Width of the encoding. Words missing from `word_dict` are written to
        the last slot, `n_vocab - 1`.

    Returns
    -------
    numpy.ndarray of bool, shape (len(sequences), seq_length, n_vocab)

    Raises
    ------
    NotImplementedError
        If a sequence has fewer than `seq_length` words (padding is not supported).
    """
    # Built-in `bool`: the `np.bool` alias no longer exists in current NumPy.
    data = np.zeros(shape=(len(sequences), seq_length, n_vocab), dtype=bool)
    oov_index = n_vocab - 1
    for i, sequence in enumerate(sequences):
        if len(sequence) < seq_length:
            raise NotImplementedError(f"Need a sequence of length {seq_length}")
        for j, word in enumerate(sequence[:seq_length]):
            data[i, j, word_dict.get(word.lower(), oov_index)] = 1

    return data

def encode_next_words(next_words, word_dict, n_vocab):
    """One-hot encode target words into a boolean array.

    Parameters
    ----------
    next_words : list of str
        One target word per sample.
    word_dict : dict
        Maps lowercase word -> integer id.
    n_vocab : int
        Width of the encoding. Words missing from `word_dict` are written to
        the last slot, `n_vocab - 1`.

    Returns
    -------
    numpy.ndarray of bool, shape (len(next_words), n_vocab)
    """
    # Built-in `bool`: the `np.bool` alias no longer exists in current NumPy.
    next_word_encode = np.zeros(shape=(len(next_words), n_vocab), dtype=bool)
    oov_index = n_vocab - 1
    for i, next_word in enumerate(next_words):
        next_word_encode[i, word_dict.get(next_word.lower(), oov_index)] = 1

    return next_word_encode

In [6]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Bidirectional, LSTM, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy
from sklearn.model_selection import train_test_split
import sklearn


n_nodes = 512
SEQ_LENGTH = 128
N_VOCAB = len(content_words)

# The handle stays open on purpose: the sequences below read batches lazily from it.
f = h5py.File("rnn_data", "r")
# "max_batch" stores the index of the last batch written, so the batch count is
# one more; using the raw value would silently drop the final batch.
num_batches = int(f["max_batch"][0]) + 1
train_idx, validation_idx = train_test_split(np.arange(num_batches), test_size=0.2)

hdf5_batch_data = f["batches"]

train = {
    "batch_idx": train_idx,
    "batch_data": hdf5_batch_data,
    "seq_length": SEQ_LENGTH,
    "n_vocab": N_VOCAB,
}

valid = {
    "batch_idx": validation_idx,
    "batch_data": hdf5_batch_data,
    "seq_length": SEQ_LENGTH,
    "n_vocab": N_VOCAB,
}

train_sequence = WordRNNSequence(**train)
# `validation` is left at its default (False), so the sequence class does not
# re-split train/validation indices between epochs.
valid_sequence = WordRNNSequence(training_sequence=train_sequence, **valid)

# NOTE(review): the recorded run of this cell stopped with an out-of-memory error
# while Adam allocated a [97103, 2048] float tensor. 2048 equals 4 * n_nodes, so
# this looks like an LSTM kernel over one-hot inputs of the full vocabulary;
# an embedding layer or a smaller vocabulary is probably needed — confirm.
model = Sequential()
model.add(Bidirectional(LSTM(n_nodes, activation="relu"), input_shape=(SEQ_LENGTH, N_VOCAB)))
model.add(Dropout(0.6))
model.add(Dense(N_VOCAB))
model.add(Activation('softmax'))
optimizer = Adam(lr=0.001)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=[categorical_accuracy])

MODEL_CHECK_DIR = "checkpoints"
# model.save() below does not create missing directories.
os.makedirs(os.path.join(os.getcwd(), MODEL_CHECK_DIR), exist_ok=True)

# Stop after 4 epochs without val_loss improvement; checkpoint every 2 epochs.
callbacks = [EarlyStopping(patience=4, monitor='val_loss'),
             ModelCheckpoint(filepath='model_gen.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_loss',
                             verbose=0, mode='auto',
                             period=2)]

epochs = 20

model.fit_generator(train_sequence,
                    epochs=epochs,
                    validation_data=valid_sequence,
                    callbacks=callbacks,
                    max_queue_size=1)

model.save(os.path.join(os.getcwd(), MODEL_CHECK_DIR, 'model_gen_title.h5'))

Epoch 1/20


ResourceExhaustedError: OOM when allocating tensor of shape [97103,2048] and type float
	 [[{{node training_1/Adam/zeros}} = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [97103,2048] values: [0 0 0...]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Caused by op 'training_1/Adam/zeros', defined at:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-096a55f38af7>", line 62, in <module>
    max_queue_size=1)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/engine/training.py", line 1415, in fit_generator
    initial_epoch=initial_epoch)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/engine/training_generator.py", line 39, in fit_generator
    model._make_train_function()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/engine/training.py", line 498, in _make_train_function
    loss=self.total_loss)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/optimizers.py", line 482, in get_updates
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/optimizers.py", line 482, in <listcomp>
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 700, in zeros
    v = tf.zeros(shape=shape, dtype=tf_dtype, name=name)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1548, in zeros
    output = fill(shape, constant(zero, dtype=dtype), name=name)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2817, in fill
    "Fill", dims=dims, value=value, name=name)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3254, in create_op
    op_def=op_def)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1750, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor of shape [97103,2048] and type float
	 [[{{node training_1/Adam/zeros}} = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [97103,2048] values: [0 0 0...]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]


In [13]:
# Release the read-only "rnn_data" HDF5 handle opened by the training cell.
f.close()

<HDF5 file "rnn_data" (mode r)>

In [42]:
def sample(preds, temperature):
    """Draw one class index from `preds` after temperature scaling.

    Parameters
    ----------
    preds : array-like of float
        Class probabilities (e.g. a softmax output).
    temperature : float
        Must be positive. Values below 1 sharpen the distribution towards the
        most likely class; values above 1 flatten it.

    Returns
    -------
    int
        Index of the sampled class, drawn with `np.random.multinomial`.

    Raises
    ------
    ValueError
        If `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    probs = np.asarray(preds).astype(np.float64)
    # log(0) = -inf maps back to probability 0 after exp(), so only the warning is silenced.
    with np.errstate(divide="ignore"):
        logits = np.log(probs) / temperature
    # Shift by the maximum before exponentiating: at low temperatures every
    # exp() would otherwise underflow to 0 and the normalization becomes 0/0.
    weights = np.exp(logits - np.max(logits))
    weights = weights / np.sum(weights)
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)

def gen_words(model, seed, word_num, word_dict, seq_length, words_index, temperature=1.0, n_vocab=None):
    """Generate `word_num` words that continue `seed`.

    `nlp` (the spaCy pipeline), `encode_sequences` and `sample` must be defined.

    Parameters
    ----------
    model : object with `predict(encoded)`
        Returns one probability vector per input sequence.
    seed : str
        Starting text; only alphabetic and punctuation tokens are kept.
    word_num : int
        Number of words to generate.
    word_dict : dict
        Maps lowercase word -> integer id (used for encoding).
    seq_length : int
        Number of most recent words fed to the model at each step.
    words_index : sequence of str
        Maps a predicted class index back to its word.
    temperature : float
        Sampling temperature passed to `sample`.
    n_vocab : int, optional
        One-hot width passed to `encode_sequences`; defaults to `len(words_index)`.

    Returns
    -------
    str
        The seed tokens followed by the generated words, joined by spaces.

    Raises
    ------
    NotImplementedError
        Propagated from `encode_sequences` when the seed yields fewer than
        `seq_length` tokens.
    """
    if n_vocab is None:
        n_vocab = len(words_index)

    seed_words = [word.text for word in nlp(seed) if word.is_alpha or word.is_punct]

    # Copy, so appending generated words never mutates the context window.
    generated = list(seed_words)
    # The model must see the most recent `seq_length` words, not the oldest ones.
    window = seed_words[-seq_length:]

    for _ in range(word_num):
        encoded = encode_sequences([window], word_dict, seq_length, n_vocab)
        preds = model.predict(encoded)[0]
        result = sample(preds, temperature)
        next_word = words_index[result]
        generated.append(next_word)
        # Slide the window forward by one word.
        window = window[1:] + [next_word]

    return " ".join(generated)
    

In [59]:
# Generate 200 words from a news-style seed at temperature 0.6.
# NOTE(review): `content_word_dict` and `content_vocab` are not assigned in any
# visible cell (the loading cell defines `content_words_dict` / `content_words`),
# and seq_length=10 differs from the 128 used to build the model above —
# confirm which vocabulary and model this call is meant to use.
gen_words(model=model, seed="breaking news: trump announced a speech in which he denounced",
          word_num=200,
          word_dict=content_word_dict,
          seq_length=10,
          words_index=content_vocab,
         temperature=.6)

  This is separate from the ipykernel package so we can avoid doing imports until


'breaking news : trump announced a speech in which he denounced has called seen been to in to his his the life life presidency because , , a many there notes likely never many on a small more baby group than than , the any right tv professor , . , ” ” but i i it know have , in to i a my like very mind album , , . i i you think am know that is that the the this way national administration is company . often in has only all a creating over familiar ways all , the the ” national small and war source people in of to the the make new community line business , . at but but least all there years of are , trump more though in than , another a putin parents year more would . to take ” remember the a any ” museum genome video director . . . ” but .. they they the were say same , in plan ” which that to is happens be still to the . repeal move ” of in : that obamacare you people and can would no find follow follow why her , would of pontzer have things to'