In [1]:
import pandas as pd
import os
import collections 
import spacy
import pickle
import numpy as np
import pickle
import re
import math
import unidecode
import concurrent.futures
import tensorflow as tf
import h5py

  return f(*args, **kwds)


In [3]:
data.shape

(142568, 9)

In [2]:
nlp = spacy.load('en')

filename = "../full_articles.csv"
filepath = os.path.join(os.getcwd(), filename)
data = pd.read_csv(filepath)

data.dropna(how="any", subset=["title", "content", "publication"], inplace=True)

SAVE_DIR = "pickles"

In [3]:
data = data.loc[data.publication.apply(lambda x: x == "NPR")]
contents = data.content.tolist()
contents = [unidecode.unidecode(content).lower() for content in contents]
# delete the unneeded data
del data

In [5]:
def process_content(contents_data, sequence_length=128, batch_size=128, decode_unicode=False):
    if decode_unicode:
        contents_data = [unidecode.unidecode(content).lower() for content in contents]
    else:
        contents_data = [content.lower() for content in contents]

    tokenized = [[word.text for word in doc] for doc in nlp.pipe(contents_data, batch_size=128)]
    print("done tokenizing...")
    words_dict = collections.Counter([word for doc in tokenized for word in doc])
    words = list(sorted([word[0] for word in words_dict.most_common()]))
    words_dict = {x: i for i,x in enumerate(words)}
    contents_translated = [[words_dict[word] for word in doc] for doc in tokenized]
    
    batch_chunks = []
    for doc in contents_translated:
        batch_chunks_item = [doc[i:i+sequence_length+batch_size] for i in range(0, len(doc) - batch_size, batch_size)]
        batch_chunks.extend(batch_chunks_item)
    
    return batch_chunks, words_dict, words

content_batches, content_words_dict, content_words = process_content(contents)
# save translators
name = "content_word_rnn"
PKL_SAVE = f"{name}.pkl"
with open(os.path.join("pickles",PKL_SAVE), "wb") as pkl_file:
    pickle.dump((content_batches, content_words_dict, content_words), pkl_file)

done tokenizing...


# load

In [2]:
name = "content_word_rnn"
PKL_SAVE = f"{name}.pkl"
with open(os.path.join("pickles",PKL_SAVE), "rb") as pkl_file:
    content_batches, content_words_dict, content_words = pickle.load(pkl_file)

# convert to HDF5 file for dynamic loading

In [3]:
import h5py

seq_length = 128
f = h5py.File("rnn_data", "w")
predictor_grp = f.create_group("batches")
for i,batch in enumerate(content_batches):
    predictor_grp.create_dataset(name=f"batch{i}", shape=(len(batch),), data=np.array(batch))
f.close()
max_i = i

In [17]:
from keras.utils import Sequence
from keras.utils import to_categorical

class WordRNNSequence(Sequence):
    def __init__(self, batch_idx, batch_data, seq_length, n_vocab, validation=False, training_sequence=None):
        self.batch_idx = batch_idx
        self.batch_data = batch_data
        self.seq_length = seq_length
        self.n_vocab = n_vocab
        if validation and not training_sequence:
            raise FileNotFoundError("need non-null training keras.utils.Sequence")
        if validation:
            assert isinstance(training_sequence, Sequence)
            self.training_sequence = training_sequence
        
    def __len__(self):
        return len(self.batch_idx)
    
    def __getitem__(self, key):
        data = self.batch_data[f"batch{key}"][:]
        len_batch = data.shape[0] - self.seq_length
        batch_x = np.zeros((len_batch, self.seq_length, self.n_vocab), dtype=np.bool)
        for i in range(0, len_batch):
            seq = data[i:i+self.seq_length]
            batch_x[i,] = to_categorical(seq, num_classes=self.n_vocab)
        batch_y = data[self.seq_length:]
        batch_y = to_categorical(batch_y, num_classes=self.n_vocab)
        batch_y = batch_y.astype(np.bool)
        print(f"{batch_x.shape}, {batch_y.shape}")
        return batch_x, batch_y
    
    def on_epoch_end(self):
        if self.validation:
            all_batch_idx = self.batch_idx + self.training_sequence.batch_idx
            self.training_sequence.batch_idx, self.batch_idx = train_test_split(all_batch_idx, test_size=0.2)
               
    def _batch_len(self, true_idx):
        data = self.batch_data[true_idx]
        len_batch = len(data) - int(self.seq_length)
        return len_batch

In [18]:
from keras.utils import to_categorical

def encode_sequences(sequences, word_dict, seq_length, n_vocab):
    data = np.zeros(shape=(len(sequences), seq_length, n_vocab), dtype=np.bool) # extra word for OOV words 
    for i,sequence in enumerate(sequences): 
        if len(sequence) > seq_length:
            sequence = sequence[:seq_length]
        elif len(sequence) < seq_length:
            raise NotImplementedError(f"Need a sequence of length {seq_length}")
        for j,word in enumerate(sequence):
            word_lower = word.lower()
            if word_lower in word_dict:
                data[i, j, word_dict[word_lower]] = 1
            else:
                data[i, j, n_vocab - 1] = 1

    return(data)

def encode_next_words(next_words, word_dict, n_vocab):
    next_word_encode = np.zeros(shape=(len(next_words), n_vocab), dtype=np.bool) # extra word for OOV words 
    for i,next_word in enumerate(next_words):
        next_word_lower = next_word.lower()
        if next_word_lower in word_dict:
            next_word_encode[i, word_dict[next_word_lower]] = 1
        else:
            next_word_encode[i, n_vocab - 1] = 1
            
    return(next_word_encode)

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Bidirectional, LSTM, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy
from sklearn.model_selection import train_test_split
import sklearn

train_idx, validation_idx = train_test_split(np.arange(len(content_batches)), test_size=0.2)

n_nodes = 512
#class WordRNNSequence(Sequence):
#    def __init__(self, batch_idx, batch_data, seq_length, n_vocab, validation=False):
#    content_batches, content_words_dict, content_words = pickle.load(pkl_file)
f = h5py.File("rnn_data", "r")
hdf5_batch_data = f["batches"]

train = {
    "batch_idx": train_idx, 
    "batch_data": hdf5_batch_data, 
    "seq_length": 128, 
    "n_vocab": len(content_words)
}

valid = {
    "batch_idx": validation_idx, 
    "batch_data": hdf5_batch_data, 
    "seq_length": 128, 
    "n_vocab": len(content_words)
}

train_sequence = WordRNNSequence(**train)
valid_sequence = WordRNNSequence(training_sequence=train_sequence, **valid)


SEQ_LENGTH = 128
N_VOCAB = len(content_words)

model = Sequential()
model.add(Bidirectional(LSTM(n_nodes, activation="relu"), input_shape = (SEQ_LENGTH, N_VOCAB)))
model.add(Dropout(0.6))

model.add(Dense(N_VOCAB))
model.add(Activation('softmax'))
optimizer = Adam(lr=0.001)
callbacks = [EarlyStopping(patience=2, monitor="val_loss")]
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=[categorical_accuracy])
MODEL_CHECK_DIR = "checkpoints"
callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath='model_gen.{epoch:02d}-{val_loss:.2f}.hdf5',
                           monitor='val_loss',
                           verbose=0, mode='auto',
                           period=2)]

epochs = 10

model.fit_generator(train_sequence,
                    epochs = epochs,
                    validation_data=valid_sequence,
                    callbacks=callbacks,
                    use_multiprocessing=True,
                    max_queue_size=5)


model.save(os.path.join(os.getcwd(), MODEL_CHECK_DIR, 'model_gen_title.h5'))

Epoch 1/10
Epoch 1/10
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(88, 128, 97103), (88, 97103)
(128, 128, 97103), (128, 97103)
(18, 128, 97103), (18, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)
(128, 128, 97103), (128, 97103)


Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()


  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [13]:
f.close()

<HDF5 file "rnn_data" (mode r)>

In [42]:
def sample(preds, temperature):
    t = np.asarray(preds).astype(np.float64)
    t = np.log(t) / temperature
    t = np.exp(t)
    t = t / np.sum(t)
    probs = np.random.multinomial(1, t, 1)
    return np.argmax(probs)

def gen_words(model, seed, word_num, word_dict, seq_length, words_index, temperature=1.0):
    """`nlp` must be defined"""
    words = [word.text for word in nlp(seed) if word.is_alpha or word.is_punct]
    
    generated = words


    for i in range(word_num):
        encoded = encode_sequence(words, word_dict, seq_length)
        preds = model.predict(encoded)[0]
        result = sample(preds, temperature)
        next_word = words_index[result]
        generated.append(next_word)
        words = words[1:] + [next_word]
        
    return " ".join(generated)
    

In [59]:
gen_words(model=model, seed="breaking news: trump announced a speech in which he denounced",
          word_num=200,
          word_dict=content_word_dict,
          seq_length=10,
          words_index=content_vocab,
         temperature=.6)

  This is separate from the ipykernel package so we can avoid doing imports until


'breaking news : trump announced a speech in which he denounced has called seen been to in to his his the life life presidency because , , a many there notes likely never many on a small more baby group than than , the any right tv professor , . , ” ” but i i it know have , in to i a my like very mind album , , . i i you think am know that is that the the this way national administration is company . often in has only all a creating over familiar ways all , the the ” national small and war source people in of to the the make new community line business , . at but but least all there years of are , trump more though in than , another a putin parents year more would . to take ” remember the a any ” museum genome video director . . . ” but .. they they the were say same , in plan ” which that to is happens be still to the . repeal move ” of in : that obamacare you people and can would no find follow follow why her , would of pontzer have things to'