In [1]:
import pandas as pd
import os
import collections 
import spacy
import pickle
from spacy.tokenizer import Tokenizer
import numpy as np
import pickle
import re
import math
import unidecode
import concurrent.futures

In [2]:
# Load spaCy's 'en' pipeline once; `fit_gen` and `gen_words` below tokenize
# text through this module-level `nlp` object.
nlp = spacy.load('en')

In [3]:
# NOTE: the spaCy pipeline (`nlp`) is already loaded by the previous cell;
# the second `spacy.load('en')` that used to sit here only cost time and
# memory, so it has been removed.

# Read the article dump that lives one directory above the working directory.
filename = "../full_articles.csv"
filepath = os.path.join(os.getcwd(), filename)
data = pd.read_csv(filepath)

# Drop rows with a missing title, content, or publication. Rebinding instead
# of `inplace=True` keeps the cell free of in-place mutation.
data = data.dropna(how="any", subset=["title", "content", "publication"])

# Directory name for pickled artifacts.
SAVE_DIR = "pickles"

In [4]:
# Keep only the NPR articles. The vectorized comparison builds the same
# boolean mask as an element-wise `apply`, without a Python call per row.
data = data.loc[data["publication"] == "NPR"]

# ASCII-fold (unidecode) and lowercase every article body.
contents = [unidecode.unidecode(article).lower() for article in data["content"].tolist()]

# The DataFrame is no longer needed once the article bodies are extracted;
# drop it to release memory.
del data

In [10]:
def encode_sequences(sequences, word_dict, seq_length, n_vocab):
    """One-hot encode token sequences into a boolean 3-D array.

    Args:
        sequences: sized collection of token sequences (each a sequence of
            str). A sequence longer than ``seq_length`` is truncated to its
            first ``seq_length`` tokens.
        word_dict: mapping from *lowercased* token to its one-hot index.
        seq_length: number of positions per encoded sequence.
        n_vocab: width of the one-hot axis.

    Returns:
        Boolean ndarray of shape ``(len(sequences), seq_length, n_vocab)``
        where ``out[i, j, word_dict[token]]`` is True for the j-th token of
        the i-th sequence.

    Raises:
        NotImplementedError: if a sequence is shorter than ``seq_length``
            (padding is not implemented).
        KeyError: if a lowercased token is missing from a dict ``word_dict``.
    """
    # Builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24 (AttributeError on current releases).
    data = np.zeros(shape=(len(sequences), seq_length, n_vocab), dtype=bool)
    for i, sequence in enumerate(sequences):
        if len(sequence) < seq_length:
            raise NotImplementedError(f"Need a sequence of length {seq_length}")
        # Slicing is a no-op for exact-length input and truncates longer input.
        for j, word in enumerate(sequence[:seq_length]):
            data[i, j, word_dict[word.lower()]] = True
    return data

def encode_next_words(next_words, word_dict, n_vocab):
    """One-hot encode the target token of each training window.

    Args:
        next_words: sized collection of tokens (str), one per window.
        word_dict: mapping from token to its one-hot index. Tokens are looked
            up as given (no lowercasing is applied here).
        n_vocab: width of the one-hot axis.

    Returns:
        Boolean ndarray of shape ``(len(next_words), n_vocab)`` with exactly
        one True per row, at ``word_dict[token]``.

    Raises:
        KeyError: if a token is missing from a dict ``word_dict``.
    """
    # Builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24 (AttributeError on current releases).
    next_word_encode = np.zeros(shape=(len(next_words), n_vocab), dtype=bool)
    for i, next_word in enumerate(next_words):
        next_word_encode[i, word_dict[next_word]] = True
    return next_word_encode

In [11]:
def fit_gen(word_dict,
            seq_length,
            n_vocab,
            step_data=100,
            step_token=1,
            validation=False,
            char_level=False):
    """Endlessly yield one-hot ``(inputs, targets)`` batches, saving each to disk.

    Each batch is built from ``step_data`` consecutive documents: their tokens
    are concatenated into one stream, and every window of ``seq_length``
    tokens (advancing by ``step_token``) is paired with the token following it.

    Reads the module-level globals ``DATA``, ``VALIDATION_INDEX``,
    ``PUBLICATION`` and ``nlp``. When ``validation`` is False the generator
    shuffles ``DATA`` in place and rebinds the globals ``TRAINING_DATA`` and
    ``VALIDATION_DATA``. When ``validation`` is True it only reads
    ``VALIDATION_DATA``; in the code shown that global is assigned nowhere
    else, so a training generator must have started first.

    Args:
        word_dict: token-to-index mapping passed to the encoders.
        seq_length: number of tokens per input window.
        n_vocab: width of the one-hot axis.
        step_data: number of documents consumed per yielded batch.
        step_token: stride between consecutive windows in the token stream.
        validation: draw documents from ``VALIDATION_DATA`` instead of
            ``TRAINING_DATA``.
        char_level: treat each document as a sequence of characters instead
            of tokenizing it with ``nlp``.

    Yields:
        ``(training, target)`` where ``training`` has shape
        ``(n_windows, seq_length, n_vocab)`` and ``target`` has shape
        ``(n_windows, n_vocab)``. Both arrays are also written with
        ``np.save`` under ``data/training`` and ``data/target``.
    """
    print(f"fit gen called for {seq_length} length sequences")
    global DATA, TRAINING_DATA, VALIDATION_DATA, VALIDATION_INDEX
    if validation:
        num_seq = len(VALIDATION_DATA)
    else:
        np.random.shuffle(DATA)
        TRAINING_DATA = DATA[:VALIDATION_INDEX]
        VALIDATION_DATA = DATA[VALIDATION_INDEX:]
        num_seq = len(TRAINING_DATA)
    total_iterations = (num_seq - step_data) // step_data
    while True:
        for i in range(0, num_seq - step_data, step_data):
            # True division is kept on purpose: the float (e.g. "0.0") is part
            # of the saved file names, which stay compatible with earlier runs.
            n_it = i / step_data
            print(f"iteration {n_it} out of {total_iterations}")
            if validation:
                contents = VALIDATION_DATA[i:i+step_data]
            else:
                contents = TRAINING_DATA[i:i+step_data]
            if char_level:
                # Flatten only the current batch of documents. The previous
                # code flattened all of TRAINING_DATA on every iteration,
                # ignoring both the batch slice and the validation flag.
                content_words = [char for doc in contents for char in doc]
            else:
                content_words = [word.text for doc in nlp.pipe(contents, batch_size=step_data) for word in doc if word.is_alpha or word.is_punct]
            sequences = []
            next_words = []
            for j in range(0, len(content_words)-seq_length, step_token):
                sequences.append(content_words[j:j+seq_length])
                next_words.append(content_words[j+seq_length])
            del content_words

            training = encode_sequences(sequences, word_dict, seq_length, n_vocab)
            target = encode_next_words(next_words, word_dict, n_vocab)
            assert training.shape[0] == target.shape[0]
            training_data = (training, target)

            training_path = os.path.join(os.getcwd(), f"data/training/training_{PUBLICATION}_{step_data}_{n_it}.npy")
            target_path = os.path.join(os.getcwd(), f"data/target/target_{PUBLICATION}_{step_data}_{n_it}.npy")
            # np.save does not create missing directories.
            os.makedirs(os.path.dirname(training_path), exist_ok=True)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            np.save(training_path, training)
            print(f"SAVED training{n_it}.npy")
            np.save(target_path, target)
            print(f"SAVED target{n_it}.npy")

            yield training_data
            # Release this batch before building the next one. The previous
            # `del` also named `content_words` (already deleted above) and the
            # undefined `targets`, which raised as soon as the generator was
            # resumed after its first yield.
            del next_words, sequences, training, target, training_data

## Char-RNN

In [6]:
# Character vocabulary: flatten the corpus into one list of characters, count
# them, and index the characters by descending frequency (most common -> 0).
content_chars = [ch for article in contents for ch in article]
content_char_count = collections.Counter(content_chars)
content_char_ind = [ch for ch, _count in content_char_count.most_common()]
content_char_dict = dict(zip(content_char_ind, range(len(content_char_ind))))

In [13]:
# Zero-argument factories for the character-level training / validation
# generators (see `fit_gen`).
batch_size = 64
# `seq_length` used to appear here as a bare, never-assigned name, which raised
# NameError on a fresh kernel. 100 is assumed to mirror the SEQ_LENGTH used by
# the word-level model cell -- TODO confirm the intended window length.
seq_length = 100
# NOTE: `batch_size` and `seq_length` are looked up when a factory is *called*,
# so reassigning either global later (the model cell sets batch_size = 12)
# changes what these factories pass to `fit_gen`.
fit_gen_char = lambda : fit_gen(content_char_dict, seq_length=seq_length, n_vocab=len(content_char_dict), step_data=batch_size, char_level=True)
# Fixed typo: `batch_sze` -> `batch_size` (was a NameError at call time).
fit_gen_char_val = lambda : fit_gen(content_char_dict, seq_length=seq_length, n_vocab=len(content_char_dict), step_data=batch_size, validation=True, char_level=True)

In [5]:
# Save the character-level translators as one pickled tuple:
# (flat character list, index-to-character list, character-to-index dict).
name = "content_char"
PKL_SAVE = f"{name}.pkl"
# open(..., "wb") does not create missing directories.
os.makedirs("pickles", exist_ok=True)
with open(os.path.join("pickles",PKL_SAVE), "wb") as pkl_file:
    # The previous tuple named `content_ind` / `content_dict`, which are never
    # defined; the character vocabulary cell produces `content_char_ind` and
    # `content_char_dict`.
    pickle.dump((content_chars, content_char_ind, content_char_dict), pkl_file)

In [None]:
# Module-level settings that `fit_gen` reads as globals.
PUBLICATION = "NPR"        # tag embedded in the saved batch file names
VALIDATION_SPLIT = 0.1     # fraction of documents held out for validation
DATA = contents
# Index of the first held-out document: `fit_gen` trains on DATA[:index] and
# validates on DATA[index:].
VALIDATION_INDEX = math.floor(len(DATA) * (1 - VALIDATION_SPLIT))

In [15]:
# preprocess and tokenize
#content_words = [word.text for doc in nlp.pipe(contents, batch_size=128) for word in doc if word.is_alpha or word.is_punct]
# create translator
#content_count = collections.Counter(content_words)
#del content_words
# create index-to-word translator
#content_vocab = list(sorted([item[0] for item in content_count.most_common()]))
# create word-to-index translator 
#content_word_dict = {x: i for i,x in enumerate(content_vocab)}

In [16]:
# save translators
#name = "content"
#PKL_SAVE = f"{name}.pkl"
#with open(os.path.join("pickles",PKL_SAVE), "wb") as pkl_file:
#    pickle.dump((contents, content_vocab, content_word_dict), pkl_file)

In [3]:
# load translators
#name = "content"
#PKL_SAVE = f"{name}.pkl"
#with open(os.path.join("pickles",PKL_SAVE), "rb") as pkl_file:
#    contents, content_vocab, content_word_dict = pickle.load(pkl_file)
    
#name = "title"
#PKL_SAVE = f"{name}.pkl"
#with open(os.path.join("pickles",PKL_SAVE), "rb") as pkl_file:
#    title_words, title_vocab, title_word_dict = pickle.load(pkl_file)
    
#title_word_dict = {x: i for i,x in enumerate(title_vocab)}

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Bidirectional, LSTM, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

# Word-level next-word model: a bidirectional LSTM over one-hot windows of
# SEQ_LENGTH tokens, followed by a softmax over the vocabulary.
n_nodes = 512
SEQ_LENGTH = 100
# `n_vocab` used to be referenced without being defined (NameError). The
# generators below are built with n_vocab=len(content_word_dict), so the
# network width must match it. `content_word_dict` has to be in the kernel
# already; the cells that build or load it are currently commented out.
n_vocab = len(content_word_dict)

model = Sequential()
model.add(Bidirectional(LSTM(n_nodes, activation="relu"), input_shape = (SEQ_LENGTH, n_vocab)))
model.add(Dropout(0.6))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
optimizer = Adam(lr=0.001)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=[categorical_accuracy])

MODEL_CHECK_DIR = "checkpoints"
# ModelCheckpoint writes into this directory; make sure it exists.
os.makedirs(os.path.join(os.getcwd(), MODEL_CHECK_DIR), exist_ok=True)
# A first `callbacks` list (EarlyStopping with patience=2) used to be assigned
# before compile and was immediately overwritten by this one; only the
# effective definition is kept.
callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=os.path.join(os.getcwd(),
                                                 MODEL_CHECK_DIR,
                                                 'model_gen.{epoch:02d}-{val_loss:.2f}.hdf5'),
                           monitor='val_loss',
                           verbose=0, mode='auto',
                           period=2)]

batch_size = 12
epochs = 15
steps_per_epoch = int(len(DATA) * (1-VALIDATION_SPLIT) // batch_size)
validation_steps = int(len(DATA) * VALIDATION_SPLIT // batch_size)

fit_generator = lambda : fit_gen(content_word_dict, SEQ_LENGTH, n_vocab=len(content_word_dict), step_data=batch_size)
fit_generator_validation = lambda : fit_gen(content_word_dict, SEQ_LENGTH, n_vocab=len(content_word_dict), validation=True, step_data=batch_size)

# The undefined names `f()` / `g()` are replaced by the factories defined just
# above, and `callbacks` is now actually passed so early stopping and
# checkpointing take effect.
model.fit_generator(fit_generator(),
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    validation_data=fit_generator_validation(),
                    validation_steps=validation_steps,
                    callbacks=callbacks)


model.save(os.path.join(os.getcwd(), MODEL_CHECK_DIR, 'model_gen_title.h5'))

Using TensorFlow backend.
  return f(*args, **kwds)


Epoch 1/15
fit gen called for 100 length sequencesfit gen called for 100 length sequences

iteration 0.0 out of 673


In [42]:
def sample(preds, temperature):
    """Draw one index from ``preds`` after temperature scaling.

    The distribution is re-weighted as ``preds ** (1 / temperature)`` and
    renormalized: temperatures below 1 sharpen it towards the most likely
    index, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: positive scaling factor.

    Returns:
        int: the sampled index (drawn with ``np.random.multinomial``).

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    probs = np.asarray(preds, dtype=np.float64)
    # Zero probabilities are legitimate (log -> -inf, weight -> 0); silence
    # the divide-by-zero RuntimeWarning they would otherwise trigger.
    with np.errstate(divide="ignore"):
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0. The distribution is unchanged, but at
    # very low temperatures this stops every weight underflowing to 0, which
    # previously produced 0/0 = NaN.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    weights = weights / np.sum(weights)
    draw = np.random.multinomial(1, weights, 1)
    return int(np.argmax(draw))

def gen_words(model, seed, word_num, word_dict, seq_length, words_index, temperature=1.0):
    """Generate ``word_num`` words that continue ``seed``.

    The module-level spaCy pipeline ``nlp`` must be defined; it tokenizes the
    seed, keeping alphabetic and punctuation tokens only.

    Args:
        model: object whose ``predict`` accepts a one-hot array of shape
            ``(1, seq_length, len(word_dict))`` and returns one row of
            next-word probabilities per input.
        seed: text to continue. It must tokenize to at least ``seq_length``
            tokens.
        word_num: number of words to generate.
        word_dict: token-to-index mapping used for encoding.
        seq_length: length of the context window fed to the model.
        words_index: index-to-token sequence used to decode a sampled index.
        temperature: sampling temperature forwarded to ``sample``.

    Returns:
        str: the seed tokens followed by the generated words, space-joined.

    Raises:
        NotImplementedError: from ``encode_sequences`` when the seed yields
            fewer than ``seq_length`` tokens.
    """
    words = [word.text for word in nlp(seed) if word.is_alpha or word.is_punct]

    # `generated` must be a copy. It used to alias `words`, so the first
    # generated word was appended to the context as well and then added again
    # by the slide below, leaving a duplicated, over-long window.
    generated = list(words)
    # Condition on the most recent tokens; the encoder truncates from the
    # front, which previously discarded the newest words instead.
    window = words[-seq_length:]
    # Assumes the model's one-hot width equals len(word_dict), as in the
    # training cell -- confirm if a different vocabulary size was used.
    n_vocab = len(word_dict)

    for _ in range(word_num):
        # `encode_sequences` expects a batch, hence the single-element list
        # (the old call targeted a nonexistent `encode_sequence`).
        encoded = encode_sequences([window], word_dict, seq_length, n_vocab)
        preds = model.predict(encoded)[0]
        result = sample(preds, temperature)
        next_word = words_index[result]
        generated.append(next_word)
        window = window[1:] + [next_word]

    return " ".join(generated)
    

In [59]:
# Generate 200 words continuing the seed text at temperature 0.6.
gen_words(
    model=model,
    seed="breaking news: trump announced a speech in which he denounced",
    word_num=200,
    word_dict=content_word_dict,
    words_index=content_vocab,
    seq_length=10,
    temperature=.6,
)

  This is separate from the ipykernel package so we can avoid doing imports until


'breaking news : trump announced a speech in which he denounced has called seen been to in to his his the life life presidency because , , a many there notes likely never many on a small more baby group than than , the any right tv professor , . , ” ” but i i it know have , in to i a my like very mind album , , . i i you think am know that is that the the this way national administration is company . often in has only all a creating over familiar ways all , the the ” national small and war source people in of to the the make new community line business , . at but but least all there years of are , trump more though in than , another a putin parents year more would . to take ” remember the a any ” museum genome video director . . . ” but .. they they the were say same , in plan ” which that to is happens be still to the . repeal move ” of in : that obamacare you people and can would no find follow follow why her , would of pontzer have things to'