In [2]:
# Many-to-one LSTM network (Divides the network into n layers evenly starting from the input dimension)
# i.e. a 300 length input vector with 2 levels is split into a 150 node layer and a 1 node layer
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from math import *
import time
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
def fullprint(*args, **kwargs):
  from pprint import pprint
  import numpy
  opt = numpy.get_printoptions()
  numpy.set_printoptions(threshold='nan')
  pprint(*args, **kwargs)
  numpy.set_printoptions(**opt)

In [4]:
NUM_LAYERS = 2
VEC_SIZE = 300
hidden_layer_sizes = [int(VEC_SIZE - i * VEC_SIZE/NUM_LAYERS) for i in range(NUM_LAYERS)] + [1]
print(hidden_layer_sizes)

[300, 150, 1]


In [5]:
# Make headline embeddings
print("Loading model...")
start = time.time()
model = Doc2Vec.load('NYTimesYearDocVecs.bin')
model.delete_temporary_training_data(keep_doctags_vectors=True, 
                                     keep_inference=True)
print("Model loaded in {}s".format(time.time()-start))

Loading model...
Model loaded in 106.238114119s


In [30]:
dv = model.docvecs
doc_tags = dv.offset2doctag
date_tags = [re.match('[0-9]{4}-[0-9]{2}-[0-9]{2}', tag).group()
             for tag in doc_tags]
date_tags = [tuple(int(t) for t in tag.split('-')) for tag in date_tags]
date_tags_uniq = list(set(date_tags))
uniq_date_tags = sorted(sorted(sorted(date_tags_uniq, key=lambda x: x[2]),key=lambda x: x[1]),key=lambda x: x[0])
# print(uniq_date_tags)

[(2008, 3, 1), (2008, 3, 2), (2008, 3, 3), (2008, 3, 4), (2008, 3, 5), (2008, 3, 6), (2008, 3, 7), (2008, 3, 8), (2008, 3, 9), (2008, 3, 10), (2008, 3, 11), (2008, 3, 12), (2008, 3, 13), (2008, 3, 14), (2008, 3, 15), (2008, 3, 16), (2008, 3, 17), (2008, 3, 18), (2008, 3, 19), (2008, 3, 20), (2008, 3, 21), (2008, 3, 22), (2008, 3, 23), (2008, 3, 24), (2008, 3, 25), (2008, 3, 26), (2008, 3, 27), (2008, 3, 28), (2008, 3, 29), (2008, 3, 30), (2008, 3, 31), (2008, 4, 1), (2008, 4, 2), (2008, 4, 3), (2008, 4, 4), (2008, 4, 5), (2008, 4, 6), (2008, 4, 7), (2008, 4, 8), (2008, 4, 9), (2008, 4, 10), (2008, 4, 11), (2008, 4, 12), (2008, 4, 13), (2008, 4, 14), (2008, 4, 15), (2008, 4, 16), (2008, 4, 17), (2008, 4, 18), (2008, 4, 19), (2008, 4, 20), (2008, 4, 21), (2008, 4, 22), (2008, 4, 23), (2008, 4, 24), (2008, 4, 25), (2008, 4, 26), (2008, 4, 27), (2008, 4, 28), (2008, 4, 29), (2008, 4, 30), (2008, 5, 1), (2008, 5, 2), (2008, 5, 3), (2008, 5, 4), (2008, 5, 5), (2008, 5, 6), (2008, 5, 7), (200

In [38]:
date_str_tags = [str(dt.date(year=tag[0], month=tag[1], day=tag[2])) for tag in uniq_date_tags]
# print(date_str_tags)
from collections import Counter, defaultdict
counts = Counter(date_str_tags)
print(counts)
# [model.docvecs[tag] for tag in date_str_tags][:2]
daily_headline = defaultdict(list)
for tag in date_str_tags:
#     print(tag)
    for i in range(counts[tag]):
        print(tag, i)
        docvec = dv[str(tag)+'-'+str(i)]
        print(docvec)
        daily_headline[tag] = daily_headline[tag].append(docvec)
now = dt.datetime.now()
now = dt.date(year=now.year, month=now.month, day=now.day)
print(dv)
print(daily_headline)
print(daily_headline[str(now)])

Counter({'2017-08-29': 1, '2008-11-17': 1, '2008-11-16': 1, '2008-11-15': 1, '2008-11-14': 1, '2008-11-13': 1, '2008-11-12': 1, '2008-11-11': 1, '2008-11-10': 1, '2015-11-11': 1, '2015-11-10': 1, '2015-11-13': 1, '2015-11-12': 1, '2015-11-15': 1, '2015-11-14': 1, '2008-11-19': 1, '2008-11-18': 1, '2016-03-23': 1, '2017-10-01': 1, '2012-07-01': 1, '2016-03-26': 1, '2016-11-28': 1, '2016-03-27': 1, '2018-01-16': 1, '2016-03-28': 1, '2016-03-29': 1, '2017-02-23': 1, '2015-11-19': 1, '2015-11-18': 1, '2013-01-01': 1, '2015-12-15': 1, '2013-01-03': 1, '2013-01-02': 1, '2013-01-05': 1, '2013-01-04': 1, '2012-03-28': 1, '2013-01-06': 1, '2013-01-09': 1, '2013-01-08': 1, '2015-12-18': 1, '2015-12-19': 1, '2014-03-31': 1, '2014-03-30': 1, '2017-10-06': 1, '2012-03-29': 1, '2013-04-17': 1, '2013-04-16': 1, '2012-03-19': 1, '2012-02-27': 1, '2016-09-08': 1, '2016-09-09': 1, '2010-05-08': 1, '2010-05-09': 1, '2010-05-06': 1, '2010-05-07': 1, '2010-05-04': 1, '2010-05-05': 1, '2010-05-02': 1, '2010

KeyError: '2008-03-02-0'

In [None]:
def batch_producer(raw_data, batch_size, num_steps):
    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

    data_len = tf.size(raw_data)
    batch_len = data_len // batch_size
    data = tf.reshape(raw_data[0: batch_size * batch_len],
                      [batch_size, batch_len])

    epoch_size = (batch_len - 1) // num_steps
    
    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
    x = data[:, i * num_steps:(i + 1) * num_steps]
    x.set_shape([batch_size, num_steps])
    y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1]
    y.set_shape([batch_size, num_steps])
    return x, y

In [11]:
# Placeholder, variable batch size (None)
with tf.name_scope('input'):
    X = tf.placeholder(tf.float32, [None, 300])
    Y = tf.placeholder(tf.float64, [None, 1])

In [10]:
def lstm_cell(size):
  return tf.contrib.rnn.BasicLSTMCell(size)

In [18]:
# Normalized sample to initialize the weight matrices
with tf.name_scope('network'):

    state_layers = []
    multiple_cells = []
    # Initial state of the LSTM memory.
    for size in hidden_layer_sizes[1:]:
#         h_i = tf.zeros([current_batch_size, size])
#         c_i = tf.zeros([current_batch_size, size])
#         state = h_i, c_i
#         print(state)
#         state_layers.append(state)
        multiple_cells.append(lstm_cell(size))
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(multiple_cells)
    print(state_layers)
    
    initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32)
#     for i in range(num_steps):
#         # The value of state is updated after processing each batch of words.
#         output, state = stacked_lstm(words[:, i], state)

#         # The rest of the code.
#         # ...

#     final_state = state
#         W1 = tf.Variable(tf.truncated_normal([784, 256], stddev=0.1))
#         b1 = tf.Variable(tf.constant(0.1, shape = [1, 256]))
#         layer1 = tf.nn.relu(tf.matmul(X, W1) + b1)

NameError: name 'current_batch_size' is not defined

In [None]:
# Network 
