In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import random

In [4]:
with open('../SubRecommender/data/train_reddit_data.json','r') as data_file:    
    reddit_data = json.load(data_file)

In [30]:
def chunks(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

In [60]:
df = pd.DataFrame(reddit_data,columns=['user','subreddit','utc_stamp'])
df['utc_stamp'] = pd.to_datetime(df['utc_stamp'],unit='s')
df.sort_values(by=['user','utc_stamp'], ascending=True, inplace=True)
users = list(df.groupby('user')['user'].nunique().keys())
sub_list = list(df.groupby('subreddit')['subreddit'].nunique().keys())
training_sequences = []
training_labels = []
seq_lengths = []
for usr in users:
    user_comment_subs = list(df.loc[df['user'] == usr]['subreddit'].values)
    comment_chunks = chunks(user_comment_subs,25)
    for chnk in comment_chunks:
        label = sub_list.index(random.choice(chnk))
        training_labels.append(label)
        chnk_seq = [sub_list.index(sub) for sub in chnk if sub_list.index(sub) != label]
        training_sequences.append(chnk_seq)  
        seq_lengths.append(len(chnk_seq))

In [54]:
print('training_seq = ' + str(training_sequences[10]))
print('training_label = ' + training_labels[10])

training_seq = ['techsupport', 'indie', 'funny', 'gameideas', 'pcmasterrace', 'skateboarding', 'skateboarding', 'skateboarding', 'skateboarding', 'CodAW', 'gaming', 'GrandTheftAutoV', 'trees', 'gaming', 'skateboarding', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'whatisthisthing', 'longboarding', 'pcmasterrace']
training_label = pics


In [61]:
train_df = pd.DataFrame({'sub_seqs':training_sequences,'sub_label':training_labels,'seq_length':seq_lengths})
train_df.head()

Unnamed: 0,seq_length,sub_label,sub_seqs
0,21,1623,"[1298, 1297, 1298, 1298, 1298, 1298, 1298, 129..."
1,6,4164,"[3465, 4267, 1908, 2959, 1623, 1623]"
2,3,4164,"[1908, 3803, 3803]"
3,10,4164,"[263, 263, 184, 184, 184, 852, 852, 1374, 1298..."
4,3,4078,"[4164, 4164, 852]"


In [73]:
train_len, test_len = np.floor(len(train_df)*0.8), np.floor(len(train_df)*0.2)
train, test = train_df.ix[:train_len-1], train_df.ix[train_len:train_len + test_len]

In [68]:
vocab = sub_list

In [218]:
class BucketedDataIterator():
    def __init__(self, df, num_buckets = 5):
        df = df.sort_values('seq_length').reset_index(drop=True)
        self.size = len(df) / num_buckets
        self.dfs = []
        for bucket in range(num_buckets):
            self.dfs.append(df.ix[bucket*self.size: (bucket+1)*self.size - 1])
        self.num_buckets = num_buckets

        # cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.shuffle()

        self.epochs = 0

    def shuffle(self):
        #sorts dataframe by sequence length, but keeps it random within the same length
        for i in range(self.num_buckets):
            self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0

    def next_batch(self, n):
        if np.any(self.cursor+n+1 > self.size):
            self.epochs += 1
            self.shuffle()

        i = np.random.randint(0,self.num_buckets)

        res = self.dfs[i].ix[self.cursor[i]:self.cursor[i]+n-1]
        self.cursor[i] += n

        # Pad sequences with 0s so they are all the same length
        maxlen = max(res['seq_length'])
        x = np.zeros([n, maxlen], dtype=np.int32)
        for i, x_i in enumerate(x):
            x_i[:res['seq_length'].values[i]] = res['sub_label'].values[i]

        return x, res['sub_label'], res['seq_length']

In [269]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def build_graph(
    vocab_size = len(vocab),
    state_size = 64,
    batch_size = 256,
    num_classes = len(vocab)):

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])
    keep_prob = tf.placeholder_with_default(1.0, [])

    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # Add dropout, as the model otherwise quickly overfits
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)

    """
    Obtain the last relevant output. The best approach in the future will be to use:

        last_rnn_output = tf.gather_nd(rnn_outputs, tf.pack([tf.range(batch_size), seqlen-1], axis=1))

    which is the Tensorflow equivalent of numpy's rnn_outputs[range(30), seqlen-1, :], but the
    gradient for this op has not been implemented as of this writing.

    The below solution works, but throws a UserWarning re: the gradient.
    """
    idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
    last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)

    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy,
        'saver': tf.train.Saver()
    }

def train_graph(graph, batch_size = 256, num_epochs = 10, iterator = PaddedDataIterator,save=False):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        te = iterator(test)

        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 0.6}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                #eval test set
                te_epoch = te.epochs
                while te.epochs == te_epoch:
                    step += 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                te_losses.append(accuracy / step)
                step, accuracy = 0,0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
                
        if isinstance(save, str):
            g['saver'].save(sess, save)

    return tr_losses, te_losses

def recommend_subs(g, checkpoint,pred_data,batch_size):
    te = BucketedDataIterator(pred_data,num_buckets=1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        g['saver'].restore(sess, checkpoint)
        batch = te.next_batch(batch_size)
        feed = {g['x']: batch[0],g['y']: batch[1], g['seqlen']: batch[2]}
        preds = sess.run([g['preds']], feed_dict=feed)[0]
        return tf.cast(tf.argmax(preds,1),tf.int32).eval()

In [230]:
g = build_graph()
tr_losses, te_losses = train_graph(g,num_epochs=1,iterator=BucketedDataIterator,
                                   save='../SubRecommender/models/seqtest10')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[2593 2593 2593 ..., 2593 2593    0]
 [4060 4060 4060 ..., 4060 4060    0]
 [3131 3131 3131 ..., 3131 3131    0]
 ..., 
 [4151 4151 4151 ..., 4151 4151    0]
 [3697 3697 3697 ..., 3697    0    0]
 [2348 2348 2348 ..., 2348 2348    0]]
[[ 690  690  690 ...,  690  690  690]
 [4078 4078 4078 ..., 4078 4078 4078]
 [4204 4204 4204 ..., 4204 4204 4204]
 ..., 
 [3542 3542 3542 ..., 3542 3542 3542]
 [3733 3733 3733 ..., 3733 3733 3733]
 [1248 1248 1248 ..., 1248 1248 1248]]
Accuracy after epoch 1  - tr: 0.000434027777778 - te: 0.001953125


In [195]:
def pad_pred_data(pred_data,batch_size):
    i = len(pred_data)
    while i < batch_size:
        pred_data.append([0])
        i = i + 1
    return pred_data

In [182]:
test.head()

Unnamed: 0,seq_length,sub_label,sub_seqs
5886,7,4117,"[3492, 273, 2930, 2930, 3019, 3492, 2930]"
5887,19,4117,"[2930, 2930, 273, 273, 1329, 2181, 3492, 2181,..."
5888,12,4117,"[1329, 1329, 2181, 3492, 273, 2930, 3623, 3492..."
5889,18,1329,"[2181, 2181, 4117, 273, 3623, 3623, 2930, 2930..."
5890,18,2930,"[4117, 1329, 1329, 273, 3492, 3492, 4117, 4117..."


In [183]:
test_df.head()

Unnamed: 0,sub_seqs,sub_label,seq_length
0,"[18, 5, 40, 23, 1004, 34, 67, 1234]",0,8
1,[0],0,1
2,[0],0,1
3,[0],0,1
4,[0],0,1


In [232]:
test_preds = [[3492, 273, 2930, 2930, 3019, 3492, 2930]]
pad_test_preds = pad_pred_data(test_preds,256)
test_df = pd.DataFrame({'sub_seqs':pad_test_preds})
test_df['sub_label'] = 0
test_df['seq_length'] = test_df.apply (lambda row: len(row['sub_seqs']),axis=1)

In [268]:
 recs = recommend_subs(g,'../SubRecommender/models/seqtest10',test,256)

ValueError: Cannot feed value of shape (1472, 24) for Tensor 'Placeholder:0', which has shape '(256, ?)'

In [266]:
recs

array([  69, 2934,  285,  281,  390, 2972,  595,  556, 2972,  810,  583,
       1454, 1247, 1748, 2614,   45, 4056,  868, 2055, 1160, 4421, 2386,
       4428, 2977, 4083, 2355,  244, 4184, 2532, 4341,   52, 3950, 3510,
       3246, 4341,  694, 2355,   68,  283, 2235, 1247, 4184, 3608,   45,
       2582, 4202,  556, 3246, 3981,  586, 2972, 1748, 1418,  645,  707,
         45,  691, 2575,  390, 1340,   45, 1411, 1204, 2934, 3977,  406,
        406, 2905, 1216,  869, 2517,  583, 1204, 1482, 2165, 3849, 3372,
       1656, 1313, 3588,  949, 4074, 2120, 2125, 1247, 2016, 3981, 4341,
       1454, 1917, 1454, 1247,  543,   23,  544, 1160,  829,   45, 2488,
       1313,  281,   45, 1382, 1809, 1160, 4074,  317, 3608, 2355, 2153,
       1777,  548,  316, 4387,  390, 1092, 1247,   68,  784, 2407,   84,
         49,  986,  113, 4341, 1160, 1809, 3977, 2403,  884,  502,   68,
         68, 4341, 2355,   45, 4341, 4083, 4341,  869,   45, 1501,   68,
       1313,  183,  185, 2456, 1204, 1302, 4341, 12