In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import random

In [4]:
with open('../SubRecommender/data/train_reddit_data.json','r') as data_file:    
    reddit_data = json.load(data_file)

In [30]:
def chunks(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

In [60]:
df = pd.DataFrame(reddit_data,columns=['user','subreddit','utc_stamp'])
df['utc_stamp'] = pd.to_datetime(df['utc_stamp'],unit='s')
df.sort_values(by=['user','utc_stamp'], ascending=True, inplace=True)
users = list(df.groupby('user')['user'].nunique().keys())
sub_list = list(df.groupby('subreddit')['subreddit'].nunique().keys())
training_sequences = []
training_labels = []
seq_lengths = []
for usr in users:
    user_comment_subs = list(df.loc[df['user'] == usr]['subreddit'].values)
    comment_chunks = chunks(user_comment_subs,25)
    for chnk in comment_chunks:
        label = sub_list.index(random.choice(chnk))
        training_labels.append(label)
        chnk_seq = [sub_list.index(sub) for sub in chnk if sub_list.index(sub) != label]
        training_sequences.append(chnk_seq)  
        seq_lengths.append(len(chnk_seq))

In [54]:
print('training_seq = ' + str(training_sequences[10]))
print('training_label = ' + training_labels[10])

training_seq = ['techsupport', 'indie', 'funny', 'gameideas', 'pcmasterrace', 'skateboarding', 'skateboarding', 'skateboarding', 'skateboarding', 'CodAW', 'gaming', 'GrandTheftAutoV', 'trees', 'gaming', 'skateboarding', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'pcmasterrace', 'whatisthisthing', 'longboarding', 'pcmasterrace']
training_label = pics


In [61]:
train_df = pd.DataFrame({'sub_seqs':training_sequences,'sub_label':training_labels,'seq_length':seq_lengths})
train_df.head()

Unnamed: 0,seq_length,sub_label,sub_seqs
0,21,1623,"[1298, 1297, 1298, 1298, 1298, 1298, 1298, 129..."
1,6,4164,"[3465, 4267, 1908, 2959, 1623, 1623]"
2,3,4164,"[1908, 3803, 3803]"
3,10,4164,"[263, 263, 184, 184, 184, 852, 852, 1374, 1298..."
4,3,4078,"[4164, 4164, 852]"


In [73]:
train_len, test_len = np.floor(len(train_df)*0.8), np.floor(len(train_df)*0.2)
train, test = train_df.ix[:train_len-1], train_df.ix[train_len:train_len + test_len]

In [62]:
class SimpleDataIterator():
    def __init__(self, df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0

    def next_batch(self, n):
        if self.cursor+n-1 > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.ix[self.cursor:self.cursor+n-1]
        self.cursor += n
        return res['sub_seqs'], res['sub_label'], res['seq_length']

In [64]:
data = SimpleDataIterator(train_df)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')
print('Target values\n', d[1], end='\n\n')
print('Sequence lengths\n', d[2])

Input sequences
 0    [4183, 4183, 946, 1103, 3300, 3300, 946, 1103,...
1    [3081, 969, 2605, 2605, 861, 969, 861, 3257, 1...
2    [1927, 2283, 4004, 4200, 194, 1927, 2809, 194,...
Name: sub_seqs, dtype: object

Target values
 0     755
1    3041
2    2566
Name: sub_label, dtype: int64

Sequence lengths
 0    18
1    12
2    24
Name: seq_length, dtype: int64


In [65]:
class PaddedDataIterator(SimpleDataIterator):
    def next_batch(self, n):
        if self.cursor+n > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.ix[self.cursor:self.cursor+n-1]
        self.cursor += n

        # Pad sequences with 0s so they are all the same length
        maxlen = max(res['seq_length'])
        x = np.zeros([n, maxlen], dtype=np.int32)
        for i, x_i in enumerate(x):
            x_i[:res['seq_length'].values[i]] = res['sub_seqs'].values[i]

        return x, res['sub_label'], res['seq_length']

In [66]:
data = PaddedDataIterator(train_df)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')

Input sequences
 [[1794 3990 1794 1794 3990 4428 4428 3990 4428  296 1777 1777 3542 4428
  1794 3990 3990 3990 3990 3990 3990 3990 2920]
 [2989 2070 3366  606 3415 2841 3102 4432 3102 4433 3661 4352 4352  833
  3665 2582 2582 2582 1777 2841 2271 2841   12]
 [1960 1960 1960 4218 4218 1960 2320 2320 4067 1777 1777 4067 2320 1777
  1960    0    0    0    0    0    0    0    0]]



In [68]:
vocab = sub_list

In [87]:
class BucketedDataIterator():
    def __init__(self, df, num_buckets = 5):
        df = df.sort_values('seq_length').reset_index(drop=True)
        self.size = len(df) / num_buckets
        self.dfs = []
        for bucket in range(num_buckets):
            self.dfs.append(df.ix[bucket*self.size: (bucket+1)*self.size - 1])
        self.num_buckets = num_buckets

        # cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.shuffle()

        self.epochs = 0

    def shuffle(self):
        #sorts dataframe by sequence length, but keeps it random within the same length
        for i in range(self.num_buckets):
            self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0

    def next_batch(self, n):
        if np.any(self.cursor+n+1 > self.size):
            self.epochs += 1
            self.shuffle()

        i = np.random.randint(0,self.num_buckets)

        res = self.dfs[i].ix[self.cursor[i]:self.cursor[i]+n-1]
        self.cursor[i] += n

        # Pad sequences with 0s so they are all the same length
        maxlen = max(res['seq_length'])
        x = np.zeros([n, maxlen], dtype=np.int32)
        for i, x_i in enumerate(x):
            x_i[:res['seq_length'].values[i]] = res['sub_label'].values[i]

        return x, res['sub_label'], res['seq_length']

In [88]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def build_graph(
    vocab_size = len(vocab),
    state_size = 64,
    batch_size = 256,
    num_classes = len(vocab)):

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])
    keep_prob = tf.placeholder_with_default(1.0, [])

    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # Add dropout, as the model otherwise quickly overfits
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)

    """
    Obtain the last relevant output. The best approach in the future will be to use:

        last_rnn_output = tf.gather_nd(rnn_outputs, tf.pack([tf.range(batch_size), seqlen-1], axis=1))

    which is the Tensorflow equivalent of numpy's rnn_outputs[range(30), seqlen-1, :], but the
    gradient for this op has not been implemented as of this writing.

    The below solution works, but throws a UserWarning re: the gradient.
    """
    idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
    last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)

    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }

def train_graph(graph, batch_size = 256, num_epochs = 10, iterator = PaddedDataIterator,save=False):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        te = iterator(test)

        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 0.6}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                #eval test set
                te_epoch = te.epochs
                while te.epochs == te_epoch:
                    step += 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                te_losses.append(accuracy / step)
                step, accuracy = 0,0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
                
        if isinstance(save, str):
            g['saver'].save(sess, save)

    return tr_losses, te_losses

def recommend_subs(g, checkpoint, pred_data,iterator):
    te = iterator(pred_data)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        g['saver'].restore(sess, checkpoint)
        feed = {g['x']: pred_data}
        preds = sess.run([g['preds']], feed_dict=feed)
    return preds

In [91]:
g = build_graph()
tr_losses, te_losses = train_graph(g,num_epochs=10,iterator=BucketedDataIterator)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Accuracy after epoch 1  - tr: 0.00572916666667 - te: 0.01171875
Accuracy after epoch 2  - tr: 0.046875 - te: 0.0859375
Accuracy after epoch 3  - tr: 0.078125 - te: 0.19140625
Accuracy after epoch 4  - tr: 0.17138671875 - te: 0.2734375
Accuracy after epoch 5  - tr: 0.157986111111 - te: 0.33984375
Accuracy after epoch 6  - tr: 0.243489583333 - te: 0.328125
Accuracy after epoch 7  - tr: 0.286892361111 - te: 0.36328125
Accuracy after epoch 8  - tr: 0.287388392857 - te: 0.375
Accuracy after epoch 9  - tr: 0.29833984375 - te: 0.28515625
Accuracy after epoch 10  - tr: 0.2359375 - te: 0.33203125


In [None]:
g = build_seq2seq_graph()
tr_losses, te_losses = train_graph(g, iterator=BucketedDataIterator)