In [None]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

import tensorflow as tf
import altair as alt
import collections

from src.load_data import read_tables
from src.item_recommender import ItemRecommender
from src.CF_Softmax_Model import CFModel, build_CF_model, compute_scores, book_neighbors, user_recommendations, split_dataframe
from src.Baseline_Model import build_baseline_model

In [None]:
# reviews, books = read_tables('data', 'goodreads_reviews_mystery_thriller_crime.json.gz', 'goodreads_books_mystery_thriller_crime.json.gz')
# reviews.to_csv('data/cleaned_reviews_mystery_1.csv', header=reviews.columns, index=False)
# books.to_csv('data/cleaned_books_mystery_1.csv', header=books.columns, index=False)

In [None]:
reviews = pd.read_csv('data/cleaned_reviews_mystery_1.csv')

In [None]:
books = pd.read_csv('data/cleaned_books_mystery_1.csv')

In [None]:
reviews.head(1)

In [None]:
books.head(1)

In [None]:
# Number of distinct users and distinct books that appear in the reviews.
tuple(len(reviews[column].unique()) for column in ('user_id', 'book_id'))

In [None]:
# Number of distinct authors, publishers and books in the book table.
tuple(len(books[column].unique()) for column in ('author_id', 'publisher', 'book_id'))

In [None]:
# Optional down-sampling switch: when enabled, keep only the rows whose
# user/book ids compare below N (the ids are compared as numbers here).
SAMPLE = False
if SAMPLE:
    N = 10000
    reviews = reviews[reviews['user_id'].lt(N) & reviews['book_id'].lt(N)]
    books = books[books['book_id'].lt(N)]

In [None]:
# some of the tensorflow code assumes the ids are string
# astype with a column->dtype mapping returns a new frame, so this never
# assigns into a frame that may be a slice of another one (which is the case
# when SAMPLE is enabled and would otherwise raise SettingWithCopyWarning).
reviews = reviews.astype({'book_id': str, 'user_id': str})
books = books.astype({'book_id': str, 'author_id': str})

In [52]:
# Build one row per user_id holding the list of every book_id that user reviewed.
rated_books = (
    reviews.loc[:, ['user_id', 'book_id']]
    .groupby('user_id', as_index=False)
    .aggregate(lambda ids: list(ids))
)
rated_books.head()

Unnamed: 0,user_id,book_id
0,7,[10]
1,11,[287]
2,19,[404]
3,49,[544]
4,64,[354]


In [None]:
# Lookup tables keyed by book_id: the book's author_id and its publisher.
author_dict = dict(zip(books["book_id"], books["author_id"]))
publisher_dict = dict(zip(books["book_id"], books["publisher"]))

In [None]:
# Scratch example: five ragged lists of book ids, used to check how a
# DataFrame pads rows of unequal length before the holes are filled with ''.
book = [
    ['82014', '83650', '20417', '84776', '65947', '3912', '1016', '23365',
     '2724', '12991', '21534', '44449', '41186', '95668', '93301'],
    ['17124', '43808', '52912', '55484'],
    ['97224', '80190', '82014', '73302', '70630', '90870', '10395', '21830',
     '86178', '98208', '94335', '20679', '11652', '44449'],
    ['47263', '10395', '55466'],
    ['58826', '60026', '60347', '60346', '92204', '79328', '101791', '96559',
     '100295', '1846', '88505', '95519', '36745', '9954', '82901', '86968',
     '16069', '104449', '75268'],
]

t = pd.DataFrame.from_dict(book)
print(t.head())
t.fillna('').values

In [None]:
def make_batch(ratings, batch_size, max_books=10):
    """Creates a batch of examples.

    Args:
        ratings: A DataFrame of ratings such that ratings["book_id"] is a list
            of books rated by a user.
        batch_size: The batch size.
        max_books: Maximum number of books kept per user; each user's list is
            truncated to its first `max_books` entries. Defaults to 10.

    Returns:
        The get_next() op of a one-shot iterator over the shuffled, endlessly
        repeated and batched dataset. Each element is a dict with the keys
        "book_id" (the id strings, padded with empty strings) and "label"
        (the same ids converted with int(), padded with -1).

    Raises:
        ValueError: if a book id cannot be converted with int().
    """
    def pad(x, fill):
        # Rows have different lengths: a DataFrame lines them up into a
        # rectangular table and the missing cells are replaced by `fill`.
        return pd.DataFrame.from_dict(x).fillna(fill).values

    book = []
    label = []
    for book_ids in ratings["book_id"].values:
        book_ids = book_ids[:max_books]
        book.append(book_ids)
        label.append([int(book_id) for book_id in book_ids])

    features = {
        "book_id": pad(book, ""),
        # The author_id / publisher features are disabled for now. To bring
        # them back, build per-user lists from author_dict / publisher_dict in
        # the loop above and add them here padded with "".
        "label": pad(label, -1),
    }
    batch = (
        tf.data.Dataset.from_tensor_slices(features)
        .shuffle(1000)
        .repeat()
        .batch(batch_size)
        .make_one_shot_iterator()
        .get_next())
    return batch

def select_random(x):
    """Selects one element from each row of x at a random column.

    For every row, the entries that are >= 0 are counted and a column index is
    drawn uniformly in [0, count). The element found at that column is
    returned, cast to int64.

    NOTE(review): this only yields a non-negative entry if the valid entries
    are packed at the start of each row and negative padding is trailing --
    confirm against the caller.
    """
    num_rows = tf.shape(x)[0]
    row_positions = tf.range(num_rows)
    valid_per_row = tf.cast(tf.count_nonzero(x >= 0, axis=1), tf.float32)
    draws = tf.random_uniform([num_rows])
    row_index = tf.cast(row_positions, tf.int64)
    # Scaling a uniform draw by the count and truncating gives the column.
    col_index = tf.cast(valid_per_row * draws, tf.int64)
    coords = tf.stack([row_index, col_index], axis=1)
    return tf.cast(tf.gather_nd(x, coords), tf.int64)

# Smoke test: build a batch op of 100 examples from the full rated_books frame.
# NOTE(review): this rebinds the scratch name `t`, which an earlier cell uses
# for a DataFrame, and it runs outside the tf.Graph() block used later to
# build the model.
t = make_batch(rated_books, 100)

In [None]:
def softmax_loss(user_embeddings, book_embeddings, labels):
    """Mean sparse softmax cross-entropy between user/book logits and labels.

    The logits are the matrix product of user_embeddings with the transpose of
    book_embeddings.

    Args:
        user_embeddings: Tensor whose second dimension is the embedding size.
        book_embeddings: Tensor whose second dimension is the embedding size.
        labels: Labels passed to tf.nn.sparse_softmax_cross_entropy_with_logits.

    Returns:
        The mean of the per-example cross-entropy losses.

    Raises:
        ValueError: if the two embedding sizes differ.
    """
    dims = (user_embeddings.shape[1].value, book_embeddings.shape[1].value)
    if dims[0] != dims[1]:
        raise ValueError(
            'The user embedding dimension %d should match the book embedding dimension %d' % dims)
    logits = tf.matmul(user_embeddings, book_embeddings, transpose_b=True)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    return tf.reduce_mean(per_example)

In [None]:
def build_softmax_model(rated_books, embedding_cols, hidden_dims, learning_rate=1,
                        train_batch_size=200, test_batch_size=100):
    """Builds a softmax recommendation model over users' rated-book lists.

    Args:
        rated_books: DataFrame passed to split_dataframe and then to
            make_batch, which reads its "book_id" column.
        embedding_cols: Feature columns handed to
            tf.feature_column.input_layer.
        hidden_dims: Output sizes of the successive hidden layers. May be
            empty, in which case the input-layer embedding is used directly.
            The final width must equal the book embedding dimension,
            otherwise softmax_loss raises ValueError.
        learning_rate: Forwarded to CFModel.
        train_batch_size: Batch size of the training input. Defaults to 200.
        test_batch_size: Batch size of the test input. Defaults to 100.

    Returns:
        A CFModel constructed from the book embedding variable, the training
        loss, the metrics and the learning rate.
    """
    def create_network(features):
        # Create a bag-of-words embedding for each sparse feature.
        inputs = tf.feature_column.input_layer(features, embedding_cols)
        # With no hidden layers the embedding itself is the network output.
        outputs = inputs
        # Hidden layers: plain matrix products, no activation in between.
        input_dim = inputs.shape[1].value
        for i, output_dim in enumerate(hidden_dims):
            # The variable is scaled down by 10 before it is used.
            w = tf.get_variable(
                'hidden%d_w_' % i, shape=[input_dim, output_dim],
                initializer=tf.truncated_normal_initializer(
                    stddev=1. / np.sqrt(output_dim))) / 10
            outputs = tf.matmul(inputs, w)
            input_dim = output_dim
            inputs = outputs
        return outputs

    train_rated_books, test_rated_books = split_dataframe(rated_books)
    train_batch = make_batch(train_rated_books, train_batch_size)
    test_batch = make_batch(test_rated_books, test_batch_size)

    with tf.variable_scope('model', reuse=False):
        # Train: this pass creates the variables.
        train_user_embeddings = create_network(train_batch)
        train_labels = select_random(train_batch['label'])

    with tf.variable_scope('model', reuse=True):
        # Test: reuses the variables created by the training pass.
        test_user_embeddings = create_network(test_batch)
        test_labels = select_random(test_batch['label'])

        book_embeddings = tf.get_variable("input_layer/book_id_embedding/embedding_weights")

    train_loss = softmax_loss(train_user_embeddings, book_embeddings, train_labels)
    test_loss = softmax_loss(test_user_embeddings, book_embeddings, test_labels)

    # Only the second value returned by precision_at_k is kept.
    _, test_precision_at_10 = tf.metrics.precision_at_k(
        labels=test_labels,
        predictions=tf.matmul(test_user_embeddings, book_embeddings, transpose_b=True),
        k=10)

    metrics = (
        {'train_loss': train_loss, 'test_loss': test_loss},
        {'test_precision_at_10': test_precision_at_10})
    embeddings = {'book_id': book_embeddings}
    return CFModel(embeddings, train_loss, metrics, learning_rate=learning_rate)

In [None]:
books['book_id'].nunique()

In [None]:
def make_embedding_col(key, embedding_dim):
    """Creates a mean-combined embedding column for one column of `books`.

    Args:
        key: Name of the feature; also the column of the module-level `books`
            frame whose distinct values form the vocabulary.
        embedding_dim: Dimension of the embedding.

    Returns:
        A tf.feature_column embedding column over the vocabulary, with no
        out-of-vocabulary buckets.
    """
    # unique() keeps first-appearance order, so the vocabulary (and therefore
    # the row order of the embedding matrix) is the same on every run. A set
    # of strings iterates in hash order, which Python randomizes per process.
    # NOTE(review): make_batch uses int(book_id) as the class label, which only
    # lines up with the embedding rows if a book's vocabulary position equals
    # its id -- confirm against the data.
    vocabulary = list(books[key].unique())
    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=key, vocabulary_list=vocabulary, num_oov_buckets=0)
    return tf.feature_column.embedding_column(
        categorical_column=categorical_col, dimension=embedding_dim, combiner='mean')

In [None]:
# Build the softmax model inside its own graph. Only the book_id embedding is
# active; the author/publisher columns are kept below for later experiments.
with tf.Graph().as_default():
    embedding_cols = [
        make_embedding_col('book_id', 5),
        # make_embedding_col('author_id', 10),
        # make_embedding_col('publisher', 10),
    ]
    softmax_model = build_softmax_model(
        rated_books,
        embedding_cols=embedding_cols,
        hidden_dims=[5],
        learning_rate=0.1,
    )
    

    

In [None]:
# Train the softmax model for 1000 iterations.
softmax_model.train(num_iterations=1000)