In [10]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

import tensorflow as tf
import altair as alt
import collections
from sklearn.model_selection import train_test_split


from src.clean_data import clean_books, clean_reviews, scale_data
from src.EDAFunction import read_reviews, read_books
from src.item_recommender import ItemRecommender
from src.CFModel import CFModel, build_model, compute_scores, book_neighbors

In [2]:
books_df = read_books(os.path.join('data/', 'goodreads_books_mystery_thriller_crime.json.gz'), head=False)

counting file: data/goodreads_books_mystery_thriller_crime.json.gz
current line: 0,complete
done!


In [3]:
books_df1 = books_df.copy()

### content based filter

In [4]:
cleaned_books = clean_books(books_df1)

In [5]:
cleaned_books.head()

Unnamed: 0_level_0,title,num_pages,is_ebook,average_rating,ratings_count,text_reviews_count
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6066814,"Crowner Royal (Crowner John Mystery, #13)",400,0,3.93,186,15
33394837,The House of Memory (Pluto's Snitch #2),318,1,4.33,269,60
29074697,The Slaughtered Virgin of Zenopolis (Inspector Capstan #1),0,1,3.49,192,23
1902202,"Dead in the Morning (Patrick Grant, #1)",0,0,3.3,52,8
9671977,Aristotele e i misteri di Eleusi,659,0,3.54,22,3


In [None]:
# cleaned_books1 = cleaned_books.loc[book_ids]
# len(cleaned_books1)

In [None]:
# scaled_df = scale_data(cleaned_books1)
# len(scaled_df)

In [None]:
# rec = ItemRecommender()
# rec_model = rec.fit(scaled_df) #the computation complexity is 4 billion (200K books in total)

### collaborative filter

In [2]:
reviews_df = read_reviews(os.path.join('data/', 'goodreads_reviews_mystery_thriller_crime.json.gz'))

counting file: data/goodreads_reviews_mystery_thriller_crime.json.gz
current line: 0,1000000,complete
done!


In [4]:
cleaned_reviews = clean_reviews(reviews_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


remapping the ids


In [5]:
cleaned_reviews.head(2)

Unnamed: 0,review_id,user_id,book_id,rating,timestamp,n_votes,n_comments,old_user_id,old_book_id
620327,d23dc89ab32cd864e54d18369751163b,168861,77248,3,2001-01-01 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,196084
620326,c7613da4cbd48baa83efec99b4dd6a41,168861,196059,3,2001-01-11 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,79030


### build sparse matrix and baseline model

In [13]:
from src.CFModel import CFModel, build_model, compute_scores, book_neighbors
model = build_model(cleaned_reviews, embedding_dim=30, init_stddev=0.5)


In [15]:
model.train(num_iterations=100, learning_rate=1.)

 iteration 40: train_error=17.252905, test_error=17.680166

KeyboardInterrupt: 

### get neighbors of a given book name

In [190]:
DOT = 'dot'
COSINE = 'cosine'
def compute_scores(query_embedding, item_embeddings, measure=COSINE):
  """Computes the scores of the candidates given a query.
  Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding
      of item i.
    measure: a string specifying the similarity measure to be used. Can be
      either DOT or COSINE.
  Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
  """
  u = query_embedding
  V = item_embeddings
  if measure == COSINE:
    V = V / np.linalg.norm(V, axis=1, keepdims=True)
    u = u / np.linalg.norm(u)
  scores = u.dot(V.T)
  return scores

# def user_recommendations(model, measure=DOT, exclude_rated=False, k=6):
#   if USER_RATINGS:
#     scores = compute_scores(
#         model.embeddings["user_id"][943], model.embeddings["book_id"], measure)
#     score_key = measure + ' score'
#     df = pd.DataFrame({
#         score_key: list(scores),
#         'book_id': books['book_id'],
#         'titles': books['title'],
#         'genres': books['all_genres'],
#     })
#     if exclude_rated:
#       # remove books that are already rated
#       rated_books = ratings[ratings.user_id == "943"]["book_id"].values
#       df = df[df.book_id.apply(lambda book_id: book_id not in rated_books)]
#     display.display(df.sort_values([score_key], ascending=False).head(k))  

def book_neighbors(model, title_substring, measure=COSINE, k=6):
  # Search for book ids that match the given substring.
    ids =  cleaned_books1[cleaned_books1['title'].str.contains(title_substring)].index.values
    titles = cleaned_books1.loc[ids]['title'].values
    if len(titles) == 0:
        raise ValueError("Found no books with title %s" % title_substring)
    print("Nearest neighbors of : %s." % titles[0])
    if len(titles) > 1:
        print("[Found more than one matching book. Other candidates: {}]".format(
        ", ".join(titles[1:])))
    book_id = cleaned_reviews[cleaned_reviews.old_book_id == ids[0]].book_id
    scores = compute_scores(
      model.embeddings["book_id"][book_id], model.embeddings["book_id"],
      measure)
    score_key = measure + ' score'
    df = pd.DataFrame({
      score_key: scores[0],
      'titles': cleaned_books1['title'],
    'is_ebook': cleaned_books1['is_ebook'],
    'average_rating': cleaned_books1['average_rating'],
    'ratings_count': cleaned_books1['ratings_count']
  })
    display.display(df.sort_values([score_key], ascending=False).head(k))

In [191]:
title_substring ='Harry Potter'
book_neighbors(model, title_substring, measure=COSINE, k=6)

Nearest neighbors of : Harry Potter and the Half-Blood Prince by J.K. Rowling | Summary & Study Guide.


Unnamed: 0_level_0,cosine score,titles,is_ebook,average_rating,ratings_count
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22067806,1.0000001192092896,Missing You by Harlan Coben -- Review,1,4.05,2
12768886,0.7083189487457275,"Εγκλήματα στην πανσιόν ""Απόλλων""",0,2.34,33
2205477,0.695481538772583,Real World,0,3.4,246
27384959,0.6858937740325928,The Bourbon Thief,1,4.12,537
23198677,0.6831674575805664,Primeros Casos de Poirot,0,3.92,6
30364183,0.6708451509475708,"Death of a Nurse (Hamish Macbeth, #31)",0,3.76,10


In [229]:
cleaned_books1[cleaned_books1.title.apply(lambda x:True if 'Harry Potter' in x else False)]

Unnamed: 0_level_0,title,num_pages,is_ebook,average_rating,ratings_count,text_reviews_count
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25572694,Harry Potter and the Half-Blood Prince by J.K. Rowling | Summary & Study Guide,0,1,5.0,2,1


### inspect neighbor clustering

In [195]:
def book_embedding_norm(models):
    """
    visualizes the norm and number of ratings of the book
    Args:
        model: a MFmodel object
    """
    if not isinstance(models, list):
        models = [models]
    df=pd.DataFrame({
        'title': cleaned_books1['title'],
        'average_rating': cleaned_books1['average_rating'],
        'ratings_count':cleaned_books1['ratings_count']
    })
    charts = []
    brush = alt.selection_interval()
    for i, model in enumerate(models):
        norm_key = 'norm' +str(i)
        df[norm_key] = np.linalg.norm(model.embeddings['book_id'], axis=1)
        nearest = alt.selection(
                type='single', encodings=['x', 'y'], on='mouseover', nearest=True,
                empty='none')
        base = alt.Chart().mark_circle().encode(
                x='ratings_count',
                y=norm_key,
               color=alt.condition(brush, alt.value('#4c78a8'), alt.value('lightgray'))
            ).properties(
            selection=nearest).add_selection(brush)
        text = alt.Chart().mark_text(align='center', dx=5, dy=-5).encode(
        x='ratings_count', y=norm_key,
        text=alt.condition(nearest, 'title', alt.value('')))
        charts.append(alt.layer(base, text))
    return alt.hconcat(*charts, data=df)
    
    

In [196]:
book_embedding_norm(model)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.HConcatChart(...)

In [9]:
from scipy.sparse import csr_matrix
# csr_matrix((train_reviews.rating.values, (train_reviews.user_id.values, train_reviews.book_id.values)), 
#            shape=(len(train_reviews.user_id.unique()), len(train_reviews.book_id.unique()))).toarray()

#rating matrix will be too sparse if we convert the rating data into a pd pivot table 
#rating_mat = pd.pivot_table(train_reviews, values=['rating'], index=['user_id'], columns =['book_id'])