In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display


from src.clean_data import clean_books, clean_reviews, scale_data, review_map_id
from src.load_data import read_reviews, read_books

### Load book data

In [2]:
books_df = read_books(os.path.join('data/', 'goodreads_books_mystery_thriller_crime.json.gz'), head=False)

counting file: data/goodreads_books_mystery_thriller_crime.json.gz
current line: 0,complete
done!


In [3]:
books_df1 = books_df.copy()

In [4]:
len(books_df1)

219235

In [5]:
cleaned_books = clean_books(books_df1)

In [55]:
len(cleaned_books)

182904

In [48]:
# we notice there are different versions of same book. Since they have different publisher/average rating, we treat
# them as different book
books_df1[books_df1.title == 'The Adventures of Sherlock Holmes']

Unnamed: 0,book_id,work_id,isbn,asin,title,description,num_pages,is_ebook,link,country_code,language_code,average_rating,ratings_count,text_reviews_count,author_id,publisher,publication_year,similar_books
1478,12639858,1222101,161293028X,,The Adventures of Sherlock Holmes,"This is a beautifully-designed new edition of the definitive ""Adventures of Sherlock Holmes"" by Sir Arthur Conan Doyle. Includes 12 complete short stories, comprising the best-known cases of the world's best-loved sleuth.",236,0,https://www.goodreads.com/book/show/12639858-the-adventures-of-sherlock-holmes,US,,4.3,29,1,2448,Tribeca Books,2011,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
1654,24564118,1222101,0140661008,,The Adventures of Sherlock Holmes,"The Adventures of Sherlock Holmes is a collection of twelve stories by Sir Arthur Conan Doyle, featuring his famous detective and illustrated by Sidney Paget.\nThese are the first of the Sherlock Holmes short stories, originally published as single stories in the Strand Magazine from July 1891 to June 1892. The book was published in England on October 14, 1892 by George Newnes Ltd and in a US Edition on October 15 by Harper. The initial combined print run was 14,500 copies.",302,0,https://www.goodreads.com/book/show/24564118-the-adventures-of-sherlock-holmes,US,eng,4.3,1,1,2448,Penguin Group,1994,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
2827,11196320,1222101,0786105119,,The Adventures of Sherlock Holmes,"First published in 1891-1892, stories read include The Red-Headed League, The Man With the Twisted Lip, The Five Orange Pips, The Notable Bachelor, A Scandal in Bohemia, and The Engineer's Thumb.",10,0,https://www.goodreads.com/book/show/11196320-the-adventures-of-sherlock-holmes,US,,4.3,6,2,2448,Blackstone Audiobooks,1980,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
5583,5774655,1222101,,B001LF3366,The Adventures of Sherlock Holmes,"Complete in nine handsome volumes, each with an introduction by a Doyle scholar, a chronology, a selected bibliography, and explanatory notes, the Oxford Sherlock Holmes series offers a definitive collection of the famous detective's adventures. No home library is complete without it.\nComprising the series of short stories that made the fortunes of the Strand, the magazine in which they were first published, this volume won even more popularity for Sherlock Holmes and Dr. Watson. Holmes is at the height of his powers in many of his most famous cases, including The Red-Headed League, The Speckled Band, and The Blue Carbuncle.",0,1,https://www.goodreads.com/book/show/5774655-the-adventures-of-sherlock-holmes,US,eng,4.3,8,2,2448,,,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
14122,12320698,1222101,1849903670,,The Adventures of Sherlock Holmes,"A Sherlock tie-in edition of Conan Doyle's first collection of Sherlock Holmes stories, with an introduction by show cocreator Mark Gatiss\nIn this new edition of Conan Doyle's first collection of short stories, Mark Gatiss explains how these gripping tales inspired and influenced the new series. This collection contains 12 short stories first published in theStrandmagazine between 1891 and 1892, and then published as a collection in October 1892. It includes some of Conan Doyle's best tales of murder and mystery, such as ""The Adventures of the Speckled Band,"" in which the strange last words of a dying woman ""It was the band, the speckled band!"" and an inexplicable whistling in the night are the only clues Sherlock Holmes has to prevent another murder; and ""The Five Orange Pips,"" in which an untimely death and the discovery of the letter containing five orange pips lead to a cross-Atlantic conspiracy.",324,0,https://www.goodreads.com/book/show/12320698-the-adventures-of-sherlock-holmes,US,,4.3,367,37,2448,BBC Books,2012,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211367,13503565,1222101,,,The Adventures of Sherlock Holmes,"Complete in nine handsome volumes, each with an introduction by a Doyle scholar, a chronology, a selected bibliography, and explanatory notes, the Oxford Sherlock Holmes series offers a definitive collection of the famous detective's adventures. No home library is complete without it.\nComprising the series of short stories that made the fortunes of the Strand, the magazine in which they were first published, this volume won even more popularity for Sherlock Holmes and Dr. Watson. Holmes is at the height of his powers in many of his most famous cases, including The Red-Headed League, The Speckled Band, and The Blue Carbuncle.",11,0,https://www.goodreads.com/book/show/13503565-the-adventures-of-sherlock-holmes,US,eng,4.3,21,3,2448,Librivox Audio,2010,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
215762,962730,1222101,,,The Adventures of Sherlock Holmes,"This edition of The Adventures of Sherlock Holmescontains the earliest cases of the greatest fictional detective of all time. It comprises A Study in Scarlet, The Sign of Fourand the complete Adventures of Sherlock Holmes, reprinted from the Strand Magazine. It is illustrated by Sidney Paget, the finest of illustrators, and the man from whom our images of Sherlock Holmes and his world derive.\nThis is the first of three volumes of The Complete Sherlock Holmes. The three books will present all the Holmes stories arranged chronologically in order of first publication.",0,0,https://www.goodreads.com/book/show/962730.The_Adventures_of_Sherlock_Holmes,US,,4.3,597,16,2448,Wordsworth Editions,,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
216362,22620243,1222101,,B00JQG8NYY,The Adventures of Sherlock Holmes,"Sir Arthur Ignatius Conan Doyle, DL (22 May 1859 - 7 July 1930)\nwas a Scottish author most noted for his stories about the\ndetective Sherlock Holmes, which are generally considered a major\ninnovation in the field of crime fiction, and the adventures of\nProfessor Challenger. He was a prolific writer whose other works\ninclude science fiction stories, historical novels, plays and\nromances, poetry, and non-fiction. Conan was originally a given\nname, but Doyle used it as part of his surname in his later years.",0,1,https://www.goodreads.com/book/show/22620243-the-adventures-of-sherlock-holmes,US,eng,4.3,1488,7,2448,Arthur Conan Doyle,,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"
217873,13635969,1222101,,,The Adventures of Sherlock Holmes,"ISBN: 1853260339\nWith an Introduction by Dr. Julian Wolfreys\nThis edition of 'The Adventures of Sherlock Holmes' contains the earliest cases of the greatest fictional detective of all time. It comprises 'A Study in Scarlet', 'The Sign of Four' and the complete 'Adventures of Sherlock Holmes', reprinted from 'The Strand Magazine'. It is illustrated by Sidney Paget, the finest of illustrators, from whom our images of Sherlock Holmes and his world derive.\nThis is the first of three volumes of 'The Complete Sherlock Holmes' reproduced from original copies of 'The Strand Magazine'. The three books present all the Holmes stories arranged chronologically in order of first publication.\n(contains both ""The Adventures of Sherlock Holmes"" and ""The Memoirs of Sherlock Holmes"")",450,0,https://www.goodreads.com/book/show/13635969-the-adventures-of-sherlock-holmes,US,eng,4.3,29,5,2448,Wordsworth Classics,1992,"[184594, 359251, 1474865, 192887, 141270, 567550, 18626865, 122646, 278854, 93276]"


In [22]:
title = cleaned_books.groupby('title').max()['ratings_count']

In [58]:
title.iloc[-215:]

title
you don't exist                                 13 
¿Quién mató a Alex?: El misterio que nos une    8  
¿Quién mató a Palomino Molero?                  207
¿Tienes miedo a la oscuridad?                   36 
À couteaux tirés                                2  
                                               ..  
簪中录（3）                                          24 
簪中录（4）                                          18 
聖女の救済 (ガリレオ, #5)                                13 
모차르트의 악보 (39 클루스. 2)                            25 
스노우맨                                            46 
Name: ratings_count, Length: 215, dtype: int64

In [60]:
# after delete all books with language_code 'non english', there are still around 11k (6%) non english books
# with missing language_code. We will leave them since the amount is fairly small and it's hard to incorporate to pipeline to delete
# these books
title.iloc[-214:].sum(),  title.iloc[-214:].sum()/len(cleaned_books)

(11019, 0.060244718540873904)

### Load review data

In [7]:
reviews_df = read_reviews(os.path.join('data/', 'goodreads_reviews_mystery_thriller_crime.json.gz'))

counting file: data/goodreads_reviews_mystery_thriller_crime.json.gz
current line: 0,1000000,complete
done!


In [8]:
cleaned_reviews = clean_reviews(reviews_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### book ids that are english and have non-zero ratings

In [9]:
#books in english
book_ids = cleaned_books.index.values
book_ids.shape

(182904,)

In [10]:
# books that have non-zero ratings
book_in_reviews = cleaned_reviews.book_id.unique()
book_in_reviews.shape

(214260,)

In [11]:
#get the ids for books that are in english and have non zero ratings
total_book = np.intersect1d(book_ids, book_in_reviews)
len(total_book)

178443

In [12]:
# final clean book data
cleaned_books1 = cleaned_books.loc[total_book]
len(cleaned_books1)

178443

In [13]:
# final clean review data
cleaned_reviews1 = cleaned_reviews[cleaned_reviews.book_id.isin(total_book)]

In [14]:
len(cleaned_reviews1.book_id.unique())

178443

In [15]:
# remap the user id and book id for matrix factorization
cleaned_reviews2 = review_map_id(cleaned_reviews1).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['old_user_id'] = df['user_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = map_id(df['old_user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['old_book_id'] = df['book_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

In [16]:
cleaned_reviews2.head()

Unnamed: 0,review_id,user_id,book_id,rating,timestamp,n_votes,n_comments,old_user_id,old_book_id
620327,d23dc89ab32cd864e54d18369751163b,158142,63529,3,2001-01-01 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,196084
620326,c7613da4cbd48baa83efec99b4dd6a41,158142,163000,3,2001-01-11 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,79030
620325,469b9d160ad68ce84b6cac4585fb226a,158142,140825,4,2001-01-25 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,442783
620324,9f4fd2c20177f66fe5832ec299fb4bbf,158142,145138,2,2001-01-29 00:00:00,0,0,d889b42d9eb7b80e02f24830e27c6389,552719
620323,0c7a29c98c4f11b409235d9c22a2455b,158142,85564,4,2001-02-02 00:00:00,1,0,d889b42d9eb7b80e02f24830e27c6389,238117


### save the cleaned books and reviews

In [17]:
cleaned_books1.to_csv('data/cleaned_books_mystery.csv', header=cleaned_books1.columns)

In [19]:
cleaned_reviews2.to_csv('data/cleaned_reviews_mystery.csv', header=cleaned_reviews1.columns, index=False)