In [74]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import string

In [75]:
raw = pd.read_csv("data_sets/Books_Raw.csv", na_values='None')

In [76]:
start_time = time.time()
reviews = pd.read_csv("data_sets/Reviews_Raw.csv", na_values='None')
end_time = time.time()
print("Import time:", end_time - start_time, "seconds")

Import time: 55.03046989440918 seconds


In [77]:
# Normalise the missing-value marker to None in both frames.
# The marker is sampled from row label 212400 of raw["authors"], which is assumed
# to be a missing entry (presumably the NaN produced by read_csv -- TODO confirm).
# NOTE(review): the hard-coded row label is fragile; if the input file changes,
# this may pick up a real author string and blank it out everywhere.
# missing_val must be read before raw itself is rewritten below.
missing_val = raw["authors"][212400]
reviews = reviews.replace(missing_val, None)
raw = raw.replace(missing_val, None)

In [78]:
# start a new dataframe where we collect clean columns
clean_books = raw[["Title", "publisher", "publishedDate"]].copy()

## Basic cleaning of the book file

In [79]:
def process_string(in_str):
    """Turn the string form of an author list into a real list of names.

    The raw column stores authors as the text of a Python list literal,
    e.g. ``"['Jane Doe', 'Smith, John']"``.

    Parameters
    ----------
    in_str : str or None
        Raw cell value. ``None``, NaN and any other non-string value are
        treated as "no authors".

    Returns
    -------
    list of str
        The author names; an empty list when the value is missing.
    """
    import ast

    # Missing entries (None / NaN) carry no authors.
    if not isinstance(in_str, str):
        return []

    # Parse the literal properly so that names containing ", " or quote
    # characters are not split or truncated.
    try:
        parsed = ast.literal_eval(in_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(name) for name in parsed]

    # Fallback for text that is not a valid list literal: strip the outer
    # brackets, split on ", " and drop one quote character from each end.
    return [piece[1:-1] for piece in in_str[1:-1].split(", ")]

clean_books["authors"] = raw["authors"].apply(process_string)

In [80]:
type(raw["authors"].iloc[-2])

str

In [81]:
def my_count(str):
    """Count the single-quote characters in a raw ``categories`` string.

    Each quoted entry contributes two quote marks, so a count above 2 hints
    at more than one category in the field.

    Parameters
    ----------
    str : str or None
        Raw cell value. The name shadows the built-in ``str``; it is kept
        unchanged so existing callers keep working.

    Returns
    -------
    int or None
        Number of ``'`` characters, or ``None`` when the value is missing
        (``None`` or NaN).
    """
    # pd.isna covers both None and NaN, so a float NaN no longer raises.
    if pd.isna(str):
        return None
    return str.count("'")
        
raw["categories_count"] = raw["categories"].apply(my_count)

In [82]:
raw[raw["categories_count"] > 2.0]

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,categories_count


In [83]:
# No book has more than 1 category (no entry has more than two quote marks), so we can safely strip the quotes and brackets, much like we did for authors.

# Remove the outer square brackets together with the single pair of quote marks around the category name. (Unlike authors, where a field can hold several quoted names, a category field holds exactly one.)
def remove_outer_braces(str):
    """Strip the two leading and two trailing characters of a category string.

    For a value such as ``"['Fiction']"`` this removes ``['`` and ``']`` and
    leaves ``"Fiction"``.

    Parameters
    ----------
    str : str or None
        Raw cell value. The name shadows the built-in ``str``; it is kept
        unchanged so existing callers keep working.

    Returns
    -------
    str or None
        The text without its first two and last two characters, or ``None``
        when the value is missing (``None`` or NaN).
    """
    # pd.isna covers both None and NaN, so a float NaN no longer raises.
    if pd.isna(str):
        return None
    return str[2:-2]

clean_books["categories"] = raw["categories"].apply(remove_outer_braces)

In [84]:
clean_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],
...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction


In [85]:
# Notice one book does not have a title
print(clean_books[clean_books["Title"].isnull()])

# Let's remove it
clean_books.dropna(subset="Title", inplace=True)

     Title publisher publishedDate            authors categories
1066  None      None    2015-12-15  [Maharshi Ramana]       None


## Basic cleaning of the reviews file

In [86]:
# Drop all reviews that don't have a title, as they can't be matched to a book
reviews.dropna(subset="Title", inplace=True)
reviews.count()

Id                    2999792
Title                 2999792
Price                  481164
User_id               2438018
profileName           2437900
review/helpfulness    2999792
review/score          2999792
review/time           2999792
review/summary        2999385
review/text           2999784
dtype: int64

## Find title typos

In [87]:
# Translation table that deletes ASCII punctuation and the space character.
_TITLE_NOISE_TABLE = str.maketrans("", "", string.punctuation + " ")


def title_projection(title_string):
    """
    Project a title onto a normalised key so near-identical titles compare equal.

    Every character in ``string.punctuation`` and every space character is
    removed, and the remaining characters are converted to UPPER case.
    Different titles can therefore share one key.

    Parameters
    ----------
    title_string : str
        The title to normalise; must not be None.

    Returns
    -------
    str
        The upper-cased title without ASCII punctuation or spaces.

    Raises
    ------
    AssertionError
        If ``title_string`` is None.
    """
    assert title_string is not None, "title_string must not be None"
    # One C-level pass instead of a Python-level test per character.
    return title_string.translate(_TITLE_NOISE_TABLE).upper()

title_projection("h! j,./?;;$#@% ^& *()''")

'HJ'

In [88]:
clean_books["projected_title"] = clean_books["Title"].apply(title_projection)
reviews["projected_title"] = reviews["Title"].apply(title_projection)

In [89]:
# How many catalogue rows share each projected title.
projected_title_counts = clean_books["projected_title"].value_counts()
# Keep only the keys that occur more than once.
double_counts = projected_title_counts.loc[projected_title_counts.gt(1)]
print("number of books with multiple title:", double_counts.size)
print("number of titles for the same book:", double_counts.sum())

number of books with multiple title: 5131
number of titles for the same book: 10911


In [90]:
double_counts.head(10)

projected_title
LITTLEWOMENORMEGJOBETHANDAMY                                                                                      9
MOBYDICKORTHEWHALE                                                                                                8
PLATOTHEMANANDHISWORK                                                                                             6
PROGRESSANDPOVERTYANINQUIRYINTOTHECAUSEOFINDUSTRIALDEPRESSIONSANDOFINCREASEOFWANTWITHINCREASEOFWEALTHTHEREMEDY    6
REPRESENTATIVEMENSEVENLECTURES                                                                                    5
NOTESONTHEPARABLESOFOURLORD                                                                                       5
JESUSTHESONOFMANHISWORDSANDHISDEEDSASTOLDANDRECORDEDBYTHOSEWHOKNEWHIM                                             5
THECHRISTIANYEARTHOUGHTSINVERSEFORTHESUNDAYSANDHOLYDAYSTHROUGHOUTTHEYEAR                                          5
ASTORIAORANECDOTESOFANENTERPRISEBEYONDTHEROCKYMOUNTAINS 

In [91]:
as_df = double_counts.reset_index(name="number_of_instances")

# A projected title is "bad" when the rows sharing it disagree on the category.
# A single grouped nunique() replaces one full scan of clean_books per
# duplicated title; missing categories are ignored, as Series.nunique() does.
categories_per_title = (
    clean_books[clean_books["projected_title"].isin(double_counts.index)]
    .groupby("projected_title")["categories"]
    .nunique()
)
# Reindex so bad_books keeps the ordering of double_counts.
categories_per_title = categories_per_title.reindex(double_counts.index)
bad_books = categories_per_title[categories_per_title > 1].index.tolist()

In [92]:
len(bad_books)

408

In [93]:
clean_books[clean_books["projected_title"] == "UPFROMSLAVERYANAUTOBIOGRAPHY"]

Unnamed: 0,Title,publisher,publishedDate,authors,categories,projected_title
2401,Up from slavery: An autobiography,First Avenue Editions ™,2019-01-01,[Booker T. Washington],Biography & Autobiography,UPFROMSLAVERYANAUTOBIOGRAPHY
28164,Up from slavery;: An autobiography,"Doubleday, Page & Company",1907,[Booker T. Washington],African Americans,UPFROMSLAVERYANAUTOBIOGRAPHY
99678,Up From Slavery: An Autobiography,,1901,[Booker T. Washington],African American civil rights workers,UPFROMSLAVERYANAUTOBIOGRAPHY
205199,"Up from slavery,: An autobiography",,,[],,UPFROMSLAVERYANAUTOBIOGRAPHY
205261,"Up from slavery;: An autobiography,",,,[],,UPFROMSLAVERYANAUTOBIOGRAPHY


## Re-organise based on the new title key

In [96]:
reviews.head(1)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,projected_title
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,ITSONLYARTIFITSWELLHUNG


In [105]:
# Every User_id gets 1 vote per book: repeated reviews by the same user for the
# same projected title are collapsed into the mean of their scores.
# NOTE(review): groupby drops NA keys by default, so reviews without a User_id
# presumably do not reach new_reviews -- confirm this is intended.
new_reviews = (
    reviews
    .groupby(["projected_title", "User_id"])['review/score']
    .mean()
    .rename('score')
    .reset_index()
)

In [126]:
# Per-book statistics over the one-vote-per-user table, ordered by title key.
scores_by_book = new_reviews.groupby(["projected_title"])["score"]
average_score_per_book = scores_by_book.mean().sort_index()
number_of_reviews_per_book = scores_by_book.size().sort_index()
# std_per_book = new_reviews.groupby(["projected_title"])["score"].std()
# std_per_book = std_per_book.sort_index()

In [127]:
new_books = average_score_per_book.reset_index()
# Look the count up by title key instead of pasting a list by position, so the
# two statistics cannot be paired with the wrong book if either series is ever
# reordered or filtered.
new_books["rating_count"] = new_books["projected_title"].map(number_of_reviews_per_book)
# new_books["std"] = std_per_book.tolist()
new_books = new_books.sort_values("rating_count", ascending=False)
new_books.head(5)

Unnamed: 0,projected_title,score,rating_count
66465,HARRYPOTTERANDTHESORCERERSSTONE,4.687642,3663
160254,THEHOBBIT,4.677061,3577
160256,THEHOBBITTHEREANDBACKAGAIN,4.676707,3576
160255,THEHOBBITORTHEREANDBACKAGAIN,4.678584,3565
160257,THEHOBBITTORTHEREANDBACKAGAINILLUSTRATEDBYTHEA...,4.678525,3562


## [without using the projected title] Add review data to the raw data

In [128]:
number_of_reviews_per_title = reviews["Title"].value_counts()
number_of_reviews_per_title

Title
The Hobbit                                                                                                         22023
Pride and Prejudice                                                                                                20371
Atlas Shrugged                                                                                                     12513
Wuthering Heights                                                                                                  10780
The Giver                                                                                                           7644
                                                                                                                   ...  
Illuminatus! Part I: The Eye in the Pyramid (The eye in the Pyramid, The Golden Apple,Leviathan, Parts 1, 2, 3)        1
Modern Chess Openings                                                                                                  1
Child Abuse and Neglect: C

In [129]:
average_score_per_title = reviews.groupby("Title")["review/score"].mean().sort_index()
#std_score_per_title = reviews.groupby("Title")["review/score"].std().sort_index()
average_score_per_title

Title
" Film technique, " and, " Film acting "                                                                                                   4.500000
" We'll Always Have Paris": The Definitive Guide to Great Lines from the Movies                                                            5.000000
"... And Poetry is Born ..." Russian Classical Poetry                                                                                      4.000000
"A Titanic hero" Thomas Andrews, shipbuilder                                                                                               4.875000
"A Truthful Impression of the Country": British and American Travel Writing in China, 1880-1949                                            4.000000
                                                                                                                                             ...   
with an everlasting love                                                                                  

In [130]:
# Notice some of these titles are WILD
# Vectorised comparison: avoids a Python-level loop over every review row.
print(reviews["Title"].eq('" Film technique, " and, " Film acting "').any())
print(reviews["Title"].eq('www.whitbread.org/book').any())

True
True


In [131]:
# The titles with a missing (NaN) std only have 1 review. We will probably want to change this to 0, but there is no need to do that now.
# It was unclear to LS whether this std is a biased or an unbiased estimator. pandas' std uses ddof=1 by default, i.e. the 1/(m-1) normalisation of the unbiased variance estimator.
print(number_of_reviews_per_title['"... And Poetry is Born ..." Russian Classical Poetry'])

1


In [139]:
clean_alphabetically = clean_books.sort_values(by="Title")

# Attach the review statistics by looking each Title up in the per-title series.
# Assigning .tolist() pairs rows purely by position, which is only correct while
# both sides hold exactly the same titles in exactly the same order; a key-based
# lookup cannot give a book another book's ratings.
clean_alphabetically["ratings_average"] = clean_alphabetically["Title"].map(average_score_per_title)
# clean_alphabetically["ratings_std"]  = std_score_per_title.tolist()
# Books without any review get a count of 0 (and a NaN average).
clean_alphabetically["ratings_count"] = (
    clean_alphabetically["Title"]
    .map(number_of_reviews_per_title)
    .fillna(0)
    .astype("int64")
)

clean_alphabetically

Unnamed: 0,Title,publisher,publishedDate,authors,categories,projected_title,ratings_average,ratings_count
118557,""" Film technique, "" and, "" Film acting """,Sims Press,2008-11,[V. I. Pudovkin],Drama,FILMTECHNIQUEANDFILMACTING,4.500000,2
28608,""" We'll Always Have Paris"": The Definitive Gui...",Perennial,1994,"[Robert A. Nowlan, Gwendolyn Wright Nowlan]",Reference,WELLALWAYSHAVEPARISTHEDEFINITIVEGUIDETOGREATLI...,5.000000,2
113956,"""... And Poetry is Born ..."" Russian Classical...",,1984,[Aleksandr Sergeevich Pushkin],Russian poetry,ANDPOETRYISBORNRUSSIANCLASSICALPOETRY,4.000000,1
209946,"""A Titanic hero"" Thomas Andrews, shipbuilder",,1913,[Shan F. Bullock],,ATITANICHEROTHOMASANDREWSSHIPBUILDER,4.875000,8
196186,"""A Truthful Impression of the Country"": Britis...",University of Michigan Press,2001,"[Nicholas J. Clifford, Nicholas Rowland Cliffo...",History,ATRUTHFULIMPRESSIONOFTHECOUNTRYBRITISHANDAMERI...,4.000000,1
...,...,...,...,...,...,...,...,...
91449,with an everlasting love,Harvest House Publishers,1999-07-01,[Kay Arthur],Religion,WITHANEVERLASTINGLOVE,4.761905,21
64773,work and Motivation,SAGE,2012,[Gary P. Latham],Business & Economics,WORKANDMOTIVATION,5.000000,1
84075,www.whitbread.org/book,,,[],,WWWWHITBREADORGBOOK,2.666667,3
153555,xBase Programming for the True Beginner: An In...,McGraw-Hill/Irwin,1995-11-01,"[Eugene Kaluzniacky, Vijay Kanabar]",Computers,XBASEPROGRAMMINGFORTHETRUEBEGINNERANINTRODUCTI...,5.000000,1


In [140]:
# Books with more than 100 ratings, and the number of ratings they account for.
has_many_ratings = clean_alphabetically["ratings_count"].gt(100)
threshold = clean_alphabetically.loc[has_many_ratings]
print(len(threshold))
threshold["ratings_count"].sum()

3983


1499646

In [135]:
# Group reviews by title and get a list of indices for each title
# indices_per_title = reviews.groupby('Title').apply(lambda x: x.index.tolist())

# Convert the result to a Pandas Series
# indices_series = pd.Series(indices_per_title, name='review_ids').sort_index()

In [136]:
# Verifying it works
# titanic_indices = indices_series.iloc[3]
# print(titanic_indices, type(titanic_indices[0]))
# reviews.loc[titanic_indices].head(3)

In [141]:
# clean_alphabetically["review_ids"] = indices_series.tolist()

In [142]:
# sort it by index again
final_books = clean_alphabetically.sort_index()
final_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories,projected_title,ratings_average,ratings_count
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels,ITSONLYARTIFITSWELLHUNG,4.000000,1
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography,DRSEUSSAMERICANICON,4.555556,9
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion,WONDERFULWORSHIPINSMALLERCHURCHES,5.000000,4
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction,WHISPERSOFTHEWICKEDSAINTS,3.718750,32
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],,NATIONDANCERELIGIONIDENTITYANDCULTURALDIFFEREN...,5.000000,1
...,...,...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction,THEORPHANOFELLISISLANDTIMETRAVELADVENTURES,4.678571,28
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction,REDBOOTSFORCHRISTMAS,5.000000,2
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],,MAMAW,4.666667,3
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction,THEAUTOGRAPHMAN,2.500000,4


In [144]:
# This is to show the length of the review_id's list is actually the same as the rating_count
# lens = final_books["review_ids"].apply(len)
# bools = lens == final_books["ratings_count"]
# bools.prod()

## Reviews per user

In [145]:
review_only_users = reviews[reviews["User_id"].notnull()].drop_duplicates()
print("Number of reviews with a User_id:", len(review_only_users))

Number of reviews with a User_id: 2432472


In [146]:
# Number of (de-duplicated) reviews written by each user.
temp = review_only_users['User_id'].value_counts()
# The filter is strict (> 10), so the label says "more than", not "at least".
print("Number of users with more than 10 reviews:", len(temp[temp>10]))

Number of users with at least 10 reviews: 25926


In [147]:
temp = review_only_users.groupby(['User_id', 'Title']).size()
multiple_reviews = temp[temp > 1] # These are users that have given multiple reviews to the same book
print("Number of reviews that are of this type:", multiple_reviews.sum())
multiple_reviews

Number of reviews that are of this type: 497801


User_id                Title                                            
A0015610VMNR0JC9XVL1   The richest man in Babylon                            3
A00540411RKGTDNU543WS  The Hobbit                                            5
A00787411M1CAS4K6H99N  Anne Frank's Tales from the Secret Annex              2
A008059932M4DUB2IWDB8  Seven pillars of wisdom,: A triumph                   2
                       Seven pillars of wisdom: A triumph                    2
                                                                            ..
AZZUIE66HZNY1          Pride and Prejudice                                  10
AZZVZL4QEHEHO          Lonesome Dove                                         2
AZZWH0XJ9B39J          Science of survival: Prediction of human behavior     2
AZZY9C5IAQ8KG          A Fine Balance                                        3
AZZZYCR4NZADZ          Out                                                   2
Length: 181140, dtype: int64

In [148]:
# Drop the "Id" column and then de-duplicate, so rows that differ only in their
# Id collapse into one. drop_duplicates() returns a new frame, so its result
# must be assigned; calling it on its own line leaves the frame unchanged.
copy_without_ids = review_only_users.drop(columns="Id").drop_duplicates()
temp = copy_without_ids.groupby(['User_id', 'Title']).size()
multiple_reviews = temp[temp > 1]
print("Number of reviews that are of this type:", multiple_reviews.sum())

Number of reviews that are of this type: 497801


In [149]:
multiple_reviews

User_id                Title                                            
A0015610VMNR0JC9XVL1   The richest man in Babylon                            3
A00540411RKGTDNU543WS  The Hobbit                                            5
A00787411M1CAS4K6H99N  Anne Frank's Tales from the Secret Annex              2
A008059932M4DUB2IWDB8  Seven pillars of wisdom,: A triumph                   2
                       Seven pillars of wisdom: A triumph                    2
                                                                            ..
AZZUIE66HZNY1          Pride and Prejudice                                  10
AZZVZL4QEHEHO          Lonesome Dove                                         2
AZZWH0XJ9B39J          Science of survival: Prediction of human behavior     2
AZZY9C5IAQ8KG          A Fine Balance                                        3
AZZZYCR4NZADZ          Out                                                   2
Length: 181140, dtype: int64

In [150]:
tab2 = reviews.groupby('Title').nunique()

## Export files

In [153]:
start_time = time.time()
reviews.to_csv('data_sets/reviews_clean.csv', index=False)
end_time = time.time()
print("Export time:", end_time - start_time, "seconds")

Export time: 92.38104176521301 seconds


In [154]:
clean_books.to_csv('data_sets/books_clean.csv', index=False)
clean_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories,projected_title
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels,ITSONLYARTIFITSWELLHUNG
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography,DRSEUSSAMERICANICON
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion,WONDERFULWORSHIPINSMALLERCHURCHES
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction,WHISPERSOFTHEWICKEDSAINTS
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],,NATIONDANCERELIGIONIDENTITYANDCULTURALDIFFEREN...
...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction,THEORPHANOFELLISISLANDTIMETRAVELADVENTURES
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction,REDBOOTSFORCHRISTMAS
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],,MAMAW
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction,THEAUTOGRAPHMAN


In [155]:
new_reviews.to_csv('data_sets/reviews_for_recommenders.csv', index=False)
new_reviews

Unnamed: 0,projected_title,User_id,score
0,0001,A20J0X937MBVEX,5.0
1,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,A3H9FJL67HJA3D,5.0
2,01442DEVELOPINGSKILLSINALGEBRAONEBOOKB,AVDU7UUIB1DM9,5.0
3,01443DEVELOPINGSKILLSINALGEBRAONEBOOKC,A125AU4F6Z3569,5.0
4,01443DEVELOPINGSKILLSINALGEBRAONEBOOKC,A2W2RQYG5F8TDP,5.0
...,...,...,...
1992540,ZYMURGYFORTHEHOMEBREWERANDBEERLOVERTHEBESTARTI...,A2EOFB1LIYOKMG,4.0
1992541,ZYMURGYFORTHEHOMEBREWERANDBEERLOVERTHEBESTARTI...,A3KDRLIBWILPMJ,4.0
1992542,ZYMURGYFORTHEHOMEBREWERANDBEERLOVERTHEBESTARTI...,AENRHAKDM5GMP,5.0
1992543,ZYMURGYFORTHEHOMEBREWERANDBEERLOVERTHEBESTARTI...,AQC3T4NFVZB0S,5.0


In [156]:
new_books.to_csv('data_sets/books_for_recommenders.csv', index=False)
new_books

Unnamed: 0,projected_title,score,rating_count
66465,HARRYPOTTERANDTHESORCERERSSTONE,4.687642,3663
160254,THEHOBBIT,4.677061,3577
160256,THEHOBBITTHEREANDBACKAGAIN,4.676707,3576
160255,THEHOBBITORTHEREANDBACKAGAIN,4.678584,3565
160257,THEHOBBITTORTHEREANDBACKAGAINILLUSTRATEDBYTHEA...,4.678525,3562
...,...,...,...
25298,BRUCETEGNERSCOMPLETEBOOKOFKARATE,4.000000,1
97376,MICHELINNEWENGLANDREGIONALROADATLAS,4.000000,1
97377,MICHELINPORTUGALFOLDEDMAPMOTORISTTOURINGMAPMIC...,4.000000,1
97378,MICHELINQUEBECREGIONALATLASTRAVELGUIDEMICHELIN...,2.000000,1
