In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import string

In [2]:
raw = pd.read_csv("data_sets/Books_Raw.csv", na_values='None')

In [3]:
start_time = time.time()
reviews = pd.read_csv("data_sets/Reviews_Raw.csv", na_values='None')
end_time = time.time()
print("Import time:", end_time - start_time, "seconds")

Import time: 22.786155223846436 seconds


In [4]:
# Change the values of missing
missing_val = raw["authors"][212400]
reviews = reviews.replace(missing_val, None)
raw = raw.replace(missing_val, None)

In [5]:
clean_books = raw[["Title", "publisher", "publishedDate"]].copy()

## Basic cleaning of the book file

In [6]:
def process_string(in_str):
    if in_str is None:
        return []
    else:
        temp = in_str[1:-1].split(", ")
        temp = [mystr[1:-1] for mystr in temp]
        return temp

clean_books["authors"] = raw["authors"].apply(process_string)

In [7]:
type(raw["authors"].iloc[-2])

str

In [8]:
def my_count(str):
    if str == None:
        return None
    else:
        return str.count("'")
        
raw["categories_count"] = raw["categories"].apply(my_count)

In [9]:
raw[raw["categories_count"] > 2.0]

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,categories_count


In [10]:
# No book has more than 1 category, so let's safely remove the quotes and brackets like we did for authors

# Remove the outer square brackets. Remove all quote marks. (Note some fields have multiple sets of quotes, if there are multiple authors)
def remove_outer_braces(str):
    if str is None:
        return None
    else:
        return str[2:-2]

clean_books["categories"] = raw["categories"].apply(remove_outer_braces)

In [11]:
clean_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],
...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction


In [12]:
# Notice one book does not have a title
print(clean_books[clean_books["Title"].isnull()])

# Let's remove it
clean_books.dropna(subset="Title", inplace=True)

     Title publisher publishedDate            authors categories
1066  None      None    2015-12-15  [Maharshi Ramana]       None


## Title Typo's

## Clean the reviews file

In [6]:
# Drop all reviews that don't have a title, as they can't be match to a book
reviews.dropna(subset="Title", inplace=True)
reviews.count()

Id                    2999792
Title                 2999792
Price                  481164
User_id               2438018
profileName           2437900
review/helpfulness    2999792
review/score          2999792
review/time           2999792
review/summary        2999385
review/text           2999784
dtype: int64

In [15]:
def title_projection(title_string):
    """
    Create a surjective fct to compare titles.
    In particular, let's remove punctuation marks and convert everything to lowercase letters
    """
    assert title_string != None, "Surprise: "
    remove_punctuations = ''.join([char for char in title_string if (char not in string.punctuation and char != ' ')])
    lower_cases = remove_punctuations.upper()
    return lower_cases

title_projection("h! j,./?;;$#@% ^& *()''")

'HJ'

In [16]:
clean_books["projected_title"] = clean_books["Title"].apply(title_projection)

In [30]:
projected_title_counts = clean_books["projected_title"].value_counts()
double_counts = projected_title_counts[projected_title_counts > 1]
print("number of books with multiple title:", len(double_counts))
print("number of titles for the same book:", sum(double_counts))

number of books with multiple title: 5131
number of titles for the same book: 10911


In [31]:
clean_books[clean_books["projected_title"] == "MOBYDICKORTHEWHALE"]

Unnamed: 0,Title,publisher,publishedDate,authors,categories,projected_title
73201,"Moby Dick, or, The Whale",Рипол Классик,2016-01-04,[H. Melville],Fiction,MOBYDICKORTHEWHALE
79776,"Moby Dick;: Or, The whale,",BookRix,2022-02-01,[Herman Melville],Fiction,MOBYDICKORTHEWHALE
79922,MOBY DICK or THE WHALE,Рипол Классик,2016-01-04,[H. Melville],Fiction,MOBYDICKORTHEWHALE
108278,"Moby Dick; Or, the Whale",Рипол Классик,2016-01-04,[H. Melville],Fiction,MOBYDICKORTHEWHALE
110202,Moby Dick Or the Whale,Penguin,2002-12-31,[Herman Melville],Fiction,MOBYDICKORTHEWHALE
116082,Moby-Dick or The Whale,Penguin,2002-12-31,[Herman Melville],Fiction,MOBYDICKORTHEWHALE
133629,Moby-Dick or the Whale,Рипол Классик,2016-01-04,[H. Melville],Fiction,MOBYDICKORTHEWHALE
173449,Moby-Dick Or the Whale,,,[],,MOBYDICKORTHEWHALE


## Add review data to the raw data

In [14]:
number_of_reviews_per_title = reviews["Title"].value_counts()
number_of_reviews_per_title

Title
The Hobbit                                                                                                         22023
Pride and Prejudice                                                                                                20371
Atlas Shrugged                                                                                                     12513
Wuthering Heights                                                                                                  10780
The Giver                                                                                                           7644
                                                                                                                   ...  
Illuminatus! Part I: The Eye in the Pyramid (The eye in the Pyramid, The Golden Apple,Leviathan, Parts 1, 2, 3)        1
Modern Chess Openings                                                                                                  1
Child Abuse and Neglect: C

In [15]:
average_score_per_title = reviews.groupby("Title")["review/score"].mean().sort_index()
std_score_per_title = reviews.groupby("Title")["review/score"].std().sort_index()
std_score_per_title

Title
" Film technique, " and, " Film acting "                                                                                                   0.707107
" We'll Always Have Paris": The Definitive Guide to Great Lines from the Movies                                                            0.000000
"... And Poetry is Born ..." Russian Classical Poetry                                                                                           NaN
"A Titanic hero" Thomas Andrews, shipbuilder                                                                                               0.353553
"A Truthful Impression of the Country": British and American Travel Writing in China, 1880-1949                                                 NaN
                                                                                                                                             ...   
with an everlasting love                                                                                  

In [16]:
# Notice some of these titles are WILD
print(any(i == '" Film technique, " and, " Film acting "' for i in reviews["Title"]))
print(any(i == 'www.whitbread.org/book' for i in reviews["Title"]))

True
True


In [17]:
# The titles with a std of None only have 1 review. Probably we will want to change this to 0, but there is no need to do that know.
# It is unclear to LS whether this std is a biased or an un-biased estimater. (recall an 1/(m-1) for un-biased variance estimators)
print(number_of_reviews_per_title['"... And Poetry is Born ..." Russian Classical Poetry'])

1


In [18]:
clean_alphabetically = clean_books.sort_values(by="Title")

clean_alphabetically["ratings_average"] = average_score_per_title.tolist()
clean_alphabetically["ratings_std"]  = std_score_per_title.tolist()
clean_alphabetically["ratings_count"] = number_of_reviews_per_title.sort_index().tolist()

clean_alphabetically

Unnamed: 0,Title,publisher,publishedDate,authors,categories,ratings_average,ratings_std,ratings_count
118557,""" Film technique, "" and, "" Film acting """,Sims Press,2008-11,[V. I. Pudovkin],Drama,4.500000,0.707107,2
28608,""" We'll Always Have Paris"": The Definitive Gui...",Perennial,1994,"[Robert A. Nowlan, Gwendolyn Wright Nowlan]",Reference,5.000000,0.000000,2
113956,"""... And Poetry is Born ..."" Russian Classical...",,1984,[Aleksandr Sergeevich Pushkin],Russian poetry,4.000000,,1
209946,"""A Titanic hero"" Thomas Andrews, shipbuilder",,1913,[Shan F. Bullock],,4.875000,0.353553,8
196186,"""A Truthful Impression of the Country"": Britis...",University of Michigan Press,2001,"[Nicholas J. Clifford, Nicholas Rowland Cliffo...",History,4.000000,,1
...,...,...,...,...,...,...,...,...
91449,with an everlasting love,Harvest House Publishers,1999-07-01,[Kay Arthur],Religion,4.761905,0.889087,21
64773,work and Motivation,SAGE,2012,[Gary P. Latham],Business & Economics,5.000000,,1
84075,www.whitbread.org/book,,,[],,2.666667,2.081666,3
153555,xBase Programming for the True Beginner: An In...,McGraw-Hill/Irwin,1995-11-01,"[Eugene Kaluzniacky, Vijay Kanabar]",Computers,5.000000,,1


In [19]:
threshold = clean_alphabetically[clean_alphabetically["ratings_count"] > 100]
print(len(clean_alphabetically[clean_alphabetically["ratings_count"] > 100]))
threshold["ratings_count"].sum()

3983


1499646

In [20]:
# Group reviews by title and get a list of indices for each title
indices_per_title = reviews.groupby('Title').apply(lambda x: x.index.tolist())

# Convert the result to a Pandas Series
indices_series = pd.Series(indices_per_title, name='review_ids').sort_index()

  indices_per_title = reviews.groupby('Title').apply(lambda x: x.index.tolist())


In [21]:
indices_series

Title
" Film technique, " and, " Film acting "                                                                                                                                  [1664106, 1664107]
" We'll Always Have Paris": The Definitive Guide to Great Lines from the Movies                                                                                             [402728, 402729]
"... And Poetry is Born ..." Russian Classical Poetry                                                                                                                              [1604731]
"A Titanic hero" Thomas Andrews, shipbuilder                                                                                               [2783512, 2783513, 2783514, 2783515, 2783516, ...
"A Truthful Impression of the Country": British and American Travel Writing in China, 1880-1949                                                                                    [2588738]
                                                 

In [22]:
# Verifying it works
titanic_indices = indices_series.iloc[3]
print(titanic_indices, type(titanic_indices[0]))
reviews.loc[titanic_indices]

[2783512, 2783513, 2783514, 2783515, 2783516, 2783517, 2783518, 2783519] <class 'int'>


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
2783512,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,A46JFD3YDSEB6,only me,6/6,5.0,914716800,A man who shone like a star,Anyone who has ever looked at Thomas Andrews' ...
2783513,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,A1KW23XZW6LTAB,"Eileen Grimes ""Titanic Astrology author""",4/4,5.0,969840000,God bless this man,This book was such a lovely find; I had been f...
2783514,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,A11QO67YPZ05CX,"""darl85""",4/4,5.0,916444800,A man loved and respected by so many,Shan F.Bullock truely captures the essence of ...
2783515,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,A305WWFYE89S6O,thomas e. lewis,2/2,4.0,947894400,A must read for every Titanic Buff!,I took a tour of a local titanic display not t...
2783516,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,,,2/2,5.0,908236800,A wonderful book for such a wonderful man.,When I first saw James Cameron's movie Titanic...
2783517,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,ADMPGBI8ACXDN,first.officer@cfu-cybernet.net,1/1,5.0,903830400,"A dry, yet riveting tale of a man destined for...",The stories of Mr. Andrews' life were captivat...
2783518,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,A3ADZU0KM5KNW6,Joel Grissom,0/0,5.0,946944000,the best true book ever!,Andrews is a man who died with his ship and sh...
2783519,B00087XUJ8,"""A Titanic hero"" Thomas Andrews, shipbuilder",,AMQU5P20MYAC2,ntlelmbrt@yahoo.com,0/0,5.0,900892800,An absolutely charming chronicle of Thomas And...,Keeping in mind the era that this book was wri...


In [23]:
clean_alphabetically["review_ids"] = indices_series.tolist()

In [24]:
# sort it by index again
clean_books = clean_alphabetically.sort_index()
clean_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories,ratings_average,ratings_std,ratings_count,review_ids
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels,4.000000,,1,[0]
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography,4.555556,0.527046,9,"[1, 2, 3, 4, 5, 6, 7, 8, 9]"
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion,5.000000,0.000000,4,"[10, 11, 12, 13]"
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction,3.718750,1.764056,32,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2..."
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],,5.000000,,1,[46]
...,...,...,...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction,4.678571,0.547964,28,"[2821541, 2821542, 2821543, 2821544, 2821545, ..."
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction,5.000000,0.000000,2,"[2821569, 2821570]"
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],,4.666667,0.577350,3,"[2821571, 2821572, 2821573]"
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction,2.500000,1.290994,4,"[2821574, 2821575, 2821576, 2821577]"


In [25]:
# This is to show the length of the review_id's list is actually the same as the rating_count
lens = clean_books["review_ids"].apply(len)
bools = lens == clean_books["ratings_count"]
bools.prod()

1

## Reviews per user

In [90]:
review_only_users = reviews[reviews["User_id"].notnull()].drop_duplicates()
print("Number of reviews with a User_id:", len(review_only_users))

Number of reviews with a User_id: 2432472


In [91]:
temp = review_only_users['User_id'].value_counts()
print("Number of users with at least 10 reviews:", len(temp[temp>10]))

Number of users with at least 10 reviews: 25926


In [134]:
temp = review_only_users.groupby(['User_id', 'Title']).size()
multiple_reviews = temp[temp > 1] # These are users that have given multiple reviews to the same book
print("Number of reviews that are of this type:", multiple_reviews.sum())
multiple_reviews

Number of reviews that are of this type: 497801


User_id                Title                                            
A0015610VMNR0JC9XVL1   The richest man in Babylon                            3
A00540411RKGTDNU543WS  The Hobbit                                            5
A00787411M1CAS4K6H99N  Anne Frank's Tales from the Secret Annex              2
A008059932M4DUB2IWDB8  Seven pillars of wisdom,: A triumph                   2
                       Seven pillars of wisdom: A triumph                    2
                                                                            ..
AZZUIE66HZNY1          Pride and Prejudice                                  10
AZZVZL4QEHEHO          Lonesome Dove                                         2
AZZWH0XJ9B39J          Science of survival: Prediction of human behavior     2
AZZY9C5IAQ8KG          A Fine Balance                                        3
AZZZYCR4NZADZ          Out                                                   2
Length: 181140, dtype: int64

In [138]:
copy_without_ids = review_only_users.copy()
del copy_without_ids["Id"]
copy_without_ids.drop_duplicates()
temp = copy_without_ids.groupby(['User_id', 'Title']).size()
multiple_reviews = temp[temp > 1]
print("Number of reviews that are of this type:", multiple_reviews.sum())

Number of reviews that are of this type: 497801


In [140]:
multiple_reviews

User_id                Title                                            
A0015610VMNR0JC9XVL1   The richest man in Babylon                            3
A00540411RKGTDNU543WS  The Hobbit                                            5
A00787411M1CAS4K6H99N  Anne Frank's Tales from the Secret Annex              2
A008059932M4DUB2IWDB8  Seven pillars of wisdom,: A triumph                   2
                       Seven pillars of wisdom: A triumph                    2
                                                                            ..
AZZUIE66HZNY1          Pride and Prejudice                                  10
AZZVZL4QEHEHO          Lonesome Dove                                         2
AZZWH0XJ9B39J          Science of survival: Prediction of human behavior     2
AZZY9C5IAQ8KG          A Fine Balance                                        3
AZZZYCR4NZADZ          Out                                                   2
Length: 181140, dtype: int64

In [34]:
tab2 = reviews.groupby('Title').nunique()

## Export files

In [41]:
start_time = time.time()
reviews.to_csv('data_sets/reviews_clean.csv', index=False)
end_time = time.time()
print("Import time:", end_time - start_time, "seconds")

Import time: 67.74606561660767 seconds


In [42]:
reviews_small = reviews.copy()

del reviews_small["Price"]
del reviews_small["review/text"]
del reviews_small["review/summary"]
del reviews_small["review/time"]
del reviews_small["review/helpfulness"]
del reviews_small["Id"]
del reviews_small["profileName"]

reviews_small.to_csv('data_sets/reviews_for_recommenders.csv', index=False)

In [43]:
reviews_small

Unnamed: 0,Title,User_id,review/score
0,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,4.0
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,5.0
2,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,5.0
3,Dr. Seuss: American Icon,A2MVUWT453QH61,4.0
4,Dr. Seuss: American Icon,A22X4XUPKF66MR,4.0
...,...,...,...
2999995,The Idea of History,,4.0
2999996,The Idea of History,A1SMUB9ASL5L9Y,4.0
2999997,The Idea of History,A2AQMEKZKK5EE4,4.0
2999998,The Idea of History,A18SQGYBKS852K,5.0


In [44]:
clean_books.to_csv('data_sets/books_clean.csv', index=False)
clean_books

Unnamed: 0,Title,publisher,publishedDate,authors,categories,ratings_average,ratings_std,ratings_count,review_ids
0,Its Only Art If Its Well Hung!,,1996,[Julie Strain],Comics & Graphic Novels,4.000000,,1,[0]
1,Dr. Seuss: American Icon,A&C Black,2005-01-01,[Philip Nel],Biography & Autobiography,4.555556,0.527046,9,"[1, 2, 3, 4, 5, 6, 7, 8, 9]"
2,Wonderful Worship in Smaller Churches,,2000,[David R. Ray],Religion,5.000000,0.000000,4,"[10, 11, 12, 13]"
3,Whispers of the Wicked Saints,iUniverse,2005-02,[Veronica Haddon],Fiction,3.718750,1.764056,32,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2..."
4,"Nation Dance: Religion, Identity and Cultural ...",,2003-03-01,[Edward Long],,5.000000,,1,[46]
...,...,...,...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,Scholastic Paperbacks,2000-06-01,[Elvira Woodruff],Juvenile Fiction,4.678571,0.547964,28,"[2821541, 2821542, 2821543, 2821544, 2821545, ..."
212400,Red Boots for Christmas,,1995,[],Juvenile Fiction,5.000000,0.000000,2,"[2821569, 2821570]"
212401,Mamaw,,2018-01-17,[Wild Wild Cabbage],,4.666667,0.577350,3,"[2821571, 2821572, 2821573]"
212402,The Autograph Man,Vintage,2003-08-12,[Zadie Smith],Fiction,2.500000,1.290994,4,"[2821574, 2821575, 2821576, 2821577]"
