In [1]:
import pandas as pd
import numpy as np
import random
import requests
import csv
import time
import string
import regex as re
from joblib import Memory
from tqdm import tqdm
from nltk.corpus import stopwords
from bs4 import BeautifulSoup as bs
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import TreebankWordTokenizer

In [4]:
# show (up to 999) columns so wide frames are displayed in full
pd.set_option('display.max_columns', 999)


def read(name):
    """Load ``./datasets/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed csv file.
    """
    csv_path = './datasets/' + name + '.csv'
    return pd.read_csv(csv_path)

In [5]:
# load the raw ratings and the book metadata from ./datasets
rating = read('ratings')
book_all = read('books')

# number of rating rows per book and per user, each as a one-column ('count') frame
review_count = rating.groupby('book_id').size().to_frame(name='count')
reviews_per_user = rating.groupby('user_id').size().to_frame(name='count')


In [6]:
# keep only the ratings of books that have at least 500 ratings
threshold = 500
popular_list = list(set(review_count.index[review_count['count'] >= threshold]))
popular_book = rating[rating.book_id.isin(popular_list)]

# then keep only users with at least 117 ratings (counted over all books, not just the popular ones)
threshold = 117
active_list = list(set(reviews_per_user.index[reviews_per_user['count'] >= threshold]))
book_df = popular_book[popular_book.user_id.isin(active_list)]

In [7]:
# (rows, columns) of the ratings left after the book and user filters
book_df.shape

(2249118, 3)

In [8]:
# column dtypes and memory usage of the filtered ratings
book_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2249118 entries, 0 to 5976478
Data columns (total 3 columns):
user_id    int64
book_id    int64
rating     int64
dtypes: int64(3)
memory usage: 68.6 MB


In [9]:
# one-column frame listing each distinct book_id that survived the filtering,
# in order of first appearance
selected_books = (
    book_df[['book_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)
selected_books.head()

Unnamed: 0,book_id
0,258
1,70
2,264
3,388
4,18


In [10]:
# save the selected book ids (selected_books, not book_df) for future use
# NOTE(review): the default index is written as an extra unnamed column;
# pass index=False if downstream readers do not expect it
selected_books.to_csv('./datasets/selected_book.csv')

In [11]:
# attach the metadata from books.csv to the books selected above;
# the left join keeps every selected book_id
gdread = pd.merge(selected_books, book_all, how='left', on='book_id')

In [12]:
# (rows, columns) of the merged frame; the row count should equal
# len(selected_books) as long as book_id is unique in book_all
gdread.shape

(2526, 23)

In [13]:
# preview the first three merged rows
gdread.head(3)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,258,1232,1232,3209783,279,143034901,9780143000000.0,"Carlos Ruiz Zafón, Lucia Graves",2001.0,La sombra del viento,The Shadow of the Wind (The Cemetery of Forgot...,eng,4.24,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
1,70,375802,375802,2422333,224,812550706,9780813000000.0,Orson Scott Card,1985.0,Ender's Game,"Ender's Game (Ender's Saga, #1)",eng,4.3,813439,873417,38054,15330,27612,103439,264207,462829,https://images.gr-assets.com/books/1408303130m...,https://images.gr-assets.com/books/1408303130s...
2,264,3876,3876,589497,343,743297334,9780743000000.0,Ernest Hemingway,1926.0,The Sun Also Rises,The Sun Also Rises,en-US,3.83,284988,303117,9826,9602,23682,70004,105785,94044,https://images.gr-assets.com/books/1331828228m...,https://images.gr-assets.com/books/1331828228s...


In [14]:
# double-check the merge: compare book_id position by position with selected_books;
# a result of array([ True]) means every row matches, i.e. the order was preserved
(gdread.book_id == selected_books.book_id).unique()

array([ True])

In [15]:
# extract the goodreads_book_id column; the scraper appends these ids
# to the Goodreads book URL
gdread_id = gdread.goodreads_book_id

In [16]:
# split the ids into 5 chunks so the scrape can be run and monitored part by part.
# Split straight from the gdread column rather than from gdread_id itself, so that
# re-running this cell does not re-split an already split list.
gdread_id = np.array_split(gdread.goodreads_book_id, 5)

## Web scraping
#### Scrape the book information from the Goodreads website using the gdread_id

In [1]:
# declare the global progress counter; scrape_and_run increments it after every
# request, and it is reset to 0 before each part is scraped.
# The scraper looks books up on goodreads.com by their goodreads id, and the
# collected summaries and genres are then saved in csv files.
index = 0

In [None]:
def scrape_and_run(gr_id, timeout=30):
    """Fetch one Goodreads book page and extract its summary and genres.

    Parameters
    ----------
    gr_id : int or str
        Goodreads book id, appended to ``https://www.goodreads.com/book/show/``.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request, so a
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    tuple
        ``(summary, genre)``. ``summary`` is the text of the first
        ``<span style="display:none">`` on the page (presumably the
        full-length description; verify against the page markup), or None
        when that span is missing. ``genre`` is the list of texts of the
        genre links (empty when the page has none). Both are None when the
        request fails or the status code is not 200.

    Side effects
    ------------
    Sleeps 2-10 seconds after every call to throttle the scrape, increments
    the global ``index`` counter, and prints progress every 100 books.
    """
    global index

    summary = None
    genre = None
    url = "https://www.goodreads.com/book/show/" + str(gr_id)

    try:
        page = requests.get(url, headers={'User-Agent': 'data'}, timeout=timeout)
    except requests.RequestException as err:
        # a single network failure must not abort a scrape that runs for hours
        print('request failed:', err)
        print('error found in book with gr_id', gr_id)
    else:
        if page.status_code == 200:
            soup = bs(page.content, 'html.parser')

            # the span can be absent, so only call get_text() when it was found
            summary_tag = soup.find('span', attrs={'style': 'display:none'})
            if summary_tag is not None:
                summary = summary_tag.get_text()

            # find_all always returns a list (possibly empty), so no None check is needed
            genre = [tag.get_text()
                     for tag in soup.find_all('a', class_='actionLinkLite bookPageGenreLink')]
        else:  # handle page error
            print(page.status_code)
            print('error found in book with gr_id', gr_id)

    # random pause between requests
    time.sleep(random.randint(2, 10))

    # start the counter at zero when the notebook has not initialised it yet
    index = globals().get('index', 0) + 1
    if index % 100 == 0:
        print(index, 'books processed')

    return summary, genre

In [None]:
def get_summary(gdread_id):
    """Scrape every book id in ``gdread_id`` and collect the results.

    Parameters
    ----------
    gdread_id : iterable
        Goodreads book ids; each one is passed to ``scrape_and_run``.

    Returns
    -------
    tuple of dict
        ``(book_summary, genre_summary)``, both keyed by book id: the first
        maps each id to its summary, the second to its genre list.
    """
    summaries, genres = {}, {}
    for gr_id in gdread_id:
        # scrape_and_run returns the pair (summary, genre) for this id
        summaries[gr_id], genres[gr_id] = scrape_and_run(gr_id)
    return summaries, genres

In [None]:
def convert_df(part):
    """Turn the ``(book_summary, genre_summary)`` pair into one DataFrame.

    Parameters
    ----------
    part : tuple of dict
        Two dicts keyed by goodreads book id, as returned by ``get_summary``.

    Returns
    -------
    pandas.DataFrame
        One row per book id with the columns ``goodreads_book_id``,
        ``summary`` and ``genre``.
    """
    # each dict becomes a row, so transpose to get one row per book id, then
    # move the ids out of the index so that they can be given a column name
    frame = pd.DataFrame(part).T.reset_index()
    frame.columns = ['goodreads_book_id', 'summary', 'genre']
    return frame

In [None]:
# start scraping the Goodreads website (part 1 of 5)
index = 0  # reset the progress counter that scrape_and_run increments
part_1 = get_summary(gdread_id[0])

In [None]:
# convert the (summaries, genres) pair to a frame and save it as a csv file
# NOTE(review): written to the working directory rather than ./datasets; confirm this is intended
convert_df(part_1).to_csv('part_1.csv')

In [None]:
# reset the progress counter, then scrape the second chunk of ids
index = 0
part_2 = get_summary(gdread_id[1])

In [None]:
# save the part 2 results as a csv file
convert_df(part_2).to_csv('part_2.csv')

In [None]:
# reset the progress counter, then scrape the third chunk of ids
index = 0
part_3 = get_summary(gdread_id[2])

In [None]:
# save the part 3 results as a csv file
convert_df(part_3).to_csv('part_3.csv')

In [None]:
# reset the progress counter, then scrape the fourth chunk of ids
index = 0
part_4 = get_summary(gdread_id[3])

In [None]:
# save the part 4 results as a csv file
convert_df(part_4).to_csv('part_4.csv')

In [None]:
# reset the progress counter, then scrape the fifth (last) chunk of ids
index = 0
part_5 = get_summary(gdread_id[4])

In [None]:
# save the part 5 results as a csv file
convert_df(part_5).to_csv('part_5.csv')