In [None]:
# !pip install -e git+https://github.com/gauravmm/jupyter-testing.git#egg=jupyter-testing

In [33]:
# setup library imports
# For now the requests library will not be used since we are collecting the data manually
# import requests

import os 
import bs4
from bs4 import BeautifulSoup
from testing.testing import test

import pandas as pd
import numpy as np

In [34]:
def get_html_test(get_html):
    """Check that get_html returns a list of 25 BeautifulSoup objects for the Fiction pages."""
    # Point this at another genre directory to test it instead:
    #   "../HTML/Science", "../HTML/Religion", "../HTML/Crime"
    genre_dir = "../HTML/Fiction"
    expected_pages = 25

    parsed_pages = get_html(genre_dir)

    test.equal(len(parsed_pages), expected_pages)
    test.equal(isinstance(parsed_pages, list), True)
    test.equal(all(isinstance(page, bs4.BeautifulSoup) for page in parsed_pages), True)

@test
def get_html(file_path):
    """
    Parse every saved Goodreads HTML page found in a genre directory.

    Args:
        file_path (str): directory containing the saved HTML pages

    Returns:
        roots (list): one bs4.BeautifulSoup object per file, ordered by filename
    """
    roots = list()

    for filename in sorted(os.listdir(file_path)):
        page_path = os.path.join(file_path, filename)

        # os.listdir also returns sub-directories; open() would raise on those.
        if not os.path.isfile(page_path):
            continue

        # Read raw bytes and let Beautiful Soup work out the encoding from the
        # document itself, instead of decoding with the platform's default
        # text encoding (which differs between operating systems).
        with open(page_path, 'rb') as f:
            roots.append(BeautifulSoup(f.read(), 'html.parser'))

    return roots
            

### TESTING get_html: PASSED 3/3
###



In [35]:
def parse_page_test(parse_page):
    """Check that parse_page returns 9 attribute lists with one entry per expected book."""
    # Point this at another genre directory to test it instead:
    #   "../HTML/Science", "../HTML/Religion", "../HTML/Crime"
    genre_dir = "../HTML/Fiction"

    parsed_pages = get_html(genre_dir)
    attribute_lists = parse_page(parsed_pages)

    books_per_page, page_count = 50, 25
    expected_total = books_per_page * page_count

    test.true(len(attribute_lists) == 9)
    for values in attribute_lists:
        test.equal(len(values), expected_total)


# @test
def parse_page(roots):
    """
    Extract the per-book attributes from every parsed page.

    Args:
        roots (list): parsed pages (objects exposing bs4's find_all / get_text
            API), e.g. the list returned by get_html

    Returns:
        book_attributes (list): nine parallel lists, in this order:
        - book_id (int), book_url (str), book_title (str), author_name (str),
          ratings (str), num_of_ratings (str), date_published (str or np.nan),
          book_shelved (int), book_genre (str)
    """
    book_link_prefix = "https://www.goodreads.com"
    shelved_start = 'shelved'
    shelved_end = 'avg rating'
    shelved_keyword = " times as "

    book_id, book_url, book_title, author_name = list(), list(), list(), list()
    ratings, num_of_ratings, date_published = list(), list(), list()
    book_shelved, book_genre = list(), list()

    for root in roots:
        # Each title link carries the relative book URL, the numeric id and the title.
        for link in root.find_all("a", class_="bookTitle"):
            href = link['href']
            # The id is the leading number of ".../book/show/<id>.<slug>" or ".../book/show/<id>-<slug>".
            book_id.append(int(href.split("/book/show/")[1].split(".")[0].split("-")[0]))
            book_url.append(book_link_prefix + href)
            book_title.append(link.get_text())

        author_name.extend(x.get_text() for x in root.find_all("a", class_="authorName"))

        for div in root.find_all("div", class_="left"):
            # The text between 'shelved' and the last 'avg rating' reads
            # "<count> times as <genre>" followed by one trailing character.
            text = div.get_text()
            shelf_info = text[text.find(shelved_start) + len(shelved_start):text.rfind(shelved_end)]
            shelved_count, _, genre_part = shelf_info.partition(shelved_keyword)

            # Tolerate thousands separators such as "4,241".
            book_shelved.append(int(shelved_count.replace(",", "")))
            # Drop the last character of the first token
            # (presumably the closing parenthesis of the shelf note).
            book_genre.append(genre_part.split()[0][:-1])

            for span in div.find_all('span', {'class' : 'greyText smallText'}):
                # Whitespace-separated tokens: index 2 is the rating, index 4 the
                # number of ratings and index 8 the publication year.
                tokens = span.get_text().split()
                ratings.append(tokens[2])
                num_of_ratings.append(tokens[4])

                # If date published is not given pass in nan value
                date_published.append(tokens[8] if len(tokens) >= 9 else np.nan)

    book_attributes = [book_id, book_url, book_title, author_name, ratings, num_of_ratings, date_published, book_shelved, book_genre]

    return book_attributes


In [36]:
def create_dataframe_test(create_dataframe):
    """Check that create_dataframe returns a DataFrame with one column per attribute list."""
    # Point this at another genre directory to test it instead:
    #   "../HTML/Science", "../HTML/Religion", "../HTML/Crime"
    genre_dir = "../HTML/Fiction"

    attribute_lists = parse_page(get_html(genre_dir))
    frame = create_dataframe(attribute_lists)

    test.equal(len(frame.columns), len(attribute_lists))
    test.equal(isinstance(frame, pd.DataFrame), True)
    # TODO: check row and column values

    
        
@test
def create_dataframe(book_attributes, csv_path='goodreads_fiction.csv'):
    """
    Build a dataframe from the parsed book attributes and save it as CSV.

    Args:
        book_attributes (list): parallel lists, in this order:
            book_id, book_url, book_title, author_name, ratings,
            num_of_ratings, date_published, book_shelved, book_genre
        csv_path (str or None): destination of the CSV export (written without
            the index). Defaults to 'goodreads_fiction.csv'; pass e.g.
            'goodreads_science.csv', 'goodreads_religion.csv' or
            'goodreads_crime.csv' for the other genres, or None to skip the export.

    Returns:
        df (pd.DataFrame) :
        - Columns: book_id, book_url, book_title, author_name, ratings,
          num_of_ratings, date_published, book_shelved, book_genre
    """

    df = pd.DataFrame(
        {'book_id': book_attributes[0],
        'book_url': book_attributes[1],
        'book_title': book_attributes[2],
        'author_name': book_attributes[3],
        'ratings': book_attributes[4],
        'num_of_ratings': book_attributes[5],
        'date_published': book_attributes[6],
        'book_shelved': book_attributes[7],
        'book_genre': book_attributes[8]
        })

    # The export is a side effect of building the frame; callers choose the
    # file per genre through csv_path instead of editing this function.
    if csv_path is not None:
        df.to_csv(csv_path, index=False)

    return df

### TESTING create_dataframe: PASSED 2/2
###



In [31]:
# Directory of saved HTML pages for the genre being processed;
# uncomment one of the alternatives below to switch genre.
file_path = "../HTML/Fiction"
# file_path = "../HTML/Science"
# file_path = "../HTML/Religion"
# file_path = "../HTML/Crime"

# Parse every saved page into a BeautifulSoup tree.
roots = get_html(file_path)

# Pull the per-book attribute lists out of the parsed pages.
book_attributes = parse_page(roots)

# Assemble the lists into a DataFrame (create_dataframe also writes it to CSV).
df = create_dataframe(book_attributes)

In [32]:
# Preview the first rows of the scraped book data.
df.head()

Unnamed: 0,book_id,book_url,book_title,author_name,ratings,num_of_ratings,date_published,book_shelved,book_genre
0,2429135,https://www.goodreads.com/book/show/2429135.Th...,"The Girl with the Dragon Tattoo (Millennium, #1)",Stieg Larsson,4.15,2840117,2005,4241,crime
1,5060378,https://www.goodreads.com/book/show/5060378-th...,"The Girl Who Played with Fire (Millennium, #2)",Stieg Larsson,4.24,863312,2006,2919,crime
2,19288043,https://www.goodreads.com/book/show/19288043-g...,Gone Girl (Paperback),Gillian Flynn,4.09,2586588,2012,2893,crime
3,16299,https://www.goodreads.com/book/show/16299.And_...,And Then There Were None (Paperback),Agatha Christie,4.27,998814,1939,2835,crime
4,22557272,https://www.goodreads.com/book/show/22557272-t...,The Girl on the Train (Hardcover),Paula Hawkins,3.94,2401006,2015,2528,crime


In [None]:
# Write one book id per line to IDS.txt.
# The ids in df["book_id"] are integers and file.write() only accepts str,
# so each id is converted explicitly; `with` closes the file even on error.
with open("IDS.txt", "w") as ids_file:
    for book_id in df["book_id"]:
        ids_file.write(str(book_id))
        ids_file.write("\n")

In [None]:
# Create the output directory for the reviews; exist_ok keeps the cell
# re-runnable, and os.makedirs does not depend on the shell's mkdir.
os.makedirs("book_reviews_folder", exist_ok=True)

In [None]:
# Run get_reviews.py on the ids listed in IDS.txt, with book_reviews_folder as the output directory.
# NOTE(review): presumably this scrapes reviews through a Chrome browser — confirm against get_reviews.py.
!python get_reviews.py --book_ids_path IDS.txt \
--output_directory_path book_reviews_folder --sort_order default --browser chrome 

In [None]:
# Load the collected reviews from book_reviews_folder/all_reviews.json
# (presumably produced by get_reviews.py — verify) and display them.
reviews_df = pd.read_json('book_reviews_folder/all_reviews.json')
reviews_df

In [None]:
# Save the reviews as CSV.
# NOTE(review): no index=False here, so the index is written as an extra
# unnamed first column, unlike the book CSV export — confirm this is intended.
reviews_df.to_csv("review.csv")

In [None]:
!pip install -r requirements.txt