In [33]:
# !pip install -e git+https://github.com/gauravmm/jupyter-testing.git#egg=jupyter-testing

In [34]:
# setup library imports
import io, time, json
import requests
from pathlib import Path
from bs4 import BeautifulSoup
from testing.testing import test

import pandas as pd


In [35]:
def all_books_test(all_books):
    """
    Check the scraper passed in as `all_books`.

    Calls it with no arguments and verifies that the result is a list
    holding one entry per expected shelf page.
    """
    expected_pages = 25
    pages = all_books()

    # Length first, then container type.
    test.equal(len(pages), expected_pages)
    test.equal(isinstance(pages, list), True)


@test
def all_books(num_pages=25):
    """
    Download consecutive pages of the Goodreads "fiction" shelf.

    Requests pages 1 through `num_pages` of the shelf listing, one request
    per page, and parses each response body with BeautifulSoup.

    Args:
        num_pages (int): how many shelf pages to fetch, starting from page 1.
            Defaults to 25, which is what the accompanying tests expect.

    Returns:
        roots (list): one BeautifulSoup object per fetched page, in page order

    Raises:
        AssertionError: if any page responds with a status code other than 200
    """

    url = "https://www.goodreads.com/shelf/show/fiction"
    roots = list()  # parsed pages, in the order they were requested

    # Page numbers start at 1. Deriving the number directly from the loop
    # variable guarantees that each page is requested exactly once
    # (no repeated first page, no missing last page).
    for page in range(1, num_pages + 1):
        response = requests.get(url, params={"page": page})

        # Verify every response, not only the first one, so an error page
        # is never parsed as if it were a shelf listing.
        assert response.status_code == 200, (
            f"page {page} returned HTTP {response.status_code}"
        )

        # response.text (string): HTML of one shelf page
        roots.append(BeautifulSoup(response.text, 'html.parser'))

        time.sleep(0.2)  # short pause between consecutive requests

    return roots

In [None]:
def parse_page_test(parse_page):
    """
    Check the parser passed in as `parse_page`.

    Fetches the shelf pages with `all_books`, parses them, and verifies that
    each of the five attribute lists holds one entry per expected book.
    """
    pages = all_books()
    parsed = parse_page(pages)

    # 50 books per page across 25 pages.
    expected_total = 50 * 25

    # Indices 0-4: titles, authors, ratings, rating counts, publication dates.
    for column in range(5):
        test.true(len(parsed[column]) == expected_total)



@test
def parse_page(bs_obj):
    """
    Extract book attributes from a sequence of parsed shelf pages.

    For every page, collects the text of all `a.bookTitle` and `a.authorName`
    elements, then reads each `span` with class "greyText smallText" found
    inside a `div.left` and picks three whitespace-separated tokens out of
    its text.

    Args:
        bs_obj (iterable): parsed pages (e.g. the BeautifulSoup objects
            returned by `all_books`); each must support `find_all`

    Returns:
        book_attributes (list): five parallel lists, in this order:
        - book_titles, author_name, ratings, num_of_ratings, date_published
        Values are strings. An entry in one of the last three lists is None
        when the stats text is too short to contain that field.
    """

    def _token_at(tokens, index):
        # Return tokens[index], or None when the stats text has too few tokens.
        return tokens[index] if index < len(tokens) else None

    book_titles = list()
    author_name = list()
    ratings = list()
    num_of_ratings = list()
    date_published = list()

    for root in bs_obj:
        book_titles.extend(
            x.get_text() for x in root.find_all("a", class_="bookTitle")
        )
        author_name.extend(
            x.get_text() for x in root.find_all("a", class_="authorName")
        )

        for div in root.find_all("div", class_="left"):
            for span in div.find_all('span', {'class': 'greyText smallText'}):
                # NOTE(review): the indices below presume text shaped like
                # "avg rating 4.28 - 1,234 ratings - published 2005";
                # confirm against the live markup.
                tokens = span.get_text().split()  # split once, reuse below

                # A missing field is recorded as None instead of raising
                # IndexError, so one incomplete book cannot abort the whole
                # parse and the three lists stay the same length.
                ratings.append(_token_at(tokens, 2))
                num_of_ratings.append(_token_at(tokens, 4))
                date_published.append(_token_at(tokens, 8))

    book_attributes = [book_titles, author_name, ratings, num_of_ratings, date_published]

    return book_attributes

In [None]:
def create_dataframe_test(create_dataframe):
    """
    Check the builder passed in as `create_dataframe`.

    Runs the fetch and parse steps, builds the frame, and verifies that it is
    a pandas DataFrame with five columns.
    """
    frame = create_dataframe(parse_page(all_books()))

    test.equal(len(frame.columns), 5)
    test.equal(isinstance(frame, pd.DataFrame), True)
    # TODO: check row and column values

    
        
@test
def create_dataframe(book_attributes, csv_path='goodreads.csv'):
    """
    Build a DataFrame from the parsed book attributes and save it as CSV.

    Args:
        book_attributes (list): five parallel lists, in this order:
            book titles, author names, ratings, number of ratings,
            publication dates
        csv_path (str or None): where to write the CSV (without the index
            column). Defaults to 'goodreads.csv' in the current working
            directory; pass None to skip writing a file.

    Returns:
        df (pd.DataFrame) :
        - Columns: book_title, author_name, ratings, num_of_ratings, date_published
    """

    # Column names, listed in the same order as the lists in `book_attributes`.
    columns = ('book_title', 'author_name', 'ratings', 'num_of_ratings', 'date_published')

    df = pd.DataFrame(
        {name: book_attributes[position] for position, name in enumerate(columns)}
    )

    # Side effect: persist the frame unless the caller opted out.
    if csv_path is not None:
        df.to_csv(csv_path, index=False)

    return df