In [None]:
# !pip install -e git+https://github.com/gauravmm/jupyter-testing.git#egg=jupyter-testing

In [66]:
# setup library imports
# For now the requests library will not be used since we are collecting the data manually
# import requests

import os 
from pathlib import Path
import bs4
from bs4 import BeautifulSoup
from testing.testing import test
import pandas as pd


In [None]:
def get_html_test(get_html):
    """
    Check the value returned by `get_html`.

    Verifies that the result has 25 entries, that it is a list, and that
    every entry is a `bs4.BeautifulSoup` object.

    Args:
        get_html (callable): zero-argument function under test
    """
    expected_page_count = 25
    parsed_pages = get_html()

    test.equal(len(parsed_pages), expected_page_count)
    test.equal(isinstance(parsed_pages, list), True)

    every_page_is_soup = all(
        isinstance(page, bs4.BeautifulSoup) for page in parsed_pages
    )
    test.equal(every_page_is_soup, True)

@test
def get_html(html_dir="../HTML/Fiction"):
    """
    Load and parse every saved Goodreads HTML page found in a directory.

    Files are processed in sorted filename order. Hidden entries (names
    starting with ".", e.g. ".DS_Store") and sub-directories (e.g.
    ".ipynb_checkpoints") are skipped so that they neither crash the load
    nor inflate the page count.

    Args:
        html_dir (str): directory holding the saved HTML pages; defaults to
            the fiction-genre folder used by this notebook

    Returns:
        roots (list): one bs4.BeautifulSoup object per HTML file
    """
    roots = list()

    for filename in sorted(os.listdir(html_dir)):
        file_path = os.path.join(html_dir, filename)

        # Only regular, non-hidden files are treated as saved pages.
        if filename.startswith(".") or not os.path.isfile(file_path):
            continue

        # Explicit encoding: the platform default is not UTF-8 everywhere
        # (e.g. Windows), which would break on non-ASCII titles/authors.
        with open(file_path, encoding="utf-8") as f:
            roots.append(BeautifulSoup(f.read(), 'html.parser'))

    return roots
            

In [None]:
def parse_page_test(parse_page):
    """
    Check the shape of the value returned by `parse_page`.

    Parses the pages returned by `get_html()` and verifies that there are
    9 attribute lists, each with 25 * 50 entries.

    Args:
        parse_page (callable): function under test; receives the list of roots
    """
    num_pages = 25
    book_per_page = 50
    expected_total = num_pages * book_per_page

    book_attributes = parse_page(get_html())

    test.true(len(book_attributes) == 9)
    for attribute in book_attributes:
        test.equal(len(attribute), expected_total)


@test
def parse_page(roots):
    """
    Extract the book attributes from every parsed page.

    Args:
        roots (list): parsed pages (bs4 objects), e.g. the output of get_html()

    Returns:
        book_attributes (list): nine parallel lists, in this order:
        - book_id (int): numeric id taken from the book link
        - book_url (str): absolute goodreads.com link
        - book_title (str)
        - author_name (str)
        - ratings (str): average rating, kept as text
        - num_of_ratings (str): rating count, kept as text
        - date_published (str or None): None when the rating line has no
          ninth token (no publication year)
        - book_shelved (int): number of times the book was shelved
        - book_genre (str): first word after " times as ", minus its last character
    """
    book_link_prefix = "https://www.goodreads.com"
    shelved_marker = 'shelved'
    rating_marker = 'avg rating'
    shelf_keyword = " times as "

    book_id, book_url, book_title, author_name = list(), list(), list(), list()
    ratings, num_of_ratings, date_published = list(), list(), list()
    book_shelved, book_genre = list(), list()

    for root in roots:
        # A single pass over the title anchors yields id, url and title.
        for anchor in root.find_all("a", class_="bookTitle"):
            book_link = anchor['href']
            # Links look like /book/show/<id>.<slug> or /book/show/<id>-<slug>.
            book_id.append(int(book_link.split("/book/show/")[1].split(".")[0].split("-")[0]))
            book_url.append(book_link_prefix + book_link)
            book_title.append(anchor.get_text())

        author_name.extend(x.get_text() for x in root.find_all("a", class_="authorName"))

        # A single pass over the "left" divs yields shelf info and rating info.
        for div in root.find_all("div", class_="left"):
            text = div.get_text()
            # The shelf sentence sits between "shelved" and the last "avg rating".
            shelf_text = text[text.find(shelved_marker) + len(shelved_marker):text.rfind(rating_marker)]
            count_text, _, genre_text = shelf_text.partition(shelf_keyword)

            # Tolerate thousands separators ("41,938") in the shelved count.
            book_shelved.append(int(count_text.replace(",", "")))
            # Drop the trailing character of the first word (assumed to be
            # the closing parenthesis of "(shelved N times as genre)").
            book_genre.append(genre_text.split()[0][:-1])

            for span in div.find_all('span', {'class' : 'greyText smallText'}):
                # Token positions assume text shaped like
                # "avg rating R - N ratings - published YYYY".
                tokens = span.get_text().split()
                ratings.append(tokens[2])
                num_of_ratings.append(tokens[4])
                # Keep the lists aligned when the publication year is absent.
                date_published.append(tokens[8] if len(tokens) > 8 else None)

    book_attributes = [book_id, book_url, book_title, author_name, ratings, num_of_ratings, date_published, book_shelved, book_genre]

    return book_attributes

In [None]:
def create_dataframe_test(create_dataframe):
    """
    Check the value returned by `create_dataframe`.

    Builds the attribute lists with `get_html()` and `parse_page()`, then
    verifies that the result has one column per attribute list and is a
    pandas DataFrame.

    Args:
        create_dataframe (callable): function under test
    """
    book_attributes = parse_page(get_html())
    frame = create_dataframe(book_attributes)

    test.equal(len(frame.columns), len(book_attributes))
    test.equal(isinstance(frame, pd.DataFrame), True)
    # TODO: check row and column values

    
        
@test
def create_dataframe(book_attributes, csv_path='goodreads.csv'):
    """
    Build a dataframe from the parsed book attributes and save it as CSV.

    Args:
        book_attributes (list): nine parallel lists in the order returned by
            parse_page: book_id, book_url, book_title, author_name, ratings,
            num_of_ratings, date_published, book_shelved, book_genre
        csv_path (str or None): where to write the CSV (without the index);
            pass None to skip writing. Defaults to 'goodreads.csv'.

    Returns:
        df (pd.DataFrame) :
        - Columns: book_id, book_url, book_title, author_name, ratings,
          num_of_ratings, date_published, book_shelved, book_genre

    Raises:
        IndexError: if book_attributes holds fewer than nine lists
    """
    column_names = (
        'book_id',
        'book_url',
        'book_title',
        'author_name',
        'ratings',
        'num_of_ratings',
        'date_published',
        'book_shelved',
        'book_genre',
    )

    # Index by position (instead of zip) so a too-short input raises
    # IndexError rather than silently producing fewer columns.
    df = pd.DataFrame(
        {name: book_attributes[position] for position, name in enumerate(column_names)}
    )

    if csv_path is not None:
        df.to_csv(csv_path, index=False)

    return df