In [33]:
# !pip install -e git+https://github.com/gauravmm/jupyter-testing.git#egg=jupyter-testing

In [30]:
# setup library imports
import io, time, json
import requests
from pathlib import Path
import bs4
from bs4 import BeautifulSoup
from testing.testing import test

import pandas as pd


In [32]:
import os 

def get_html_test(get_html):
    """
    Test harness for get_html.

    Checks that the result has 25 entries, is a list, and that every
    entry is a bs4.BeautifulSoup object.
    """
    expected_page_count = 25
    parsed_pages = get_html()

    test.equal(len(parsed_pages), expected_page_count)
    test.equal(isinstance(parsed_pages, list), True)

    every_page_is_soup = all(
        isinstance(page, bs4.BeautifulSoup) for page in parsed_pages
    )
    test.equal(every_page_is_soup, True)

@test
def get_html():
    """
    Load every saved Goodreads fiction-genre HTML page from disk and parse it.

    Reads each regular, non-hidden file in ``../HTML/Fiction`` (relative to
    the current working directory) and parses it with BeautifulSoup's
    ``html.parser``.

    Files are processed in sorted (lexicographic) filename order so the
    returned list is identical on every run and platform. Note that
    lexicographic order puts e.g. ``page10`` before ``page2``.

    Returns:
        roots (list): one bs4.BeautifulSoup object per HTML file
    """
    file_path_fiction = "../HTML/Fiction"
    roots = list()

    # os.listdir yields names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(file_path_fiction)):
        full_path = os.path.join(file_path_fiction, filename)

        # Skip sub-directories and hidden entries (e.g. .DS_Store,
        # .ipynb_checkpoints): they are not saved pages and would either
        # fail to open or inflate the page count.
        if filename.startswith(".") or not os.path.isfile(full_path):
            continue

        # Decode explicitly instead of relying on the platform default.
        # NOTE(review): assumes the pages were saved as UTF-8 -- confirm.
        with open(full_path, encoding="utf-8") as f:
            roots.append(BeautifulSoup(f.read(), 'html.parser'))

    return roots
            

### TESTING get_html: PASSED 3/3
###



In [33]:
def parse_page_test(parse_page):
    """
    Test harness for parse_page.

    Parses the pages returned by get_html and checks that each of the
    first five attribute lists holds 50 * 25 entries.
    """
    parsed_attributes = parse_page(get_html())

    pages_expected = 25
    books_on_each_page = 50
    total_books = books_on_each_page * pages_expected

    # One check per attribute list, positions 0 through 4.
    for position in range(5):
        test.true(len(parsed_attributes[position]) == total_books)



@test
def parse_page(roots):
    """
    Extract book attributes from each parsed listing page.

    For every page, titles come from ``<a class="bookTitle">`` and authors
    from ``<a class="authorName">``. Rating details come from
    ``<span class="greyText smallText">`` elements nested inside
    ``<div class="left">``. Each span's text is split on whitespace and the
    tokens at positions 2, 4 and 8 are taken as the rating, the number of
    ratings and the publication date.
    (Presumably the text reads like
    "avg rating 4.28 — 1,234 ratings — published 1960" -- TODO confirm.)

    Args:
        roots (list): bs4.BeautifulSoup objects, one per HTML page

    Returns:
        book_attributes (list) :
        - book_titles, author_name, ratings, num_of_ratings, date_published
          (five lists of strings; a rating field is None when its token is
          missing from the span text)
    """
    # Whitespace-token positions of each field inside the summary span text.
    rating_idx, num_ratings_idx, published_idx = 2, 4, 8

    book_titles = list()
    author_name = list()
    ratings = list()
    num_of_ratings = list()
    date_published = list()

    def token_at(tokens, index):
        # A summary lacking trailing fields (e.g. no publication year) yields
        # None rather than an IndexError, keeping the three lists aligned.
        return tokens[index] if len(tokens) > index else None

    for root in roots:
        book_titles.extend(
            x.get_text() for x in root.find_all("a", class_="bookTitle")
        )
        author_name.extend(
            x.get_text() for x in root.find_all("a", class_="authorName")
        )

        for div in root.find_all("div", class_="left"):
            for span in div.find_all('span', {'class' : 'greyText smallText'}):
                # Split once and reuse the tokens for all three fields.
                tokens = span.get_text().split()
                ratings.append(token_at(tokens, rating_idx))
                num_of_ratings.append(token_at(tokens, num_ratings_idx))
                date_published.append(token_at(tokens, published_idx))

    book_attributes = [book_titles, author_name, ratings, num_of_ratings, date_published]

    return book_attributes

### TESTING parse_page: PASSED 5/5
###



In [34]:
def create_dataframe_test(create_dataframe):
    """
    Test harness for create_dataframe.

    Builds the frame from the parsed pages and checks that it has five
    columns and is a pandas DataFrame.
    """
    expected_column_count = 5

    parsed_attributes = parse_page(get_html())
    books_frame = create_dataframe(parsed_attributes)

    test.equal(len(books_frame.columns), expected_column_count)
    test.equal(isinstance(books_frame, pd.DataFrame), True)
    # TODO: check row and column values

    
        
@test
def create_dataframe(book_attributes, csv_path='goodreads.csv'):
    """
    Build a DataFrame from the parsed book attributes and save it as CSV.

    Args:
        book_attributes (list): five equally long lists, in this order:
            book titles, author names, ratings, number of ratings,
            publication dates
        csv_path (str or path-like, optional): where to write the CSV copy
            of the frame (without the index). Defaults to 'goodreads.csv'
            in the current working directory. Pass None to skip writing.

    Returns:
        df (pd.DataFrame) :
        - Columns: book_title, author_name, ratings, num_of_ratings, date_published
    """
    column_names = (
        'book_title',
        'author_name',
        'ratings',
        'num_of_ratings',
        'date_published',
    )

    # Column i is filled from book_attributes[i].
    df = pd.DataFrame(
        {name: book_attributes[position]
         for position, name in enumerate(column_names)}
    )

    # Side effect: persist the frame unless the caller opted out.
    if csv_path is not None:
        df.to_csv(csv_path, index=False)

    return df

### TESTING create_dataframe: PASSED 2/2
###

