# Helper Classes

These classes are used to sort the list of books into a list of boxes of books.

In [1]:
'''
Wraps a book dictionary and exposes its shipping weight as a float
attribute (the dictionary itself is left unchanged).  This makes books
easier to compare when filling a BookBox.
'''
class Book(object):
    """Pair a scraped book record with its shipping weight as a number.

    Attributes:
        book: the book dictionary passed in, stored as-is.
        weight: the record's 'shipping_weight' entry converted to a float.
    """

    def __init__(self, book):
        self.book = book
        self.weight = self.__get_book_weight(book)

    def __repr__(self):
        label = type(self).__name__
        return '{}:\nweight: {}\n{}'.format(label, self.weight, self.book)

    def __get_book_weight(self, book):
        # The weight is stored as text such as '1.1 pounds'; drop the unit
        # suffix and convert what is left.
        weight_text = book['shipping_weight']
        numeric_part = weight_text.replace(' pounds', '')
        return float(numeric_part)
    

In [2]:
'''
Contains a list of Book objects limited by the sum of the Book object's
weight property.
'''
class BookBox(object):
    """A box of Book objects whose total weight is tracked against a limit.

    Args:
        max_weight: capacity of the box, in the same unit as the books'
            ``weight`` attribute. Defaults to 10.

    Attributes:
        books: the Book objects placed in the box, in insertion order.
        max_weight: the capacity given at construction.
        weight: combined weight of ``books``, rounded to one decimal.
        remaining_weight: ``max_weight - weight``, rounded to one decimal.

    Note:
        ``add_book`` does not check the limit itself; callers are expected
        to consult ``remaining_weight`` before adding a book.
    """

    def __init__(self, max_weight=10):
        self.books = []
        self.max_weight = max_weight
        self.remaining_weight = self.max_weight
        self.weight = 0

    def __repr__(self):
        return '{}:\nweight: {}\n{}'.format(self.__class__.__name__,
                                            self.weight,
                                            self.books)

    # calculate the weight of this box
    def __calc_weight(self):
        weight = 0
        for book in self.books:
            # Round after every addition so binary float error
            # (e.g. 1.1 + 2.2 == 3.3000000000000003) never reaches the total.
            weight = round(weight + book.weight, 1)
        return weight

    # update the weight values for this box
    def __update_weights(self):
        self.weight = self.__calc_weight()
        self.remaining_weight = round(self.max_weight - self.weight, 1)

    # add book to this box
    def add_book(self, book):
        """Append ``book`` and refresh ``weight`` and ``remaining_weight``."""
        self.books.append(book)
        self.__update_weights()


In [3]:
'''
Handles conversion of a list of book dictionaries to
a list of Boxes that contain Book objects.
'''
class BookObjectList(object):
    """Turn book dictionaries into Book objects and pack them into boxes.

    Packing is greedy: each box repeatedly takes the heaviest remaining
    book that still fits until nothing more fits.

    Args:
        bookList: iterable of book dictionaries accepted by ``Book``.

    Attributes:
        list: Book objects not yet boxed; empty once packing has finished.
        boxes: the filled BookBox objects, in the order they were packed.

    Raises:
        ValueError: if a book is too heavy to fit even in an empty box.
    """

    def __init__(self, bookList):
        self.list = self.__create_book_object_list(bookList)
        self.boxes = self.list_to_boxes()

    # for sorting
    def __getKey(self, book):
        return book.weight

    # convert list of books to book objects
    def __create_book_object_list(self, bookList):
        return [Book(book) for book in bookList]

    # get book that is equal weight or less
    def get_best_fit(self, weight):
        """Return the heaviest unboxed book weighing at most ``weight``.

        Returns None when no remaining book fits. Ties go to the book that
        appears first in ``self.list``.
        """
        candidates = (book for book in self.list if book.weight <= weight)
        return max(candidates, key=self.__getKey, default=None)

    # remove book from list
    def remove_book(self, book):
        self.list.remove(book)

    # move a single book from list to box
    def move_book_to_box(self, book, box):
        """Add ``book`` to ``box``, drop it from the list, return ``box``."""
        box.add_book(book)
        self.remove_book(book)
        return box

    # fill a box
    def fill_box(self):
        """Return a new BookBox filled with the best-fitting remaining books.

        The box comes back empty when no remaining book fits in it.
        """
        box = BookBox()
        while box.remaining_weight > 0:
            next_book = self.get_best_fit(box.remaining_weight)
            if next_book is None:
                break
            box = self.move_book_to_box(next_book, box)
        return box

    # take list of books and place books from list
    # into a list of boxes
    def list_to_boxes(self):
        """Pack every remaining book and return the list of boxes.

        Raises:
            ValueError: if a fresh box comes back empty, meaning none of
                the remaining books can fit in an empty box.
        """
        boxes = []
        while self.list:
            box = self.fill_box()
            if not box.books:
                # Nothing was moved, so the list would never shrink and
                # this loop would append empty boxes forever.
                raise ValueError(
                    'cannot box {} remaining book(s): each one exceeds the '
                    'capacity of an empty box'.format(len(self.list)))
            boxes.append(box)
        return boxes

# Book html Files to JSON

* Reads .html files from the specified directory
* Scrapes book information from each file using BeautifulSoup
* Sorts books into boxes based on weight limit
* Outputs the list of boxes containing the books from the files to a JSON file

In [4]:
import sys
from bs4 import BeautifulSoup
import os
import json
from pprint import pprint

'''
Takes a directory of html files from Amazon book listings
and creates a JSON data file with the information from the 
listing.
'''
class BooksToJson(object):
    """Scrape saved Amazon book pages in a directory into a boxed JSON file.

    On construction, every ``.html`` file in ``directory`` is parsed, the
    books are packed into boxes via ``BookObjectList``, and the result is
    written to ``json_file_name``. If ``directory`` is not a directory or
    holds no ``.html`` files, nothing is written.

    Args:
        directory: path of the directory holding the saved ``.html`` pages.
        json_file_name: path of the JSON file to write.

    Attributes:
        json_file_name: the output path given at construction.
        json_format: list of box dictionaries, or None if nothing was built.
        directory: the validated directory, or None if validation failed.
    """

    def __init__(self, directory, json_file_name='amazonbooks.json'):
        self.json_file_name = json_file_name
        self.json_format = None
        # Defined up front so the attribute exists even when validation fails.
        self.directory = None
        if self.__check_dir_files(directory):
            self.directory = directory
            self.json_format = self.__create_json(self.directory)
            # create file
            with open(self.json_file_name, 'w') as outfile:
                json.dump(self.json_format, outfile)

    def __check_dir_files(self, directory):
        """Return True if ``directory`` is a directory containing .html files."""
        return self.__check_dir(directory) and self.__check_files(directory)

    def __check_dir(self, directory):
        # isdir rather than exists: a path to a regular file would pass an
        # existence check and then make os.listdir raise.
        if os.path.isdir(directory):
            return True
        print('directory not found')
        return False

    def __check_files(self, directory):
        return any(name.endswith('.html') for name in os.listdir(directory))

    def __create_json(self, directory):
        """Parse every .html file in ``directory`` and return the boxed books."""
        books = []
        # os.listdir returns names in arbitrary order; sorting keeps the
        # output reproducible from run to run.
        for name in sorted(os.listdir(directory)):
            if name.endswith('.html'):
                path = os.path.join(directory, name)
                with open(path, encoding="ISO-8859-1") as file:
                    soup = BeautifulSoup(file, 'html.parser')
                books.append(self.__get_book(soup))
        return self.__pack_books(books)

    def __pack_books(self, books):
        """Pack book dictionaries into boxes; box ids start at 1."""
        bookList = BookObjectList(books)
        return [self.__get_box(box, box_id)
                for box_id, box in enumerate(bookList.boxes, start=1)]

    def __get_box(self, box, box_id):
        return {'id': box_id,
                'totalWeight': str(box.weight)+' pounds',
                'contents': self.__get_books_from_box(box)}

    def __get_books_from_box(self, box):
        # Unwrap each Book object back to its original dictionary.
        return [book.book for book in box.books]

    def __get_book(self, soup):
        """Build one book dictionary from a parsed listing page."""
        title = self.get_title(soup)
        author = self.get_author(soup)
        price = self.get_price(soup)
        weight = self.get_weight(soup)
        isbn = self.get_isbn(soup)
        book = {'title': title,
                'author': author,
                'price': price+' USD',
                'shipping_weight': weight,
                'isbn-10': int(isbn)}
        return book

    # get the tag that contains the title
    def __get_title_tag(self, soup):
        return soup.find('span', id='btAsinTitle')

    # get title of book
    def get_title(self, soup):
        """Return the title text with any binding suffix removed."""
        title_dirty = self.__get_title_tag(soup).get_text()
        return (title_dirty.replace(' [Hardcover]', '')
                           .replace(' [Paperback]', ''))

    # get author of book
    def get_author(self, soup):
        """Return the text of the first link that follows the title tag."""
        title_tag = self.__get_title_tag(soup)
        author_tag = title_tag.find_next("a", href=True)
        return author_tag.get_text()

    def get_price(self, soup):
        """Return the text of the 'bb_price' span, without surrounding whitespace."""
        price_dirty = soup.find("span", attrs={'class': 'bb_price'}).text
        return price_dirty.replace('\n', '').strip()

    def get_weight(self, soup):
        """Return the text that follows the 'Shipping Weight:' label."""
        weight_dirty = soup.find(text='Shipping Weight:').next
        # The value is followed by '(' opening the shipping-rates link.
        return weight_dirty.replace('(', '').strip()

    def get_isbn(self, soup):
        """Return the ISBN-10 that follows the 'ISBN-10:' label, as an int.

        NOTE(review): converting to int drops leading zeros, and removing
        'X' drops that check digit, so the result is not always a full
        ISBN-10. Kept as-is because the JSON output uses the integer form.
        """
        isbn_dirty = soup.find(text='ISBN-10:').next
        # remove check digit 'X' so the remainder parses as an integer
        isbn_clean = isbn_dirty.replace('X', '').strip()
        return int(isbn_clean)

    def display_json(self):
        """Pretty-print the boxed book data (None if nothing was built)."""
        pprint(self.json_format)
        

In [5]:
# Scrape the saved listings under 'Data'; the result goes to the default
# output file name.
jsonFile = BooksToJson(directory='Data')

# JSON File

In [6]:
# Pretty-print the boxed book data held by jsonFile.
jsonFile.display_json()

[{'contents': [{'author': 'Neil Gaiman',
                'isbn-10': 62255657,
                'price': '$15.22 USD',
                'shipping_weight': '9.4 pounds',
                'title': 'The Ocean at the End of the Lane: A Novel [Deckle '
                         'Edge]'}],
  'id': 1,
  'totalWeight': '9.4 pounds'},
 {'contents': [{'author': 'Alan LeMay',
                'isbn-10': 147780630,
                'price': '$8.97 USD',
                'shipping_weight': '8.8 pounds',
                'title': 'The Unforgiven'},
               {'author': 'Reza Aslan',
                'isbn-10': 140006922,
                'price': '$16.89 USD',
                'shipping_weight': '1.2 pounds',
                'title': 'Zealot: The Life and Times of Jesus of Nazareth'}],
  'id': 2,
  'totalWeight': '10.0 pounds'},
 {'contents': [{'author': 'Europa Publications',
                'isbn-10': 1857435885,
                'price': '$7,450.00 USD',
                'shipping_weight': '7.8 pounds',
 

# Test

Creates temporary directory and html files to test the collection of book information and creation of JSON file.

Each portion of the collection and JSON file creation can be unit tested individually, or all tests can be run at once for integration testing.

In [7]:
import unittest
import os
import tempfile
import shutil
 
class TestBookJson(unittest.TestCase):
    """Tests for BooksToJson using two generated book pages.

    setUp writes two HTML files into a temporary directory, runs
    BooksToJson over them, and keeps the parsed soup of book 1 plus the
    loaded JSON output for the individual tests to inspect.
    """

    def setUp(self):
        # set up books
        self.book1_title = 'Title of Book 1'
        self.book1_author = 'Book1First Book1Last'
        self.book1_price = '$11.11'
        self.book1_weight = '1.1 pounds'
        self.book1_isbn = 111111111
        self.book2_title = 'Title of Book 2'
        self.book2_author = 'Book2First Book2Last'
        self.book2_price = '$22.22'
        self.book2_weight = '2.2 pounds'
        self.book2_isbn = 22222222
        self.test_obj = self.__create_test_object()

    # test object construction and directory
    def test_construction_directory(self):
        self.assertEqual(self.test_obj.directory, self.temp_dir)

    # --- scraping a single page (book 1) ---

    def test_get_title(self):
        self.assertEqual(self.test_obj.get_title(self.soup1), self.book1_title)

    def test_get_author(self):
        self.assertEqual(self.test_obj.get_author(self.soup1), self.book1_author)

    def test_get_price(self):
        self.assertEqual(self.test_obj.get_price(self.soup1), self.book1_price)

    def test_get_weight(self):
        self.assertEqual(self.test_obj.get_weight(self.soup1), self.book1_weight)

    def test_get_isbn(self):
        self.assertEqual(self.test_obj.get_isbn(self.soup1), self.book1_isbn)

    # --- JSON output: both books fit in one box ---

    def test_json_id_box(self):
        self.assertEqual(self.json_data[0]['id'], 1)

    def test_json_weight_box(self):
        book1_weight = round(float(self.book1_weight.replace(' pounds', '')), 1)
        book2_weight = round(float(self.book2_weight.replace(' pounds', '')), 1)
        total_weight = round((book1_weight + book2_weight), 1)
        total_weight_formatted = str(total_weight)+' pounds'
        self.assertEqual(self.json_data[0]['totalWeight'], total_weight_formatted)

    # Book 2 is the heavier book, so it is packed first and appears at
    # index 0 of the box contents; book 1 follows at index 1.

    def test_json_author_book2(self):
        self.assertEqual(self.json_data[0]['contents'][0]['author'], self.book2_author)

    def test_json_title_book2(self):
        self.assertEqual(self.json_data[0]['contents'][0]['title'], self.book2_title)

    def test_json_price_book2(self):
        self.assertEqual(self.json_data[0]['contents'][0]['price'], self.book2_price+' USD')

    def test_json_weight_book2(self):
        self.assertEqual(self.json_data[0]['contents'][0]['shipping_weight'], self.book2_weight)

    def test_json_isbn_book2(self):
        self.assertEqual(self.json_data[0]['contents'][0]['isbn-10'], self.book2_isbn)

    def test_json_author_book1(self):
        self.assertEqual(self.json_data[0]['contents'][1]['author'], self.book1_author)

    def test_json_title_book1(self):
        self.assertEqual(self.json_data[0]['contents'][1]['title'], self.book1_title)

    def test_json_price_book1(self):
        self.assertEqual(self.json_data[0]['contents'][1]['price'], self.book1_price+' USD')

    def test_json_weight_book1(self):
        self.assertEqual(self.json_data[0]['contents'][1]['shipping_weight'], self.book1_weight)

    def test_json_isbn_book1(self):
        self.assertEqual(self.json_data[0]['contents'][1]['isbn-10'], self.book1_isbn)

    # creates temporary directory structure and files
    # in order to create object
    def __create_test_object(self):
        temp_dir = tempfile.mkdtemp()
        # store directory for testing
        self.temp_dir = temp_dir
        path1 = os.path.join(temp_dir, 'tempfile1.html')
        path2 = os.path.join(temp_dir, 'tempfile2.html')
        # Write the JSON inside the temp dir so a test run neither leaves a
        # file behind nor overwrites a real output file in the working
        # directory.
        json_path = os.path.join(temp_dir, 'books.json')
        # Ensure the files are read/write by the creator only
        saved_umask = os.umask(0o077)
        try:
            # create html file 1
            with open(path1, "w") as file1:
                file1.write(self.__get_html1())
            # create soup and store for testing
            with open(path1, encoding='ISO-8859-1') as soup_file1:
                self.soup1 = BeautifulSoup(soup_file1, 'html.parser')
            # create html file 2
            with open(path2, "w") as file2:
                file2.write(self.__get_html2())
            # create object while temp dir and files
            # are available
            test_obj = BooksToJson(temp_dir, json_path)

            # load up json file before it is removed
            with open(test_obj.json_file_name) as j_file:
                self.json_data = json.load(j_file)
        finally:
            # Restore the process umask and remove everything that was
            # created, whether or not set-up succeeded. Errors are left to
            # propagate so a failed set-up is reported as such.
            os.umask(saved_umask)
            shutil.rmtree(temp_dir, ignore_errors=True)
        return test_obj

    def __get_html1(self):
        # create file contents
        return self.__get_book_html(self.book1_title,
                                    self.book1_author,
                                    self.book1_price,
                                    self.book1_weight,
                                    self.book1_isbn)

    def __get_html2(self):
        # create file contents
        return self.__get_book_html(self.book2_title,
                                    self.book2_author,
                                    self.book2_price,
                                    self.book2_weight,
                                    self.book2_isbn)

    def __get_book_html(self, title, author, price, weight, isbn):
        # Minimal page fragment holding the elements the scraper looks for:
        # the btAsinTitle span, the author link after it, the bb_price span,
        # and the 'Shipping Weight:' and 'ISBN-10:' labels.
        book_html = ('<div class="buying">'
            '<h1 class="parseasinTitle ">'
            '<span id="btAsinTitle"  >{} <span  style="text-transform: '
            'capitalize; font-size: 16px;">[Hardcover]</span></span>'
            '</h1><span >'

            '<a href="/s?_encoding=UTF8&amp;field-author=Reza%20Aslan&amp;'
            'search-alias=books&amp;sort=relevancerank">{}</a>'
            '<span class="byLinePipe">(Author)</span>'
            '</span>'
            '</div>'

            '<span class="bb_price">'
            '{}      </span>'

            '<li><b>Shipping Weight:</b> {} '
            '(<a href="http://www.amazon.com/gp/help/seller/shipping.html?ie=UTF8&amp;asin=140006922X&amp;seller=ATVPDKIKX0DER">View shipping rates and policies</a>)'
            '</li>'

            '<li><b>ISBN-10:</b> {}X'
            '</li>'.format(title, author, price, weight, isbn)
             )
        return book_html

In [8]:
if __name__ == '__main__':
    # A placeholder argv stops unittest from parsing the host process's
    # command line, and exit=False stops it from calling sys.exit() when
    # the run finishes.
    unittest.main(exit=False, argv=['first-arg-is-ignored'])

..................
----------------------------------------------------------------------
Ran 18 tests in 0.202s

OK
