In [None]:
import csv
import datetime
import os
import re
import time
from os.path import exists
from random import uniform

import requests
from bs4 import BeautifulSoup


In [None]:
# Best-seller listing: the page number is appended to this URL when it is requested
top_100_url = 'https://www.amazon.com.br/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg='
top_100_url_page_number = 1

# Review pages: the request URL is begin + <book id> + end + <page number>
book_reviews_url_begin = 'https://www.amazon.com.br/texto/product-reviews/'
book_reviews_url_end = '/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber='
book_reviews_url_page_number = 1

# Output locations (relative to the notebook's working directory)
csv_file_path = 'datasets/original_complete_csv_file.csv'
log_file_path = 'log/read_top_100_log.txt'

# Counters updated by the scraping cell
books_with_reviews = books_saved = 0

In [None]:
# Records the moment the scraping run begins
def start_counting_time(log_file):
    """Announce the start of the run and return its timestamp.

    The start time is printed and written to ``log_file`` formatted as
    dd/mm/YYYY HH:MM:SS.

    Args:
        log_file: writable text stream that receives the log line.

    Returns:
        The ``datetime.datetime`` captured when the function was called.
    """
    began_at = datetime.datetime.now()
    stamp = format(began_at, "%d/%m/%Y %H:%M:%S")

    print("Beginning web scrapping at ", stamp, "\n")
    log_file.write("Beginning web scrapping at " + stamp + "\n")

    return began_at

In [None]:
# Records the moment the scraping run ends and reports the totals
def finish_counting_time(start_time, books_with_reviews, log_file, books_saved):
    """Log the end of the run together with a one-line summary.

    Args:
        start_time: ``datetime.datetime`` returned when the run started.
        books_with_reviews: number of books with reviews that were found.
        log_file: writable text stream that receives the log lines.
        books_saved: number of books whose reviews were saved.

    The finish time (dd/mm/YYYY HH:MM:SS) and the summary, including the
    elapsed time in seconds, are both written to ``log_file`` and printed.
    """
    ended_at = datetime.datetime.now()
    stamp = format(ended_at, "%d/%m/%Y %H:%M:%S")

    log_file.write("\nFinished web scrapping at " + stamp + "\n")
    print("\nFinished web scrapping at", stamp)

    elapsed_seconds = (ended_at - start_time).total_seconds()
    # Build the summary once; the same text goes to the log and to stdout
    summary = "".join([
        str(books_with_reviews),
        " books with reviews were found among top 100 books, ",
        str(books_saved),
        " where saved and the script took ",
        str(elapsed_seconds),
        " seconds to complete.\n",
    ])
    log_file.write(summary)
    print(summary)

In [None]:
# Write the review line in the csv file
def write_review(review, writer, book_id, review_number, log_file):
    """Extract one review's fields and append them to the csv as one row.

    Args:
        review: parsed review element; it must provide ``find_all``, ``get``
            and ``select`` and render its markup through ``str()``.
        writer: object with a ``writerow`` method.
        book_id: id of the book the review belongs to.
        review_number: running review counter, used only in the log line.
        log_file: writable text stream that receives the log line.

    The row is ``book_id;review_id;rating;title;text``. Semicolons and double
    quotes are removed from the title and the text so they cannot break that
    layout. Nothing is written when the review has no text span.
    """
    # Rating: the digit that follows "a-star-" in the review markup, if any
    match = re.search(r'a-star-(\d)', str(review))
    rating = match.group(1) if match else ''

    # Review title
    anchor = review.find_all("a", {"class": "review-title"})
    if anchor:
        # Fall back to the anchor's own text when it has no <span> child
        title_node = anchor[0].span
        if title_node is None:
            title_node = anchor[0]
        title = str(title_node.text)
        # Keep wink smileys readable: this must run BEFORE ';' becomes ','
        title = title.replace(';)', ':)')
        title = title.replace(';', ',')
        title = title.replace('"', '')
        title = title.strip()
    else:
        title = ''

    # Review id (None when the element has no id attribute)
    review_id = review.get('id')

    # Review text: the last <span> inside the text container, when there is one
    review_text_content = review.select('.review-text-content')
    spans = review_text_content[0].select('span') if review_text_content else []

    log_file.write('\t\tread review #' + str(review_number) + ' whose id is ' + str(review_id) + '\n')
    print('\t\tread review #' + str(review_number) + ' whose id is ' + str(review_id))

    # Reviews without any text are skipped
    if not spans:
        return

    text = str(spans[-1])
    for old, new in ((';', ','), ('"', ''), ('<span>', ''), ('</span>', ''),
                     ('<br/>', ''), ('</br>', ''), ('<br>', '')):
        text = text.replace(old, new)
    text = text.strip()

    # NOTE(review): the row is handed to the writer as ONE pre-joined field, so a
    # ';'-delimited csv writer quotes the whole line. Kept as-is to stay
    # compatible with rows already stored on disk.
    fields = [str(book_id), review_id or '', rating, title, text]
    writer.writerow([';'.join(fields)])

In [None]:
# Get all book ids already stored in the file
def get_books_from_file(csv_file_path, log_file):
    """Return the set of book ids that already have rows in the csv file.

    Args:
        csv_file_path: path of the csv file; it may not exist yet.
        log_file: writable text stream that receives the log lines.

    Returns:
        A set of book-id strings, empty when the file is missing or holds no
        review rows.

    Only lines that START with a book id (an optional double quote, digits,
    then ';') are counted. Review texts may span several physical lines, and
    digits inside those continuation lines must not be mistaken for book ids.
    """
    books = set()

    if exists(csv_file_path):
        # Read with the same encoding the file is written with
        with open(csv_file_path, 'r', encoding='UTF8') as csv_file:
            log_file.write("\nBooks already found in the csv file:\n")
            print("\nBooks already found in the csv file:")

            # Stream the file line by line instead of loading it whole
            for line in csv_file:
                match = re.match(r'"?(\d+);', line)
                if match:
                    books.add(match.group(1))

    # Sorted so the listing is stable between runs
    for book in sorted(books):
        log_file.write("\t" + book + "\n")
        print("\t" + book)

    log_file.write("\n " + str(len(books)) + " books where found.\n")
    print("\n " + str(len(books)) + " books where found.\n")
    return books

In [None]:
# Get all the book reviews for that page
def get_book_reviews(book_reviews_url_begin, book_id, book_reviews_url_end, book_reviews_url_page_number, books_with_reviews, log_file, max_retries=30, timeout=30):
    """Download one page of reviews for a book and return the review elements.

    Args:
        book_reviews_url_begin: URL part placed before the book id.
        book_id: id of the book whose reviews are requested.
        book_reviews_url_end: URL part placed after the book id.
        book_reviews_url_page_number: review page to request.
        books_with_reviews: running book counter, used only in the log line.
        log_file: writable text stream that receives the log lines.
        max_retries: how many times a failed request is retried before giving
            up; ``None`` retries forever.
        timeout: seconds to wait for the server on each request.

    Returns:
        The elements matching ``.review`` on the page (possibly an empty
        list), or ``False`` when the server answers 404 or the retries are
        exhausted.

    Any status other than 200/404, and any network error, is retried after a
    random 1-5 second pause.
    """
    def _log(message):
        # Every message goes to the log file and to stdout
        log_file.write(message + '\n')
        print(message)

    _log('\tBook #' + str(books_with_reviews) + ' whose id is: ' + str(book_id) + ', reading page #' + str(book_reviews_url_page_number))

    url = book_reviews_url_begin + str(book_id) + book_reviews_url_end + str(book_reviews_url_page_number)

    attempt = 0
    while True:
        try:
            # Without a timeout a stalled connection would block the run forever
            book_page = requests.get(url, timeout=timeout)
            status_code = book_page.status_code
            _log("\t\tServer response status code: " + str(status_code))
        except requests.RequestException as error:
            # Network-level failure: treat it like a retryable server error
            book_page = None
            status_code = None
            _log("\t\tRequest failed: " + str(error))

        if status_code == 200:
            break

        if status_code == 404:
            _log("\t\tGoing to try next book (404).")
            return False

        if max_retries is not None and attempt >= max_retries:
            _log("\t\tGiving up after " + str(attempt) + " retries.")
            return False

        attempt += 1
        _log("\t\tGoing to retry.")
        time.sleep(uniform(1.0, 5.0))

    book_soup = BeautifulSoup(book_page.text, 'html.parser')

    # Selecting the reviews from the page
    return book_soup.select('.review')


In [None]:
def get_next_page(top_100_url, top_100_url_page_number, log_file, max_retries=30, timeout=30):
    """Download one page of the best-seller listing.

    Args:
        top_100_url: listing URL; the page number is appended to it.
        top_100_url_page_number: listing page to request.
        log_file: writable text stream that receives the log lines.
        max_retries: how many times a 503 answer or a network error is
            retried before giving up; ``None`` retries forever.
        timeout: seconds to wait for the server on each request.

    Returns:
        The response object of the last request. Its status code is 503 when
        the retries were exhausted, so callers must still check it.

    Raises:
        requests.RequestException: when the request keeps failing at network
            level after ``max_retries`` retries.
    """
    def _log(message):
        # Every message goes to the log file and to stdout
        log_file.write(message + '\n')
        print(message)

    url = top_100_url + str(top_100_url_page_number)

    attempt = 0
    while True:
        out_of_retries = max_retries is not None and attempt >= max_retries
        try:
            # Without a timeout a stalled connection would block the run forever
            top_100_books_page = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # There is no response object to hand back, so let the error surface
            if out_of_retries:
                raise
            _log("\tRequest failed (" + str(error) + "), going to retry.")
        else:
            _log("\tServer response status code: " + str(top_100_books_page.status_code))

            # Only 503 is retried; any other status is returned to the caller
            if top_100_books_page.status_code != 503:
                return top_100_books_page

            if out_of_retries:
                _log("\tGiving up after " + str(attempt) + " retries.")
                return top_100_books_page

            _log("\tSever responded with 503, going to retry.")

        attempt += 1
        time.sleep(uniform(1.0, 5.0))

In [None]:
def get_books_information(top_100_books_page):
    """Parse a best-seller listing response and return its book containers.

    Args:
        top_100_books_page: response object whose ``text`` holds the page HTML.

    Returns:
        The elements matching the ``.zg-grid-general-faceout`` selector,
        as returned by BeautifulSoup's ``select``.
    """
    parsed_page = BeautifulSoup(top_100_books_page.text, 'html.parser')
    return parsed_page.select('.zg-grid-general-faceout')

In [None]:
def get_book_id(div_book):
    """Extract the book id from a book container's markup.

    Args:
        div_book: book element (or markup string); it is converted with
            ``str()`` before searching.

    Returns:
        The digits that follow ``pd_rd_i=`` in the markup, or an empty string
        when the marker is missing or is not followed by digits.
    """
    # NOTE(review): only digits are captured, so an id containing letters
    # would be cut at the first letter - confirm whether such ids can occur.
    match = re.search(r'pd_rd_i=(\d*)', str(div_book))
    return match.group(1) if match else ''

In [None]:
def book_has_reviews(div_book):
    """Tell whether a book container carries the 'a-icon-row' marker.

    The notebook treats the presence of that marker as meaning the book has
    reviews.

    Args:
        div_book: book element (or markup string); it is converted with
            ``str()`` before searching.

    Returns:
        The ``re.Match`` for 'a-icon-row' (truthy) when present, else ``None``.
    """
    markup = str(div_book)
    return re.search('a-icon-row', markup)

In [None]:
# Main scraping run: reads the best-seller pages and stores every review found in the csv file

# Number of best-seller pages to visit (the top 100 books are divided into 2 pages)
TOP_100_LAST_PAGE = 2
# Review pages read in a row before pausing, and the length of that pause in seconds
REVIEW_PAGES_PER_BATCH = 10
BATCH_PAUSE_SECONDS = 3

# Start the counters from zero so re-running this cell does not add to a previous run's totals
books_with_reviews = 0
books_saved = 0

# Create the output folders when they are missing, otherwise open() fails
for output_path in (log_file_path, csv_file_path):
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# The with-block closes the log file even when the run stops with an error
with open(log_file_path, "w", encoding='UTF8') as log_file:
    # Start counting the time
    start_time = start_counting_time(log_file)
    # Get the books that are already stored in the csv file
    books_already_stored = get_books_from_file(csv_file_path, log_file)

    # The header is needed only when the csv file is missing or empty; this must be
    # checked BEFORE the file is opened in append mode (which creates it)
    needs_header = not exists(csv_file_path) or os.path.getsize(csv_file_path) == 0

    with open(csv_file_path, 'a', encoding='UTF8', newline='') as f:
        writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_MINIMAL)

        if needs_header:
            writer.writerow(['book_id;review_id;review_rating;review_title;review_text;'])

        # A for loop (instead of a counter left at its final value) lets this cell be
        # re-run without re-running the configuration cell first
        for top_100_url_page_number in range(1, TOP_100_LAST_PAGE + 1):

            top_100_books_page = get_next_page(top_100_url, top_100_url_page_number, log_file)
            # Skip this listing page when the request did not succeed
            if top_100_books_page.status_code != 200:
                continue

            # Get all the divs of books
            div_books = get_books_information(top_100_books_page)

            # For each book found
            for div_book in div_books:

                # Get the book id
                book_id = get_book_id(div_book)
                # Books without reviews are ignored
                has_reviews = book_has_reviews(div_book)
                if not has_reviews:
                    continue

                # Increment the book with reviews counter
                books_with_reviews += 1
                log_file.write('\nGoing to read the book #' + str(books_with_reviews) + ' whose id is ' + str(book_id) + '\n')
                print('\nGoing to read the book #' + str(books_with_reviews) + ' whose id is ' + str(book_id))

                # Check if the book is already stored in the csv file
                if book_id in books_already_stored:
                    log_file.write('\tBook #' + str(books_with_reviews) + ' whose id is ' + str(book_id) + ' is already stored in the file.\n')
                    print('\tBook #' + str(books_with_reviews) + ' whose id is ' + str(book_id) + ' is already stored in the file.')
                    # Skip this book because it is already in the file
                    continue

                end = False
                x = 1
                review_number = 0

                # While it finds reviews
                while not end:

                    # Reads a batch of pages and then pauses, trying to avoid 503 back from Amazon
                    for book_reviews_url_page_number in range(x, x + REVIEW_PAGES_PER_BATCH):

                        # Get the book reviews for this page
                        reviews = get_book_reviews(book_reviews_url_begin, book_id, book_reviews_url_end, book_reviews_url_page_number, books_with_reviews, log_file)

                        # No reviews (empty page or failed request): this book is finished
                        if not reviews:
                            log_file.write('\t\tDidn\'t find more reviews.\n')
                            print('\t\tDidn\'t find more reviews.\n')
                            end = True
                            break

                        # The book counts as saved as soon as its first page of reviews is found
                        if review_number == 0:
                            books_saved += 1

                        log_file.write('\t\tFound more ' + str(len(reviews)) + ' reviews.\n')
                        print('\t\tFound more ' + str(len(reviews)) + ' reviews.')
                        # For each review found in the page, write the data into the csv file
                        for review in reviews:
                            review_number += 1
                            write_review(review, writer, book_id, review_number, log_file)

                    # Advance to the next page that hasn't been reached yet
                    x += REVIEW_PAGES_PER_BATCH
                    # Pause between batches (and between books), trying to avoid 503 back from Amazon
                    time.sleep(BATCH_PAUSE_SECONDS)

    # Finish counting the time (the csv file is closed, the log is still open)
    finish_counting_time(start_time, books_with_reviews, log_file, books_saved)