# WEBSCRAPING AMAZON.COM USING SELENIUM AND BEAUTIFULSOUP

In [356]:
import pandas as pd
import numpy as np
import csv

In [357]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
import re
import pprint
from selenium import webdriver

# SOCIOLOGY

In [358]:
driver = webdriver.Chrome(executable_path="C://chromedriver.exe")

In [359]:
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"}

In [360]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A11288&s={}"
    search_term = search_term.replace(' ','-')
    return template.format(search_term)

In [361]:
url = get_url('review rank')
print(url)

https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A11288&s=review-rank


In [362]:
driver.get(url)

## Extract rhe collection

In [363]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [364]:
results = soup.find_all('div', {'class': 'a-section a-spacing-medium'})

In [365]:
len(results)

16

## Prototype the record

In [443]:
item = results[0]

In [444]:
book_title = item.find('span', class_="a-size-medium a-color-base a-text-normal")

In [445]:
book_title = book_title.text

In [446]:
book_title

'Discrimination and Disparities'

In [447]:
author_parent = item.find('div', class_="a-row a-size-base a-color-secondary")

In [448]:
author = author_parent.find('a', class_="a-size-base a-link-normal").text.strip()

In [449]:
author

'Thomas Sowell'

In [450]:
review_rating_parent = item.find('a', class_="a-popover-trigger a-declarative")

In [451]:
review_rating = review_rating_parent.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()

In [452]:
review_rating

'4.9 out of 5 stars'

In [453]:
review_count_parent = item.find('div', class_="a-row a-size-small")

In [454]:
review_count = review_count_parent.find_all('span')[3].text.strip()

In [455]:
review_count

'997'

## Generalise the pattern

In [456]:
def extract_record(item):
    """Extract and return data from a single record"""
    
    # description and url
    book_title = item.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title = book_title.text
    
    try:
        # author and url
        author_parent = item.find('div', class_="a-row a-size-base a-color-secondary")
        author = author_parent.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent = item.find('a', class_="a-popover-trigger a-declarative")
        review_rating = review_rating_parent.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating = '0'
    
    try:
        # review count
        review_count_parent = item.find('div', class_="a-row a-size-small")
        review_count = review_count_parent.find_all('span')[3].text.strip()
    except AttributeError:
        review_count = '0'
        
    result = (book_title, author, review_rating, review_count)
    return result

In [457]:
records = []
results = soup.find_all('div', {'class': 'a-section a-spacing-medium'})

for item in results:
    record = extract_record(item)
    if record:
        records.append(record)

In [462]:
records[15]

('Rac(e)ing to Class: Confronting Poverty and Race in Schools and Classrooms',
 'H. Richard Milner IV',
 '4.9 out of 5 stars',
 '51')

In [459]:
for row in records:
    print(row[1])

Thomas Sowell
Angela Y. Davis
Unspecified
Isabel Wilkerson
Angela Y. Davis
Unspecified
Unspecified
Unspecified
Mehrsa Baradaran
Shomari Wills
Robert B. Reich
Unspecified
LedByDonkeys
John C. Maxwell
Thom Hartmann
H. Richard Milner IV


## Getting the next page

In [460]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A11288&s={}"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'
    
    return url

In [426]:
print(url)

https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A11288&s=review-rank


## Putting it all together

In [461]:
#import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A11288&s={}"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

def extract_record(item):
    """Extract and return data from a single record"""
    
    # description and url
    book_title = item.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title = book_title.text
    
    try:
        # author and url
        author_parent = item.find('div', class_="a-row a-size-base a-color-secondary")
        author = author_parent.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent = item.find('a', class_="a-popover-trigger a-declarative")
        review_rating = review_rating_parent.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating = '0'
    
    try:
        # review count
        review_count_parent = item.find('div', class_="a-row a-size-small")
        review_count = review_count_parent.find_all('span')[3].text.strip()
    except AttributeError:
        review_count = '0'
        
    result = (book_title, author, review_rating, review_count)
    return result


def main(search_term): 
    """run main program routine"""
    driver = webdriver.Chrome(executable_path="C://chromedriver.exe")
    
    record = []
    
    url = get_url(search_term)
    
    for page in range(2, 76):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'class': 'a-section a-spacing-medium'})
        
        for item in results:
            record = extract_record(item)
            if record:    
                records.append(record)
                
    driver.close()
    # save the data to csv file
    with open('subs_sociology.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Book Title', 'Author', 'Review Rating', 'Review Count'])
        writer.writerows(records)

In [464]:
main('review rank')

In [466]:
sociology = pd.read_csv('sub_sociology.csv')

In [467]:
sociology

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,Discrimination and Disparities,Thomas Sowell,4.9 out of 5 stars,997
1,"Women, Race & Class",Angela Y. Davis,4.9 out of 5 stars,683
2,I See You: How Love Opens Our Eyes to Invisibl...,Unspecified,4.9 out of 5 stars,75
3,Caste (Oprah's Book Club): The Origins of Our ...,Isabel Wilkerson,4.8 out of 5 stars,5708
4,The Meaning of Freedom: And Other Difficult Di...,Angela Y. Davis,4.9 out of 5 stars,71
...,...,...,...,...
1195,Only The Strongest Women Become Social Workers...,Claire Shepherd,5.0 out of 5 stars,3
1196,Who Is the First-Class Ghanaian?: A Story of T...,Unspecified,5.0 out of 5 stars,3
1197,LA JIBARITA (Spanish Edition),Unspecified,5.0 out of 5 stars,3
1198,Max Weber: The Lawyer as Social Thinker (Key S...,Frank Parkin,5.0 out of 5 stars,3


# LITERATURE

In [599]:
driver = webdriver.Chrome(executable_path="C://chromedriver.exe")

In [602]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A17&s={}&qid=1602588965&ref=sr_pg_1"
    search_term = search_term.replace(' ','-')
    return template.format(search_term)

In [603]:
url1 = get_url('review rank')
print(url1)

https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A17&s=review-rank&qid=1602588965&ref=sr_pg_1


In [604]:
driver.get(url1)

In [605]:
soup1 = BeautifulSoup(driver.page_source, 'html.parser')

In [606]:
results1 = soup1.find_all('div', {'class': 'a-section a-spacing-medium'})

In [607]:
len(results1)

16

In [608]:
item1 = results1[0]

In [609]:
book_title1 = item1.find('span', class_="a-size-medium a-color-base a-text-normal")

In [610]:
book_title1 = book_title1.text

In [611]:
book_title1

'Manga in Theory and Practice: The Craft of Creating Manga'

In [612]:
author_parent1 = item1.find('div', class_="a-row a-size-base a-color-secondary")

In [613]:
author1 = author_parent1.find('a', class_="a-size-base a-link-normal").text.strip()

In [614]:
author1

'Hirohiko Araki'

In [615]:
review_rating_parent1 = item1.find('a', class_="a-popover-trigger a-declarative")

In [616]:
review_rating1 = review_rating_parent1.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()

In [617]:
review_rating1

'4.9 out of 5 stars'

In [618]:
review_count_parent1 = item1.find('div', class_="a-row a-size-small")

In [619]:
review_count1 = review_count_parent1.find_all('span')[3].text.strip()

In [620]:
review_count1

'733'

In [621]:
def extract_record(item1):
    """Extract and return data from a single record"""
    
    # description and url
    book_title1 = item1.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title1 = book_title1.text
    
    try:
        # author and url
        author_parent1 = item1.find('div', class_="a-row a-size-base a-color-secondary")
        author1 = author_parent1.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author1 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent1 = item1.find('a', class_="a-popover-trigger a-declarative")
        review_rating1 = review_rating_parent1.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating1 = '0'
    
    try:
        # review count
        review_count_parent1 = item1.find('div', class_="a-row a-size-small")
        review_count1 = review_count_parent1.find_all('span')[3].text.strip()
    except AttributeError:
        review_count1 = '0'
        
    result1 = (book_title1, author1, review_rating1, review_count1)
    return result1

In [622]:
records1 = []
results1 = soup1.find_all('div', {'class': 'a-section a-spacing-medium'})

for item1 in results1:
    record1 = extract_record(item1)
    if record1:
        records1.append(record1)

In [623]:
records1[0]

('Manga in Theory and Practice: The Craft of Creating Manga',
 'Hirohiko Araki',
 '4.9 out of 5 stars',
 '733')

In [624]:
for row in records1:
    print(row[1])

Hirohiko Araki
Liza K Womack
Wanda E. Brunstetter
Stephanie Nicole Norris
Ross Patterson
Katie Fox
B. Love
Rick Remender
Robin Patchen
Rick Kirkman
DJ COLBERT
Brandilyn Collins
Bernard Lee DeLeo
Rick Kirkman
James H. Drescher
Kathryn Stockett


In [625]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A17&s={}&qid=1602588965&ref=sr_pg_1"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

In [630]:
#import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A17&s={}&qid=1602588965&ref=sr_pg_1"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

def extract_record(item1):
    """Extract and return data from a single record"""
    
    # description and url
    book_title1 = item1.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title1 = book_title1.text
    
    try:
        # author and url
        author_parent1 = item1.find('div', class_="a-row a-size-base a-color-secondary")
        author1 = author_parent1.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author1 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent1 = item1.find('a', class_="a-popover-trigger a-declarative")
        review_rating1 = review_rating_parent1.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating1 = '0'
    
    try:
        # review count
        review_count_parent1 = item1.find('div', class_="a-row a-size-small")
        review_count1 = review_count_parent1.find_all('span')[3].text.strip()
    except AttributeError:
        review_count1 = '0'
        
    result1 = (book_title1, author1, review_rating1, review_count1)
    return result1

def main1(search_term): 
    """run main program routine"""
    driver = webdriver.Chrome(executable_path="C://chromedriver.exe")
    
    record1 = []
    
    url1 = get_url(search_term)
    
    for page in range(2, 76):
        driver.get(url1.format(page))
        soup1 = BeautifulSoup(driver.page_source, 'html.parser')
        results1 = soup1.find_all('div', {'class': 'a-section a-spacing-medium'})
        
        for item1 in results1:
            record1 = extract_record(item1)
            if record1:    
                records1.append(record1)
                
    driver.close()
    # save the data to csv file
    with open('LitFiction.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Book Title', 'Author', 'Review Rating', 'Review Count'])
        writer.writerows(records1)

In [631]:
main1('review rank')

In [633]:
lit_fiction = pd.read_csv('LitFiction.csv')

In [634]:
lit_fiction

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,Manga in Theory and Practice: The Craft of Cre...,Hirohiko Araki,4.9 out of 5 stars,733
1,The Dance Class: Book One in the series: Mama ...,Liza K Womack,4.9 out of 5 stars,289
2,Amish Cooking Class - The Blessing,Wanda E. Brunstetter,4.9 out of 5 stars,247
3,The Sweetest Surrender (Falling For A Rose Boo...,Stephanie Nicole Norris,4.9 out of 5 stars,186
4,"When Darkness Falls, He Doesn't Catch It",Ross Patterson,4.9 out of 5 stars,146
...,...,...,...,...
1227,The Sub-American Dream,Unspecified,5.0 out of 5 stars,7
1228,Orange Sun,Gerald A. Loeb,5.0 out of 5 stars,7
1229,Life Is Unfinished Without the Language of Poetry,Unspecified,5.0 out of 5 stars,7
1230,There Is Only Lampyridae,Unspecified,5.0 out of 5 stars,7


# BUSINESS AND MONEY

In [717]:
driver = webdriver.Chrome(executable_path="C://chromedriver.exe")

In [718]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A3&s={}&dc&qid=1602606880&ref=sr_st_review-rank"
    search_term = search_term.replace(' ','-')
    return template.format(search_term)

In [719]:
url2 = get_url('review rank')
print(url2)

https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A3&s=review-rank&dc&qid=1602606880&ref=sr_st_review-rank


In [720]:
driver.get(url2)

In [721]:
soup2 = BeautifulSoup(driver.page_source, 'html.parser')

In [722]:
results2 = soup2.find_all('div', {'class': 'a-section a-spacing-medium'})

In [723]:
len(results2)

16

In [724]:
item2 = results2[0]

In [725]:
book_title2 = item2.find('span', class_="a-size-medium a-color-base a-text-normal")

In [726]:
book_title2 = book_title2.text

In [727]:
book_title2

'The Clean Money Revolution: Reinventing Power, Purpose, and Capitalism'

In [728]:
author_parent2 = item2.find('div', class_="a-row a-size-base a-color-secondary")

In [729]:
author2 = author_parent2.find('a', class_="a-size-base a-link-normal").text.strip()

In [730]:
author2

'Joel Solomon'

In [731]:
review_rating_parent2 = item2.find('a', class_="a-popover-trigger a-declarative")

In [732]:
review_rating2 = review_rating_parent2.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()

In [733]:
review_rating2

'5.0 out of 5 stars'

In [734]:
review_count_parent2 = item2.find('div', class_="a-row a-size-small")

In [735]:
review_count2 = review_count_parent2.find_all('span')[3].text.strip()

In [736]:
review_count2

'79'

In [737]:
def extract_record(item2):
    """Extract and return data from a single record"""
    
    # description and url
    book_title2 = item2.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title2 = book_title2.text
    
    try:
        # author and url
        author_parent2 = item2.find('div', class_="a-row a-size-base a-color-secondary")
        author2 = author_parent2.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author2 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent2 = item2.find('a', class_="a-popover-trigger a-declarative")
        review_rating2 = review_rating_parent2.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating2 = '0'
    
    try:
        # review count
        review_count_parent2 = item2.find('div', class_="a-row a-size-small")
        review_count2 = review_count_parent2.find_all('span')[3].text.strip()
    except AttributeError:
        review_count2 = '0'
        
    result2 = (book_title2, author2, review_rating2, review_count2)
    return result2

In [738]:
records2 = []
results2 = soup2.find_all('div', {'class': 'a-section a-spacing-medium'})

for item2 in results2:
    record2 = extract_record(item2)
    if record2:
        records2.append(record2)

In [739]:
records2[0]

('The Clean Money Revolution: Reinventing Power, Purpose, and Capitalism',
 'Joel Solomon',
 '5.0 out of 5 stars',
 '79')

In [740]:
for row in records2:
    print(row[0])

The Clean Money Revolution: Reinventing Power, Purpose, and Capitalism
Fear Is Not the Boss of You: How to Get Out of Your Head and Live the Life You Were Made For
Smuggler's Cove: Exotic Cocktails, Rum, and the Cult of Tiki
Discrimination and Disparities
Do It! Speaking: 77 Instant-Action Ideas to Market, Monetize, and Maximize Your Expertise
The Age of Influence: The Power of Influencers to Elevate Your Brand
Unlocking Greatness: The Unexpected Journey from the Life You Have to the Life You Want
Leadership Begins with Motivation: 33 Unique Ways to Think & Act Like a Successful Leader That Will Transform Your Professional & Personal Life
Do Good: Embracing Brand Citizenship to Fuel Both Purpose and Profit
The Seven Spiritual Laws of Success
The War on Normal People: The Truth About America's Disappearing Jobs and Why Universal Basic Income Is Our Future
Beyond a Million: The Entrepreneur’s Playbook for Expanding Wealth, Freedom and Time
Buy, Rehab, Rent, Refinance, Repeat: The BRRRR R

In [741]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A3&s={}&dc&qid=1602606880&ref=sr_st_review-rank"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

In [746]:
#import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=categories&i=stripbooks&rh=n%3A283155%2Cn%3A3&s={}&dc&qid=1602606880&ref=sr_st_review-rank"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

def extract_record(item2):
    """Extract and return data from a single record"""
    
    # description and url
    book_title2 = item2.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title2 = book_title2.text
    
    try:
        # author and url
        author_parent2 = item2.find('div', class_="a-row a-size-base a-color-secondary")
        author2 = author_parent2.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author2 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent2 = item2.find('a', class_="a-popover-trigger a-declarative")
        review_rating2 = review_rating_parent2.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating2 = '0'
    
    try:
        # review count
        review_count_parent2 = item2.find('div', class_="a-row a-size-small")
        review_count2 = review_count_parent2.find_all('span')[3].text.strip()
    except AttributeError:
        review_count2 = '0'
        
    result2 = (book_title2, author2, review_rating2, review_count2)
    return result2

def main2(search_term): 
    """run main program routine"""
    driver = webdriver.Chrome(executable_path="C://chromedriver.exe")
    
    record2 = []
    
    url2 = get_url(search_term)
    
    for page in range(2, 76):
        driver.get(url2.format(page))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        results2 = soup2.find_all('div', {'class': 'a-section a-spacing-medium'})
        
        for item2 in results2:
            record2 = extract_record(item2)
            if record2:    
                records2.append(record2)
                
    driver.close()
    # save the data to csv file
    with open('business and money.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Book Title', 'Author', 'Review Rating', 'Review Count'])
        writer.writerows(records2)

In [747]:
main2('review rank')

In [748]:
bus_money = pd.read_csv('business and money.csv')

In [749]:
bus_money 

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,"The Clean Money Revolution: Reinventing Power,...",Joel Solomon,5.0 out of 5 stars,79
1,Fear Is Not the Boss of You: How to Get Out of...,Jennifer Allwood,4.9 out of 5 stars,1126
2,"Smuggler's Cove: Exotic Cocktails, Rum, and th...",Martin Cate,4.9 out of 5 stars,1093
3,Discrimination and Disparities,Thomas Sowell,4.9 out of 5 stars,1001
4,Do It! Speaking: 77 Instant-Action Ideas to Ma...,David Newman,4.9 out of 5 stars,116
...,...,...,...,...
1243,RCM--Gateway to World Class Maintenance,Anthony M. Smith,5.0 out of 5 stars,4
1244,Teacher Strike!: Public Education and the Maki...,Jon Shelton,5.0 out of 5 stars,4
1245,Customer Service Management Training 101: Quic...,Renee Evenson,0,8
1246,Knock Your Socks Off Service Recovery,Ron Zemke,5.0 out of 5 stars,4


# RELIGION

In [750]:
driver = webdriver.Chrome(executable_path="C://chromedriver.exe")

In [751]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A22&s=review-rank&dc&qid=1602610498&rnid=1000&ref=sr_st_review-rank"
    search_term = search_term.replace(' ','-')
    return template.format(search_term)

In [752]:
url3 = get_url('review rank')
print(url3)

https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A22&s=review-rank&dc&qid=1602610498&rnid=1000&ref=sr_st_review-rank


In [753]:
driver.get(url3)

In [754]:
soup3 = BeautifulSoup(driver.page_source, 'html.parser')

In [755]:
results3 = soup3.find_all('div', {'class': 'a-section a-spacing-medium'})

In [756]:
len(results3)

16

In [757]:
item3 = results3[0]

book_title3 = item3.find('span', class_="a-size-medium a-color-base a-text-normal")
book_title3 = book_title3.text

author_parent3 = item3.find('div', class_="a-row a-size-base a-color-secondary")
author3 = author_parent3.find('a', class_="a-size-base a-link-normal").text.strip()

review_rating_parent3 = item3.find('a', class_="a-popover-trigger a-declarative")
review_rating3 = review_rating_parent3.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()

review_count_parent3 = item3.find('div', class_="a-row a-size-small")
review_count3 = review_count_parent3.find_all('span')[3].text.strip()

In [758]:
def extract_record(item3):
    """Extract and return data from a single record"""
    
    # description and url
    book_title3 = item3.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title3 = book_title3.text
    
    try:
        # author and url
        author_parent3 = item3.find('div', class_="a-row a-size-base a-color-secondary")
        author3 = author_parent3.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author3 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent3 = item3.find('a', class_="a-popover-trigger a-declarative")
        review_rating3 = review_rating_parent3.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating3 = '0'
    
    try:
        # review count
        review_count_parent3 = item3.find('div', class_="a-row a-size-small")
        review_count3 = review_count_parent3.find_all('span')[3].text.strip()
    except AttributeError:
        review_count3 = '0'
        
    result3 = (book_title3, author3, review_rating3, review_count3)
    return result3

In [759]:
records3 = []
results3 = soup3.find_all('div', {'class': 'a-section a-spacing-medium'})

for item3 in results3:
    record3 = extract_record(item3)
    if record3:
        records3.append(record3)

In [760]:
records3[0]

('The Holy Word for Morning Revival - The Experience of Christ',
 'Witness Lee',
 '5.0 out of 5 stars',
 '147')

In [762]:
for row in records3:
    print(row[0])

The Holy Word for Morning Revival - The Experience of Christ
The Friend who Forgives (Tales That Tell the Truth)
Church Undivided: Exploring God’s Vision for Unity in 1 Corinthians 8–10
Both Sides Now: A True Story of Love, Loss and Bold Living.
The Holy Word for Morning Revival - Crystallization-study of Deuteronomy, Volume 2
The Glory Within: The Interior Life and the Power of Speaking in Tongues
The Holy Word for Morning Revival - Crystallization-study of Deuteronomy, Volume 1
New Morning Mercies: A Daily Gospel Devotional
Leading the Other Way: How to Change the Church Planting World
God's Creative Power for Healing
Parenting: 14 Gospel Principles That Can Radically Change Your Family
Refuting Rabbinic Objections to Christianity & Messianic Prophecies
Praying Through It: 365 Days Worth of Prayers That Make Praying Easy
NKJV, Adventure Bible, Leathersoft, Blue, Full Color
The Read-Aloud Family: Making Meaningful and Lasting Connections with Your Kids
The Believer's Authority


In [767]:
#import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A22&s={}&dc&qid=1602610498&rnid=1000&ref=sr_st_review-rank"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

def extract_record(item3):
    """Extract and return data from a single record"""
    
    # description and url
    book_title3 = item3.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title3 = book_title3.text
    
    try:
        # author and url
        author_parent3 = item3.find('div', class_="a-row a-size-base a-color-secondary")
        author3 = author_parent3.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author3 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent3 = item3.find('a', class_="a-popover-trigger a-declarative")
        review_rating3 = review_rating_parent3.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating3 = '0'
    
    try:
        # review count
        review_count_parent3 = item3.find('div', class_="a-row a-size-small")
        review_count3 = review_count_parent3.find_all('span')[3].text.strip()
    except AttributeError:
        review_count3 = '0'
        
    result3 = (book_title3, author3, review_rating3, review_count3)
    return result3

def main3(search_term): 
    """run main program routine"""
    driver = webdriver.Chrome(executable_path="C://chromedriver.exe")
    
    record3 = []
    
    url3 = get_url(search_term)
    
    for page in range(2, 76):
        driver.get(url3.format(page))
        soup3 = BeautifulSoup(driver.page_source, 'html.parser')
        results3 = soup3.find_all('div', {'class': 'a-section a-spacing-medium'})
        
        for item3 in results3:
            record3 = extract_record(item3)
            if record3:    
                records3.append(record3)
                
    driver.close()
    # save the data to csv file
    with open('Religion.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Book Title', 'Author', 'Review Rating', 'Review Count'])
        writer.writerows(records3)

In [768]:
main3('review rank')

In [769]:
religion = pd.read_csv('Religion.csv')

In [770]:
religion

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,The Holy Word for Morning Revival - The Experi...,Witness Lee,5.0 out of 5 stars,147
1,The Friend who Forgives (Tales That Tell the T...,Dan DeWitt,5.0 out of 5 stars,140
2,Church Undivided: Exploring God’s Vision for U...,Bob Ingle,5.0 out of 5 stars,103
3,"Both Sides Now: A True Story of Love, Loss and...",Nancy Sharp,5.0 out of 5 stars,103
4,The Holy Word for Morning Revival - Crystalliz...,Witness Lee,5.0 out of 5 stars,91
...,...,...,...,...
1243,The Practice of God's Presence,Unspecified,4.8 out of 5 stars,51
1244,Gutsy Girls: Strong Christian Women Who Impact...,Amy L. Sullivan,4.8 out of 5 stars,51
1245,The Holy Word for Morning Revival - Crystalliz...,Witness Lee,4.8 out of 5 stars,51
1246,How Many Times Do I Have to Tell You?,Unspecified,4.8 out of 5 stars,51


# POLITICS AND SOCIAL SCIENCE

In [1154]:
driver = webdriver.Chrome(executable_path="C://chromedriver.exe")

In [1155]:
def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A3377866011&s={}&dc&qid=1602614413&rnid=1000&ref=sr_nr_n_1"
    search_term = search_term.replace(' ','-')
    return template.format(search_term)

In [1156]:
url4 = get_url('review rank')
print(url4)

https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A3377866011&s=review-rank&dc&qid=1602614413&rnid=1000&ref=sr_nr_n_1


In [1157]:
driver.get(url4)

In [1158]:
soup4 = BeautifulSoup(driver.page_source, 'html.parser')

In [1159]:
results4 = soup4.find_all('div', {'class': 'a-section a-spacing-medium'})

In [1160]:
len(results4)

16

In [1161]:
item4 = results4[0]

book_title4 = item4.find('span', class_="a-size-medium a-color-base a-text-normal")
book_title4 = book_title4.text

author_parent4 = item4.find('div', class_="a-row a-size-base a-color-secondary")
author4 = author_parent4.find('a', class_="a-size-base a-link-normal").text.strip()

review_rating_parent4 = item4.find('a', class_="a-popover-trigger a-declarative")
review_rating4 = review_rating_parent4.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()

review_count_parent4 = item4.find('div', class_="a-row a-size-small")
review_count4 = review_count_parent4.find_all('span')[3].text.strip()

In [1162]:
def extract_record(item4):
    """Extract and return data from a single record"""
    
    # description and url
    book_title4 = item4.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title4 = book_title4.text
    
    try:
        # author and url
        author_parent4 = item4.find('div', class_="a-row a-size-base a-color-secondary")
        author4 = author_parent4.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author4 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent4 = item4.find('a', class_="a-popover-trigger a-declarative")
        review_rating4 = review_rating_parent4.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating4 = '0'
    
    try:
        # review count
        review_count_parent4 = item4.find('div', class_="a-row a-size-small")
        review_count4 = review_count_parent4.find_all('span')[3].text.strip()
    except AttributeError:
        review_count4 = '0'
        
    result4 = (book_title4, author4, review_rating4, review_count4)
    return result4

In [1163]:
records4 = []
results4 = soup4.find_all('div', {'class': 'a-section a-spacing-medium'})

for item4 in results4:
    record4 = extract_record(item4)
    if record4:
        records4.append(record4)

In [1164]:
records4[0]

('Both Sides Now: A True Story of Love, Loss and Bold Living.',
 'Nancy Sharp',
 '5.0 out of 5 stars',
 '103')

In [1165]:
for row in records4:
    print(row[0])

Both Sides Now: A True Story of Love, Loss and Bold Living.
Esther the Wonder Pig: Changing the World One Heart at a Time
Discrimination and Disparities
Black Labor, White Wealth : The Search for Power and Economic Justice
Dirt to Soil: One Family’s Journey into Regenerative Agriculture
The Quest for Cosmic Justice
O Livro dos Médiuns (Portuguese Edition)
Getting Life: An Innocent Man's 25-Year Journey from Prison to Peace
Farming While Black: Soul Fire Farm’s Practical Guide to Liberation on the Land
Stealth Invasion: Muslim Conquest Through Immigration and Resettlement Jihad
An Introduction to Celebrity Protection and Touring: A Guide to Mastering the Business of Vip Security
Temple Grandin's Guide to Working with Farm Animals: Safe, Humane Livestock Handling Practices for the Small Farm
Invasão Vertical dos Bárbaros (Coleção Abertura Cultural) (Portuguese Edition)
Super Soldiers: A Salute to the Comic Book Heroes and Villains Who Fought for Their Country
Tiki Pop (Bibliotheca Univer

In [1166]:
#import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """generate a url from search term"""
    template = "https://www.amazon.com/s?k=department&i=stripbooks&rh=n%3A283155%2Cn%3A3377866011&s={}&dc&qid=1602614413&rnid=1000&ref=sr_nr_n_1"
    search_term = search_term.replace(' ','-')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page={}'  
    
    return url

def extract_record(item4):
    """Extract and return data from a single record"""
    
    # description and url
    book_title4 = item4.find('span', class_="a-size-medium a-color-base a-text-normal")
    book_title4 = book_title4.text
    
    try:
        # author and url
        author_parent4 = item4.find('div', class_="a-row a-size-base a-color-secondary")
        author4 = author_parent4.find('a', class_="a-size-base a-link-normal").text.strip()
    except AttributeError:
        author4 = 'Unspecified'
   
    try:
        # review ratings
        review_rating_parent4 = item4.find('a', class_="a-popover-trigger a-declarative")
        review_rating4 = review_rating_parent4.find('i', class_="a-icon a-icon-star-small a-star-small-5 aok-align-bottom").text.strip()
    except AttributeError:
        review_rating4 = '0'
    
    try:
        # review count
        review_count_parent4 = item4.find('div', class_="a-row a-size-small")
        review_count4 = review_count_parent4.find_all('span')[3].text.strip()
    except AttributeError:
        review_count4 = '0'
        
    result4 = (book_title4, author4, review_rating4, review_count4)
    return result4

def main4(search_term): 
    """run main program routine"""
    driver = webdriver.Chrome(executable_path="C://chromedriver.exe")
    
    record4 = []
    
    url4 = get_url(search_term)
    
    for page in range(2, 76):
        driver.get(url4.format(page))
        soup4 = BeautifulSoup(driver.page_source, 'html.parser')
        results4 = soup4.find_all('div', {'class': 'a-section a-spacing-medium'})
        
        for item4 in results4:
            record4 = extract_record(item4)
            if record4:    
                records4.append(record4)
                
    driver.close()
    # save the data to csv file
    with open('Politics.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Book Title', 'Author', 'Review Rating', 'Review Count'])
        writer.writerows(records4)

In [1167]:
main4('review rank')

In [1197]:
politics = pd.read_csv('Politics.csv')

In [1186]:
politics

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,"Both Sides Now: A True Story of Love, Loss and...",Nancy Sharp,5.0 out of 5 stars,103
1,Esther the Wonder Pig: Changing the World One ...,Steve Jenkins,4.9 out of 5 stars,1456
2,Discrimination and Disparities,Thomas Sowell,4.9 out of 5 stars,1004
3,"Black Labor, White Wealth : The Search for Pow...",Claud Anderson,4.9 out of 5 stars,904
4,Dirt to Soil: One Family’s Journey into Regene...,Gabe Brown,4.9 out of 5 stars,401
...,...,...,...,...
1179,Fast Food Nation: The Dark Side of the All-Ame...,Eric Schlosser,0,2297
1180,Jack Kemp: The Bleeding-Heart Conservative Who...,Unspecified,0,58
1181,The Classic Treasury Of Aesop's Fables (Childr...,Unspecified,0,1861
1182,The End of Faith,Unspecified,0,1749


#### SINCE ALL RATINGS ARE OUT OF 5, WE MAY KEEP ONLY THE RATING AND REMOVE THE EXTRA PART OF IT

In [846]:
sociology['Review Rating'] = sociology['Review Rating'].apply(lambda x: x.split()[0])

In [847]:
sociology['Review Rating'] = pd.to_numeric(sociology['Review Rating'])

In [850]:
literature = lit_fiction

In [852]:
literature['Review Rating'] = literature['Review Rating'].apply(lambda x: x.split()[0])

In [853]:
literature['Review Rating'] = pd.to_numeric(literature['Review Rating'])

In [854]:
bus_money['Review Rating'] = bus_money['Review Rating'].apply(lambda x: x.split()[0])

In [855]:
bus_money['Review Rating'] = pd.to_numeric(bus_money['Review Rating'])

In [856]:
religion['Review Rating'] = religion['Review Rating'].apply(lambda x: x.split()[0])

In [857]:
religion['Review Rating'] = pd.to_numeric(religion['Review Rating'])

In [858]:
politics['Review Rating'] = politics['Review Rating'].apply(lambda x: x.split()[0])

In [859]:
politics['Review Rating'] = pd.to_numeric(politics['Review Rating'])

## GETTING THE VALUE COUNTS OF THE RATINGS

In [1071]:
# FOR LITERATURE CATEGORY
literature

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,Manga in Theory and Practice: The Craft of Cre...,Hirohiko Araki,4.9,733
1,The Dance Class: Book One in the series: Mama ...,Liza K Womack,4.9,289
2,Amish Cooking Class - The Blessing,Wanda E. Brunstetter,4.9,247
3,The Sweetest Surrender (Falling For A Rose Boo...,Stephanie Nicole Norris,4.9,186
4,"When Darkness Falls, He Doesn't Catch It",Ross Patterson,4.9,146
...,...,...,...,...
1227,The Sub-American Dream,Unspecified,5.0,7
1228,Orange Sun,Gerald A. Loeb,5.0,7
1229,Life Is Unfinished Without the Language of Poetry,Unspecified,5.0,7
1230,There Is Only Lampyridae,Unspecified,5.0,7


In [1095]:
pliterature = literature[['Review Rating']]

In [1096]:
pliterature

Unnamed: 0,Review Rating
0,4.9
1,4.9
2,4.9
3,4.9
4,4.9
...,...
1227,5.0
1228,5.0
1229,5.0
1230,5.0


In [1097]:
# convert the review ratings to range and rename
cut_labels_5 = ['0 star','1 star', '2 stars', '3 stars', '4 stars', '5 stars']
cut_bins = [-1, 0.9, 1.9, 2.9, 3.9, 4.9, 5]
pliterature['Review Rating'] = pd.cut(pliterature['Review Rating'], bins = cut_bins, labels = cut_labels_5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [1098]:
# add value counts to the revew ratings
pliterature['Freq'] = pliterature['Review Rating'].map(pliterature['Review Rating'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1099]:
pliterature

Unnamed: 0,Review Rating,Freq
0,4 stars,257
1,4 stars,257
2,4 stars,257
3,4 stars,257
4,4 stars,257
...,...,...
1227,5 stars,81
1228,5 stars,81
1229,5 stars,81
1230,5 stars,81


In [1103]:
stat_lit = pliterature[['Review Rating', 'Freq']].sort_values(by = 'Review Rating', ascending = False)

In [1104]:
# set review ratings as index
stat_lit = stat_lit.set_index('Review Rating')

In [1106]:
# eliminate duplicate index
stat_lit = stat_lit[~stat_lit.index.duplicated(keep = 'last')]

In [1107]:
stat_lit

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,81
4 stars,257
0 star,894


In [None]:
# FOR SOCIOLOGY CATEGORY
sociology

In [1109]:
psociology = sociology[['Review Rating']]

In [1111]:
# convert the review ratings to range and rename
cut_labels_5 = ['0 star','1 star', '2 stars', '3 stars', '4 stars', '5 stars']
cut_bins = [-1, 0.9, 1.9, 2.9, 3.9, 4.9, 5]
psociology['Review Rating'] = pd.cut(psociology['Review Rating'], bins = cut_bins, labels = cut_labels_5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [1112]:
#add value counts to review ratings
psociology['Freq'] = psociology['Review Rating'].map(psociology['Review Rating'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1113]:
psociology

Unnamed: 0,Review Rating,Freq
0,4 stars,108
1,4 stars,108
2,4 stars,108
3,4 stars,108
4,4 stars,108
...,...,...
1195,5 stars,183
1196,5 stars,183
1197,5 stars,183
1198,5 stars,183


In [1115]:
stat_sociology = psociology[['Review Rating', 'Freq']].sort_values(by = 'Review Rating', ascending = False)

# set review rating as index
stat_sociology = stat_sociology.set_index('Review Rating')

#eliminate duplicate index
stat_sociology = stat_sociology[~stat_sociology.index.duplicated(keep = 'last')]

In [1116]:
stat_sociology

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,183
4 stars,108
0 star,909


In [1117]:
# FOR BUSINESS AND MONEY CATEGORY
bus_money

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,"The Clean Money Revolution: Reinventing Power,...",Joel Solomon,5.0,79
1,Fear Is Not the Boss of You: How to Get Out of...,Jennifer Allwood,4.9,1126
2,"Smuggler's Cove: Exotic Cocktails, Rum, and th...",Martin Cate,4.9,1093
3,Discrimination and Disparities,Thomas Sowell,4.9,1001
4,Do It! Speaking: 77 Instant-Action Ideas to Ma...,David Newman,4.9,116
...,...,...,...,...
1243,RCM--Gateway to World Class Maintenance,Anthony M. Smith,5.0,4
1244,Teacher Strike!: Public Education and the Maki...,Jon Shelton,5.0,4
1245,Customer Service Management Training 101: Quic...,Renee Evenson,0.0,8
1246,Knock Your Socks Off Service Recovery,Ron Zemke,5.0,4


In [1118]:
pbus_money = bus_money[['Review Rating']]

In [1119]:
# convert the review ratings to range and rename
cut_labels_5 = ['0 star','1 star', '2 stars', '3 stars', '4 stars', '5 stars']
cut_bins = [-1, 0.9, 1.9, 2.9, 3.9, 4.9, 5]
pbus_money['Review Rating'] = pd.cut(bus_money['Review Rating'], bins = cut_bins, labels = cut_labels_5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [1120]:
#add value counts to review ratings
pbus_money['Freq'] = pbus_money['Review Rating'].map(pbus_money['Review Rating'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [1121]:
pbus_money

Unnamed: 0,Review Rating,Freq
0,5 stars,142
1,4 stars,202
2,4 stars,202
3,4 stars,202
4,4 stars,202
...,...,...
1243,5 stars,142
1244,5 stars,142
1245,0 star,904
1246,5 stars,142


In [1122]:
stat_bus_money = pbus_money[['Review Rating', 'Freq']].sort_values(by = 'Review Rating', ascending = False)

# set review rating as index
stat_bus_money = stat_bus_money.set_index('Review Rating')

#eliminate duplicate index
stat_bus_money = stat_bus_money[~stat_bus_money.index.duplicated(keep = 'last')]

In [1130]:
stat_bus_money

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,142
4 stars,202
0 star,904


In [1139]:
# FOR RELIGION CATEGORY
religion

Unnamed: 0,Book Title,Author,Review Rating,Review Count
0,The Holy Word for Morning Revival - The Experi...,Witness Lee,5.0,147
1,The Friend who Forgives (Tales That Tell the T...,Dan DeWitt,5.0,140
2,Church Undivided: Exploring God’s Vision for U...,Bob Ingle,5.0,103
3,"Both Sides Now: A True Story of Love, Loss and...",Nancy Sharp,5.0,103
4,The Holy Word for Morning Revival - Crystalliz...,Witness Lee,5.0,91
...,...,...,...,...
1243,The Practice of God's Presence,Unspecified,4.8,51
1244,Gutsy Girls: Strong Christian Women Who Impact...,Amy L. Sullivan,4.8,51
1245,The Holy Word for Morning Revival - Crystalliz...,Witness Lee,4.8,51
1246,How Many Times Do I Have to Tell You?,Unspecified,4.8,51


In [1132]:
preligion = religion[['Review Rating']]

In [1133]:
# convert the review ratings to range and rename
cut_labels_5 = ['0 star','1 star', '2 stars', '3 stars', '4 stars', '5 stars']
cut_bins = [-1, 0.9, 1.9, 2.9, 3.9, 4.9, 5]
preligion['Review Rating'] = pd.cut(religion['Review Rating'], bins = cut_bins, labels = cut_labels_5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [1134]:
#add value counts to review ratings
preligion['Freq'] = preligion['Review Rating'].map(preligion['Review Rating'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [1135]:
preligion

Unnamed: 0,Review Rating,Freq
0,5 stars,164
1,5 stars,164
2,5 stars,164
3,5 stars,164
4,5 stars,164
...,...,...
1243,4 stars,927
1244,4 stars,927
1245,4 stars,927
1246,4 stars,927


In [1136]:
stat_religion = preligion[['Review Rating', 'Freq']].sort_values(by = 'Review Rating', ascending = False)

# set review rating as index
stat_religion = stat_religion.set_index('Review Rating')

#eliminate duplicate index
stat_religion = stat_religion[~stat_religion.index.duplicated(keep = 'last')]

In [1137]:
stat_religion

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,164
4 stars,927
0 star,157


In [1217]:
stat_bus_money
plt.savefig('business.png')

<Figure size 432x288 with 0 Axes>

In [1221]:
stat_bus_money

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,142
4 stars,202
0 star,904


In [1218]:
stat_sociology
plt.savefig('politics.png')

<Figure size 432x288 with 0 Axes>

In [1222]:
stat_sociology

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,183
4 stars,108
0 star,909


In [1219]:
stat_lit
plt.savefig('literature.png')

<Figure size 432x288 with 0 Axes>

In [1223]:
stat_lit

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,81
4 stars,257
0 star,894


In [1220]:
stat_religion
plt.savefig('religion.png')

<Figure size 432x288 with 0 Axes>

In [1224]:
stat_religion

Unnamed: 0_level_0,Freq
Review Rating,Unnamed: 1_level_1
5 stars,164
4 stars,927
0 star,157
