## Step 1: Frame the Problem

Currently, stakeholders know can identify which books are too difficult for their children/students to read but not which books are beyond the maturity level of the child/student. This project attempts to rectify this issue. 

## Step 2: Get the Data

[Guide](https://www.dataquest.io/blog/web-scraping-tutorial-python/)

In [1]:
import os
import string
import requests
import pandas as pd 
from bs4 import BeautifulSoup
from csv import writer

In [2]:
base_url = 'https://www.commonsensemedia.org/book-reviews'
page = '?page='
all_pages = range(1,291)
all_pages_list = [base_url+page+str(p) for p in all_pages]

In [3]:
page = requests.get(base_url)

soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
title = [s.get_text().strip() for s in soup.findAll(class_="views-field views-field-field-reference-review-ent-prod result-title")]

In [5]:
stripper = str.maketrans("", "", string.punctuation)
title_edited = [t.translate(stripper).replace(" ", "-").lower() for t in title] #preps for creating url
urls = [base_url + "/" + t for t in title_edited] 

In [6]:
df = pd.DataFrame()
df['title'] = title
df['url'] = urls
df

Unnamed: 0,title,url
0,Black Beauty,https://www.commonsensemedia.org/book-reviews/...
1,Love,https://www.commonsensemedia.org/book-reviews/...
2,Whistle for Willie,https://www.commonsensemedia.org/book-reviews/...
3,The Little Prince,https://www.commonsensemedia.org/book-reviews/...
4,The Hobbit,https://www.commonsensemedia.org/book-reviews/...
5,Middle School Is Worse Than Meatloaf,https://www.commonsensemedia.org/book-reviews/...
6,"Tiger's Quest: Tiger's Curse, Book 2",https://www.commonsensemedia.org/book-reviews/...
7,Wild Wings,https://www.commonsensemedia.org/book-reviews/...
8,Junonia,https://www.commonsensemedia.org/book-reviews/...
9,The Warlock: The Secrets of the Immortal Nicho...,https://www.commonsensemedia.org/book-reviews/...


In [19]:
page2 = requests.get('https://www.commonsensemedia.org/book-reviews/black-beauty')

In [20]:
soup2 = BeautifulSoup(page2.text, 'html.parser')
summary = soup2.find(class_="shutter-summary-pane panel-pane pane-product-details")

# Get all data from the summary box at the bottom

In [22]:
links = summary.find_all('li')
links
data = []
d = {}
for link in links: 
    k, v = link.text.split(":")
    k = k.strip()
    v = v.strip()
    d[k] = v
data.append(d)
data

[{'Author': 'Anna Sewell',
  'Genre': 'Literary Fiction',
  'Topics': 'Friendship, Great Boy Role Models, Horses and Farm Animals',
  'Book type': 'Fiction',
  'Publisher': 'Penguin Group',
  'Publication date': 'November 24, 1887',
  "Publisher's recommended age(s)": '9 - 12',
  'Number of pages': '275',
  'Available on': 'Paperback, Audiobook (unabridged), Audiobook (abridged), Hardback, Kindle',
  'Last updated': 'December 04, 2019'}]

# Get data from the top of the page

In [29]:
summary2 = soup2.find(class_="panel-content-top panel-panel clearfix")

In [None]:
title = soup2.find('h1').string
description = soup2.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
cs_rec_age = soup2.find(class_="field field-name-field-review-recommended-age field-type-list-integer field-label-hidden").get_text().split(" ", 1)[1]
author = summary.find(class_="0 first").get_text().split(":", 1)[1].strip()
genre = summary.find(class_="1").get_text().split(":", 1)[1].strip()
book_type = summary.find(class_="types").get_text().split(":", 1)[1].strip()
pub = summary.find(class_="publishers").get_text().split(":", 1)[1].strip()
pub_date = summary.find(class_="2").get_text().split(":", 1)[1].strip()
pub_rec_age = summary.find(class_="3").get_text().split(":", 1)[1].strip()
pages = summary.find(class_="4").get_text().split(":", 1)[1].strip()
versions = summary.find(class_="5").get_text().split(":", 1)[1].strip()
updated = summary.find(class_="6 last").get_text().split(":", 1)[1].strip()
themes = summary.find(class_="themes").get_text().split(":", 1)[1].strip()

In [None]:
themes

In [None]:
soup2.find(id = "review-product-details-list")

# Idea: Try not putting the scrapping in loops. 

In [8]:
with open('lexile/books.csv', "a") as csv_file:
    csv_writer = writer(csv_file)
    
    #create the header
    headers = ['title', 'description', "author", "genre", 'book_type', 'publisher', "pub_date",\
               "pub_rec_age", "pages", "themes", "cs_rec_age"]
    
    csv_writer.writerow(headers)

    for url in urls:
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        summary = soup.find(class_="shutter-summary-pane panel-pane pane-product-details")
        print("hi")
        title = soup.title.get_text().split("[")[0].strip()
        print('yes')
        description = soup.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
        cs_rec_age = soup.find(class_="field field-name-field-review-recommended-age field-type-list-integer field-label-hidden").get_text().split(" ", 1)[1]
        print('boo!')

        author = summary.find(class_="0 first").get_text().split(":", 1)[1].strip()
        genre = summary.find(class_="1").get_text().split(":", 1)[1].strip()
        book_type = summary.find(class_="types").get_text().split(":", 1)[1].strip()
        pub = summary.find(class_="publishers").get_text().split(":", 1)[1].strip()
        pub_date = summary.find(class_="2").get_text().split(":", 1)[1].strip()
        pub_rec_age = summary.find(class_="3").get_text().split(":", 1)[1].strip()
        pages = summary.find(class_="4").get_text().split(":", 1)[1].strip()
        #versions = summary.find(class_="5").get_text().split(":", 1)[1].strip()
        #updated = summary.find(class_="6 last").get_text().split(":", 1)[1].strip()
        themes = summary.find(class_="themes").get_text().split(":", 1)[1].strip()
        print("hi again")
    
        csv_writer.writerow([title, description, author, genre, book_type, pub, pub_date,\
                             pub_rec_age, pages, versions, themes, cs_rec_age])

https://www.commonsensemedia.org/book-reviews/black-beauty
hi
yes
boo!
hi again
https://www.commonsensemedia.org/book-reviews/love
hi
yes
boo!
hi again
https://www.commonsensemedia.org/book-reviews/whistle-for-willie
hi
yes
boo!
hi again
https://www.commonsensemedia.org/book-reviews/the-little-prince
hi
yes
boo!
hi again
https://www.commonsensemedia.org/book-reviews/the-hobbit
hi
yes
boo!
hi again
https://www.commonsensemedia.org/book-reviews/middle-school-is-worse-than-meatloaf
hi
yes
boo!


AttributeError: 'NoneType' object has no attribute 'get_text'

# Within the Loop

In [None]:
with open('lexile/books.csv', "a") as csv_file:
    csv_writer = writer(csv_file)
    
    #create the header
    headers = ['title', 'description', "author", "genre", 'book_type', 'publisher', "pub_date",\
               "pub_rec_age", "pages", "versions", "updated", "themes", "cs_rec_age"]
    
    csv_writer.writerow(headers)

    for url in urls:
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        summary = soup.find(class_="shutter-summary-pane panel-pane pane-product-details")
        print("hi")
        for s in soup: 
            title = soup.title.get_text().split("[")[0].strip()
            print('yes')
            description = s.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
            cs_rec_age = s.find(class_="field field-name-field-review-recommended-age field-type-list-integer field-label-hidden").get_text().split(" ", 1)[1]
        print('boo!')

        for summ in summary:  

            author = summ.find(class_="0 first").get_text().split(":", 1)[1].strip()
            genre = summ.find(class_="1").get_text().split(":", 1)[1].strip()
            book_type = summ.find(class_="types").get_text().split(":", 1)[1].strip()
            pub = summ.find(class_="publishers").get_text().split(":", 1)[1].strip()
            pub_date = summ.find(class_="2").get_text().split(":", 1)[1].strip()
            pub_rec_age = summ.find(class_="3").get_text().split(":", 1)[1].strip()
            pages = summ.find(class_="4").get_text().split(":", 1)[1].strip()
            versions = summ.find(class_="5").get_text().split(":", 1)[1].strip()
            updated = summ.find(class_="6 last").get_text().split(":", 1)[1].strip()
            themes = summ.find(class_="themes").get_text().split(":", 1)[1].strip()
    
        csv_writer.writerow([title, description, author, genre, book_type, pub, pub_date,\
                             pub_rec_age, pages, versions, updated, themes, cs_rec_age])

In [None]:
with open('lexile/books.csv', "a") as csv_file:
    csv_writer = writer(csv_file)
    
    #create the header
    headers = ['title', 'description', "author", "genre", 'book_type', 'publisher', "pub_date",\
               "pub_rec_age", "pages", "versions", "updated", "themes", "cs_rec_age"]
    
    #write a row of headers in the csv
    csv_writer.writerow(headers)
    
    #for sum in summary:
    title = soup2.find('h1').string
    description = soup2.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
    cs_rec_age = soup2.find(class_="field field-name-field-review-recommended-age field-type-list-integer field-label-hidden").get_text().split(" ", 1)[1]
    #csv_writer.writerow([title, description, cs_rec_age])
    
    author = summary.find(class_="0 first").get_text().split(":", 1)[1].strip()
    genre = summary.find(class_="1").get_text().split(":", 1)[1].strip()
    book_type = summary.find(class_="types").get_text().split(":", 1)[1].strip()
    pub = summary.find(class_="publishers").get_text().split(":", 1)[1].strip()
    pub_date = summary.find(class_="2").get_text().split(":", 1)[1].strip()
    pub_rec_age = summary.find(class_="3").get_text().split(":", 1)[1].strip()
    pages = summary.find(class_="4").get_text().split(":", 1)[1].strip()
    versions = summary.find(class_="5").get_text().split(":", 1)[1].strip()
    updated = summary.find(class_="6 last").get_text().split(":", 1)[1].strip()
    themes = summary.find(class_="themes").get_text().split(":", 1)[1].strip()

    csv_writer.writerow([title, description, author, genre, book_type, publisher, pub_date,\
               pub_rec_age, pages, versions, updated, themes, cs_rec_age])
    

In [None]:
books = soup.findAll(class_="content-content-wrapper")

In [None]:
with open('lexile/books.csv', 'a') as csv_file:
    csv_writer = writer(csv_file)
    
    #create header in the csv file
    headers = ['title', 'description', "author", "age"]
    
    #write a row of headers in the csv
    csv_writer.writerow(headers)
    
    #loop
    for book in books:
        title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
        description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
        author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
        age = book.find(class_="csm-green-age").get_text().replace("age ", "")
        csv_writer.writerow([title, description, author, age])

In [None]:
for page in all_pages_list: 
    page = requests.get(page)
    soup = BeautifulSoup(page.text, 'html.parser')
    books = soup.findAll(class_="content-content-wrapper")
    with open('lexile/books.csv', 'a') as csv_file:
        csv_writer = writer(csv_file)
        for book in books:
            title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
            description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
            author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
            age = book.find(class_="csm-green-age").get_text().replace("age ", "")
            csv_writer.writerow([title, description, author, age])

In [None]:
df = pd.read_csv('lexile/books.csv')
df[df['Title'].str.contains("/")]

[Get Book Covers](https://towardsdatascience.com/web-scraping-using-beautifulsoup-edd9441ba734)

In [None]:
covers = soup.findAll(class_="field-content review-product-image")
covers = [cover.findAll("img") for cover in covers]
title = [cover[1].get('title') for cover in covers]

In [None]:
title = [cover[1].get('title') for cover in covers]
title = [t[:-18].replace("/", "_") for t in title]
cover_src = [cover[1].get('src') for cover in covers]

In [None]:
info = dict(zip(title, cover_src))
!mkdir lexile/covers

In [None]:
for k, v in info.items():
    if '.jpg?' in v:
        try:
            with open('./lexile/covers/' + k + '.jpg', 'wb') as f:
                f.write(requests.get(v).content)
        except FileNotFoundError as err:
            print(k)
    else:
        try:
            with open('./lexile/covers/' + k + '.png', 'wb') as f:
                f.write(requests.get(v).content)
        except FileNotFoundError as err:
            print(k)

In [None]:
for page in all_pages_list: 
    page = requests.get(page)
    soup = BeautifulSoup(page.text, 'html.parser')
    covers = soup.findAll(class_="field-content review-product-image")
    covers = [cover.findAll("img") for cover in covers]
    title = [cover[1].get('title') for cover in covers]
    title = [t[:-18].replace("/", "_") for t in title]
    cover_src = [cover[1].get('src') for cover in covers]
    info = dict(zip(title, cover_src))
    for k, v in info.items():
        if '.jpg?' in v:
            try:
                with open('./lexile/covers/' + k + '.jpg', 'wb') as f:
                    f.write(requests.get(v).content)
            except FileNotFoundError as err:
                print(k)
        else:
            try:
                with open('./lexile/covers/' + k + '.png', 'wb') as f:
                    f.write(requests.get(v).content)
            except FileNotFoundError as err:
                print(k)

## Step 3: Explore the Data

## To do:


In [None]:
!mkdir lexile/test

In [None]:
for k, v in info.items():
    if '.jpg?' in v:
        try:
            with open('./lexile/covers/' + k + '.jpg', 'wb') as f:
                f.write(requests.get(v).content)
        except FileNotFoundError as err:
            print(k)
    else:
        try:
            with open('./lexile/covers/' + k + '.png', 'wb') as f:
                f.write(requests.get(v).content)
        except FileNotFoundError as err:
            print(k)