## Step 1: Frame the Problem

Currently, stakeholders can identify which books are too difficult for their children/students to read, but not which books are beyond the child's/student's maturity level. This project attempts to close that gap.

## Step 2: Get the Data

[Guide](https://www.dataquest.io/blog/web-scraping-tutorial-python/)

In [7]:
import os
import string
import requests
import pandas as pd 
from bs4 import BeautifulSoup
from csv import writer

In [16]:
# Listing-page URLs for the Common Sense Media book-review index.
base_url = 'https://www.commonsensemedia.org/book-reviews'
page = '?page='  # query-string prefix placed in front of each page number
all_pages = range(1, 291)  # page numbers 1 through 290

# One URL per page number, in ascending order: <base_url>?page=<n>
all_pages_list = [f"{base_url}{page}{page_number}" for page_number in all_pages]

In [17]:
# Fetch the first listing page and parse it for exploration.
# The timeout keeps the cell from hanging indefinitely if the server stalls, and
# raise_for_status() stops an HTTP error page from being parsed as if it were a
# real listing.
page = requests.get(base_url, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

In [24]:
# Collect the text of every result-title element on the parsed listing page,
# with surrounding whitespace trimmed.
title_class = "views-field views-field-field-reference-review-ent-prod result-title"
title = [node.get_text().strip() for node in soup.find_all(class_=title_class)]

In [25]:
# Build a guessed review URL for each title: drop punctuation, lower-case, and
# join the remaining words with single hyphens.
# NOTE(review): this assumes the site's review slugs follow this pattern — confirm
# against the real links on the listing page.
stripper = str.maketrans("", "", string.punctuation)
# split()/join collapses runs of whitespace, so a title such as "A - B" yields
# "a-b" rather than "a--b" once the punctuation between the words is removed.
title_edited = ["-".join(t.translate(stripper).split()).lower() for t in title] #preps for creating url
urls = [base_url + "/" + t for t in title_edited]

In [42]:
# Pair each scraped title with its constructed URL and preview the first rows.
df = pd.DataFrame({'title': title, 'url': urls})
df.head()

Unnamed: 0,title,url
0,Black Beauty,https://www.commonsensemedia.org/book-reviews/...
1,Love,https://www.commonsensemedia.org/book-reviews/...
2,Whistle for Willie,https://www.commonsensemedia.org/book-reviews/...
3,The Little Prince,https://www.commonsensemedia.org/book-reviews/...
4,The Hobbit,https://www.commonsensemedia.org/book-reviews/...


In [45]:
# Fetch and parse a single review page (Black Beauty) to explore its layout.
# The timeout prevents an indefinite hang; raise_for_status() surfaces HTTP
# errors instead of letting an error page be parsed as a review.
page2 = requests.get('https://www.commonsensemedia.org/book-reviews/black-beauty', timeout=30)
page2.raise_for_status()

soup2 = BeautifulSoup(page2.text, 'html.parser')

In [71]:
# Locate the product-details pane before displaying it, so this cell does not
# depend on `summary` having been defined by a cell further down the notebook
# (which fails with NameError on a fresh top-to-bottom run).
summary = soup2.find(class_="shutter-summary-pane panel-pane pane-product-details")
summary

<div class="shutter-summary-pane panel-pane pane-product-details">
<h2 class="pane-title">Book details</h2>
<div class="pane-content">
<div class="inner-wrapper">
<div class="item-list"><ul id="review-product-details-list"><li class="0 first"><strong class="label">Author:</strong> <a href="/search/Anna%20Sewell" property="author" typeof="schema:Person" vocab="http://schema.org"><meta content="Anna Sewell" property="name"/>Anna Sewell</a></li>
<li class="1"><strong class="label">Genre:</strong> <a href="/reviews/category/book/genre/literary-fiction-257">Literary Fiction</a></li>
<li class="themes"><strong class="label">Topics:</strong> <a href="/reviews/category/book/topic/friendship-21616">Friendship</a>, <a href="/reviews/category/book/topic/great-boy-role-models-21617">Great Boy Role Models</a>, <a href="/reviews/category/book/topic/horses-and-farm-animals-21622">Horses and Farm Animals</a></li>
<li class="types"><strong class="label">Book type:</strong> <a href="/search?f%5B0%5D=fie

In [69]:
# Grab the product-details pane and print each of its text fragments on its own line.
details_pane_class = "shutter-summary-pane panel-pane pane-product-details"
summary = soup2.find(class_=details_pane_class)
for fragment in summary.stripped_strings:
    print(fragment)

Book details
Author:
Anna Sewell
Genre:
Literary Fiction
Topics:
Friendship
,
Great Boy Role Models
,
Horses and Farm Animals
Book type:
Fiction
Publisher:
Penguin Group
Publication date:
November 24, 1887
Publisher's recommended age(s):
9 - 12
Number of pages:
275
Available on:
Paperback, Audiobook (unabridged), Audiobook (abridged), Hardback, Kindle
Last updated:
December 04, 2019
Continue reading
Show less


In [89]:
# Look up each numbered detail item by its class string and read its text.
# Only the final expression of a cell is displayed, so the notebook shows just
# the text of the "6 last" item.
detail_classes = ["0 first", "1", "2", "3", "4", "5", "6 last"]
detail_texts = [summary.find(class_=css_class).get_text() for css_class in detail_classes]
detail_texts[-1]

'Last updated: December 04, 2019'

In [None]:
# Every element on the parsed listing page carrying the result-wrapper class.
book_wrapper_class = "content-content-wrapper"
books = soup.find_all(class_=book_wrapper_class)

In [None]:
# Write the books found on the already-parsed listing page to a fresh CSV file.
# The output folder is created first so open() cannot fail on a missing directory.
os.makedirs('lexile', exist_ok=True)

# newline='' is what the csv module expects of files it writes to (otherwise
# extra blank rows can appear on Windows). UTF-8 keeps non-ASCII characters in
# titles and author names intact and matches pandas' default when the file is
# read back.
with open('lexile/books.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = writer(csv_file)

    #create header in the csv file
    headers = ['Title', 'Description', "Author", 'Age']

    #write a row of headers in the csv
    csv_writer.writerow(headers)

    #loop: one row per book wrapper, in the same order as the header
    for book in books:
        title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
        description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
        # drop the " By " label and trailing whitespace from the author text
        author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
        # drop the "age " label so only the value remains
        age = book.find(class_="csm-green-age").get_text().replace("age ", "")
        csv_writer.writerow([title, description, author, age])

In [None]:
# Append the books from every numbered listing page to the CSV created earlier.
# The file is opened once for the whole run instead of once per page.
# newline='' is what the csv module expects of files it writes to, and UTF-8
# keeps non-ASCII characters in titles and author names intact.
with open('lexile/books.csv', 'a', newline='', encoding='utf-8') as csv_file:
    csv_writer = writer(csv_file)
    # A separate name for the URL avoids overwriting the loop variable with the response.
    for page_url in all_pages_list:
        # The timeout stops one stalled request from hanging the whole scrape;
        # raise_for_status() stops on an HTTP error rather than parsing an error page.
        page = requests.get(page_url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        books = soup.findAll(class_="content-content-wrapper")
        for book in books:
            title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
            description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
            # drop the " By " label and trailing whitespace from the author text
            author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
            # drop the "age " label so only the value remains
            age = book.find(class_="csm-green-age").get_text().replace("age ", "")
            csv_writer.writerow([title, description, author, age])

In [None]:
# Reload the scraped books and show the rows whose title contains a slash.
df = pd.read_csv('lexile/books.csv')
has_slash = df['Title'].str.contains("/")
df[has_slash]

[Get Book Covers](https://towardsdatascience.com/web-scraping-using-beautifulsoup-edd9441ba734)

In [4]:
# For each cover container on the parsed page, gather the <img> tags inside it.
cover_class = "field-content review-product-image"
covers = [container.find_all("img") for container in soup.find_all(class_=cover_class)]
# The second <img> of each container supplies the title attribute.
title = [imgs[1].get('title') for imgs in covers]

In [5]:
# Work from the second <img> of every cover container.
cover_imgs = [imgs[1] for imgs in covers]
# Drop the last 18 characters of each title attribute (presumably a fixed trailing
# label — TODO confirm) and replace "/" with "_".
title = [img.get('title')[:-18].replace("/", "_") for img in cover_imgs]
# Image address of each cover, in the same order as `title`.
cover_src = [img.get('src') for img in cover_imgs]

In [6]:
# Map each cleaned title to its cover image URL (a repeated title keeps only its last URL).
info = dict(zip(title, cover_src))
# Create the output folder in pure Python: unlike `!mkdir`, this also creates the
# parent `lexile` folder when it is missing, is safe to re-run when the folder
# already exists, and does not depend on the shell.
os.makedirs('lexile/covers', exist_ok=True)

In [7]:
# Download every cover in `info` into ./lexile/covers/, named after the book title.
for k, v in info.items():
    # URLs containing '.jpg?' are saved as .jpg; everything else is saved as .png.
    extension = '.jpg' if '.jpg?' in v else '.png'
    try:
        # Fetch before opening the file so a failed download does not leave an
        # empty image file behind; the timeout stops one stalled request from
        # hanging the whole loop.
        response = requests.get(v, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Report the cover that could not be downloaded and carry on with the rest.
        print(k, err)
        continue
    try:
        with open('./lexile/covers/' + k + extension, 'wb') as f:
            f.write(response.content)
    except FileNotFoundError as err:
        # The target path could not be created; report the title and continue.
        print(k)

In [None]:
# Walk every numbered listing page and download the covers it shows into
# ./lexile/covers/, named after the book title.
# A separate name for the URL avoids overwriting the loop variable with the response.
for page_url in all_pages_list:
    # The timeout stops one stalled request from hanging the whole scrape;
    # raise_for_status() stops on an HTTP error rather than parsing an error page.
    page = requests.get(page_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    covers = soup.findAll(class_="field-content review-product-image")
    covers = [cover.findAll("img") for cover in covers]
    title = [cover[1].get('title') for cover in covers]
    # Drop the last 18 characters of each title attribute (presumably a fixed
    # trailing label — TODO confirm) and make the name path-safe.
    title = [t[:-18].replace("/", "_") for t in title]
    cover_src = [cover[1].get('src') for cover in covers]
    info = dict(zip(title, cover_src))
    for k, v in info.items():
        # URLs containing '.jpg?' are saved as .jpg; everything else is saved as .png.
        extension = '.jpg' if '.jpg?' in v else '.png'
        try:
            # Fetch before opening the file so a failed download does not leave
            # an empty image file behind.
            response = requests.get(v, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            # Report the cover that could not be downloaded and carry on with the rest.
            print(k, err)
            continue
        try:
            with open('./lexile/covers/' + k + extension, 'wb') as f:
                f.write(response.content)
        except FileNotFoundError as err:
            # The target path could not be created; report the title and continue.
            print(k)

## Step 3: Explore the Data

## To do:


In [None]:
# Create the folder in pure Python: unlike `!mkdir`, this also creates the parent
# `lexile` folder when it is missing, is safe to re-run when the folder already
# exists, and does not depend on the shell.
os.makedirs('lexile/test', exist_ok=True)

In [None]:
# Download every cover in `info`, named after the book title.
# NOTE(review): files are written to ./lexile/covers/ even though the preceding
# cell creates lexile/test — confirm which folder is intended.
for k, v in info.items():
    # URLs containing '.jpg?' are saved as .jpg; everything else is saved as .png.
    extension = '.jpg' if '.jpg?' in v else '.png'
    try:
        # Fetch before opening the file so a failed download does not leave an
        # empty image file behind; the timeout stops one stalled request from
        # hanging the whole loop.
        response = requests.get(v, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Report the cover that could not be downloaded and carry on with the rest.
        print(k, err)
        continue
    try:
        with open('./lexile/covers/' + k + extension, 'wb') as f:
            f.write(response.content)
    except FileNotFoundError as err:
        # The target path could not be created; report the title and continue.
        print(k)