[Guide](https://www.dataquest.io/blog/web-scraping-tutorial-python/)

In [1]:
import requests
import pandas as pd 
from bs4 import BeautifulSoup
from csv import writer

In [2]:
# Listing pages are addressed as <base_url>?page=<n>; build one URL per page number.
base_url = 'https://www.commonsensemedia.org/book-reviews'
page = '?page='
all_pages = range(1, 291)
all_pages_list = ['{}{}{}'.format(base_url, page, number) for number in all_pages]

In [3]:
# Fetch the un-paginated listing page and parse it into a BeautifulSoup tree.
# NOTE: this rebinds `page`, which until now held the '?page=' query prefix.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
page = requests.get(base_url, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
# One element per book entry on the parsed page (matched by CSS class).
# find_all is the PEP 8 name of the same method the camelCase findAll points to.
books = soup.find_all(class_="content-content-wrapper")

In [29]:
# Create lexile/books.csv from scratch: a header row, then one row per entry in `books`.
# newline='' is what the csv module expects of files handed to csv.writer (without it
# every row is followed by a blank line on Windows), and an explicit utf-8 encoding
# keeps the file identical across platforms.
with open('lexile/books.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = writer(csv_file)

    # Column names, in the same order as the values written for each book below.
    headers = ['Title', 'Description', "Author", 'Age']
    csv_writer.writerow(headers)

    for book in books:
        # Each field is located by the exact class string of its element.
        # find() returns None when nothing matches, so an entry missing any of
        # these elements raises AttributeError on .get_text().
        title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
        description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
        # Drop the " By " prefix and trailing whitespace from the author text.
        author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
        age = book.find(class_="csm-green-age").get_text()
        csv_writer.writerow([title, description, author, age])

In [6]:
# Load the scraped rows back from disk, then preview the first five.
df = pd.read_csv('lexile/books.csv')
df.head(n=5)

Unnamed: 0,Title,Description,Author,Age
0,96 Miles,Kids seek safety in riveting tale of post-dis...,J. L. Esplin,age 9+
1,Tomorrow I'll Be Kind,Book about kindness is beautiful but preachy.,Jessica Hische,age 3+
2,Chirp,Girl finds voice to call out harassment in tr...,Kate Messner,age 10+
3,"The Night Country: The Hazel Wood, Book 2",Tough teen battles dark magic in inventive fa...,Melissa Albert,age 13+
4,Foul Is Fair,"Dark, violent revenge tale is a teen update o...",Hannah Capin,age 14+


In [13]:
# Books per age label, ordered by label. The labels are strings, so the order is
# lexicographic ("age 10+" sorts before "age 2+").
(
    df['Age']
    .value_counts()
    .sort_index()
)

age 10+    559
age 11+    198
age 12+    502
age 13+    559
age 14+    638
age 15+    220
age 16+     89
age 17+     18
age 2+     147
age 3+     282
age 4+     788
age 5+     198
age 6+     219
age 7+     168
age 8+     600
age 9+     623
Name: Age, dtype: int64

In [15]:
# Number of distinct values in the Author column.
unique_authors = df['Author'].unique()
len(unique_authors)

2926

In [20]:
# (number of rows, number of columns) of the loaded table.
df.shape

(5808, 4)

In [32]:
# Scrape every numbered listing page and append its books to lexile/books.csv.
# No header row is written here; that is done by the cell that creates the file.
# Re-running this cell appends the same rows a second time.
#
# The file is opened once for the whole run rather than once per page.
# newline='' is what the csv module expects of files handed to csv.writer, and the
# explicit utf-8 encoding keeps the output identical across platforms.
with open('lexile/books.csv', 'a', newline='', encoding='utf-8') as csv_file:
    csv_writer = writer(csv_file)

    for page_url in all_pages_list:
        # Separate names for the URL and the response: the loop variable is no
        # longer overwritten by the object fetched from it.
        response = requests.get(page_url, timeout=30)
        if not response.ok:
            # Report the failed page instead of silently parsing an error body.
            print("Skipping " + page_url + ": HTTP " + str(response.status_code))
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        books = soup.find_all(class_="content-content-wrapper")

        for book in books:
            # find() returns None when nothing matches, so an entry missing any
            # of these elements raises AttributeError on .get_text().
            title = book.find(class_="views-field views-field-field-reference-review-ent-prod result-title").get_text()
            description = book.find(class_="views-field views-field-field-one-liner one-liner").get_text()
            # Drop the " By " prefix and trailing whitespace from the author text.
            author = book.find(class_="views-field views-field-field-term-book-authors review-supplemental").get_text().replace(" By ", "").rstrip()
            age = book.find(class_="csm-green-age").get_text()
            csv_writer.writerow([title, description, author, age])

[Get Book Covers](https://towardsdatascience.com/web-scraping-using-beautifulsoup-edd9441ba734)

In [17]:
# For every cover wrapper on the parsed page, collect the <img> tags nested inside it.
# `covers` ends up as a list with one list of <img> tags per wrapper.
cover_wrappers = soup.find_all(class_="field-content review-product-image")
covers = [wrapper.find_all("img") for wrapper in cover_wrappers]

In [18]:
# Read the title and image URL from the second <img> (index 1) of each wrapper.
# .get() yields None when the attribute is absent; a wrapper with fewer than two
# images raises IndexError.
title = [imgs[1].get('title') for imgs in covers]
cover_src = [imgs[1].get('src') for imgs in covers]

In [None]:
# Map each title to its cover URL; a repeated title keeps the last URL seen.
info = {name: src for name, src in zip(title, cover_src)}

In [None]:
# Download every cover in `info` into ./lexile/covers/, one file per book title.
# The file name always ends in .png, whatever format the source image is in.
for k, v in info.items():
    if not k or not v:
        # The <img> had no title or no src attribute: nothing usable to save.
        continue

    # A "/" in the title would be read by open() as a directory separator.
    safe_name = k.replace('/', '-')

    # Download first and open the file only on success, so a failed request
    # does not leave an empty file behind.
    response = requests.get(v, timeout=30)
    if not response.ok:
        print(k)
        continue

    with open('./lexile/covers/' + safe_name + '.png', 'wb') as f:
        f.write(response.content)

In [None]:
# Walk every numbered listing page and save each book cover into ./lexile/covers/,
# one file per book title. The file name always ends in .png, whatever format
# the source image is in.
for page_url in all_pages_list:
    # Separate names for the URL and the response: the loop variable is no
    # longer overwritten by the object fetched from it.
    response = requests.get(page_url, timeout=30)
    if not response.ok:
        # Report the failed page instead of silently parsing an error body.
        print("Skipping " + page_url + ": HTTP " + str(response.status_code))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    covers = soup.find_all(class_="field-content review-product-image")
    covers = [cover.find_all("img") for cover in covers]
    # The second <img> (index 1) of each wrapper supplies the title and image URL.
    title = [cover[1].get('title') for cover in covers]
    cover_src = [cover[1].get('src') for cover in covers]
    info = dict(zip(title, cover_src))

    for k, v in info.items():
        if not k or not v:
            # The <img> had no title or no src attribute: nothing usable to save.
            continue

        # A "/" in the title would be read by open() as a directory separator.
        safe_name = k.replace('/', '-')

        # Download first and open the file only on success, so a failed request
        # does not leave an empty file behind.
        image = requests.get(v, timeout=30)
        if not image.ok:
            print(k)
            continue

        try:
            with open('./lexile/covers/' + safe_name + '.png', 'wb') as f:
                f.write(image.content)
        except FileNotFoundError as err:
            # Kept from the original best-effort behaviour: print the title and move on.
            print(k)


## To do:
- rename the book covers
- clean the age column
- clean the names with a "/" so that we can download them