In [1]:
import pandas as pd
import numpy as np
import csv
from bs4 import BeautifulSoup
import os

In [2]:
file = "lexile/raw/book-reviews/3-willows.html"
with open(file, 'r') as f: 
        soup = BeautifulSoup(f.read(), 'html.parser')

## Get All Data

In [3]:
title = soup.h1.text
description = soup.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
csm_age = soup.find(class_="csm-green-age").get_text().replace("age ", "")[:-1]

In [4]:
print("Title: {}".format(title))
print()
print("Description: {}".format(description))
print()
print("CSM Recommended Age: {}".format(csm_age))

Title: 3 Willows

Description: Sisterhood author writes sweet new book about friendship.

CSM Recommended Age: 12


### Plot Summary

In [5]:
def get_summary(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    plot_sum = soup.find(class_="field field-name-field-what-is-story field-type-text-long field-label-hidden").get_text()
    for charachter in remove_charachters:
        plot_sum = plot_sum.replace(charachter, " ")
    return plot_sum

### Is It Any Good? 

In [6]:
def get_csm_review(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    csm_evaluation = soup.find(class_="field field-name-field-any-good field-type-text-long field-label-hidden").get_text()[:-1]
    for charachter in remove_charachters:
        csm_evaluation = csm_evaluation.replace(charachter, " ")
    return csm_evaluation

### Parent's rating

In [7]:
def parents_rating(soup): 
    try:
        for s in soup.find_all(class_='user-review-statistics adult'):
            adult_rating = s.find("div", attrs={'class': 'stat-wrapper age'})
        return adult_rating.text.replace("age ", "")[:-1]
    except:
        return np.nan
parents_rating(soup)

nan

### Children's rating

In [8]:
def kids_rating(soup):
    try:
        for s in soup.find_all(class_='user-review-statistics child'):
            childs_rating = s.find("div", attrs={'class': 'stat-wrapper age'})
        return childs_rating.text.replace("age ", "")[:-1]
    except:
        return np.nan
kids_rating(soup)

'12'

In [9]:
def pntk(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    pntk = soup.find(class_="field field-name-field-parents-need-to-know field-type-text-long field-label-hidden").get_text()
    for charachter in remove_charachters:
        pntk = pntk.replace(charachter, " ")
    return pntk
pntk(soup)

"Parents need to know that this book from the author of The Sisterhood of the Traveling Pants ultimately delivers positive messages about doing what's right, making good choices, taking chances, and accepting oneself. Along the way, the three young teen protagonists deaI with a borderline eating disorder, an alcoholic parent, a fickle cute boy, and the tribulations of fitting in with the popular crowd. There are crushes, dates, and kisses, but no sex. The book name-drops characters from the author's Traveling Pants books and may appeal most to fans of the series.\n"

In [10]:
path = 'lexile/raw/book-reviews/'
files = os.listdir(path)
files = [path + file for file in files]

In [11]:
titles = []
descriptions = [] 
plot_sums = []
csm_evaluations = []
need_to_know = []
p_rating = []
k_rating = []
csm_ages = []

In [12]:
for file in files:
    try: 
        with open(file, 'r') as f: 
            soup = BeautifulSoup(f.read(), 'html.parser')
            titles.append(soup.h1.text)
            descriptions.append(soup.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text())
            csm_ages.append(soup.find(class_="csm-green-age").get_text().replace("age ", "")[:-1])
            plot_sums.append(get_summary(soup))
            csm_evaluations.append(get_csm_review(soup))
            need_to_know.append(pntk(soup))
            p_rating.append(parents_rating(soup))
            k_rating.append(kids_rating(soup))
    except:
        continue

In [13]:
df = pd.DataFrame({
    'title': titles,
    'description' : descriptions,
    'plot': plot_sums,
    'csm_review' : csm_evaluations,
    'need_to_know' : need_to_know,
    'par_rating' : p_rating,
    'kids_rating' : k_rating,
    'csm_rating' : csm_ages,
})

In [14]:
df.tail()

Unnamed: 0,title,description,plot,csm_review,need_to_know,par_rating,kids_rating,csm_rating
5811,The Fold,Insightful and frank novel explores beauty in ...,Joyce has a cute crush that doesn't know she e...,"An Na has written a beautiful, poignant coming...",Parents need to know that this book deals with...,12.0,11.0,9
5812,Secrets of the Terra-Cotta Soldier,Boy joins a come-to-life clay soldier in excit...,"Ming, a 14-year-old boy in 1970s China, is at ...",SECRETS OF THE TERRA-COTTA SOLDIER is an excit...,Parents need to know that Secrets of the Terra...,,4.0,10
5813,"Fallout: Lois Lane, Book 1","Comics reporter updated as teen in fun, slick ...","As the new girl at Metropolis High, Army brat ...",Author Gwenda Bond has Lois' lingo and attitud...,Parents need to know that Fallout reimagines S...,,12.0,13
5814,The Return of the King,The exciting and bittersweet conclusion of Fro...,The quest to destroy the One Ring of Power bef...,This concluding volume of Tolkien's three-book...,Parents need to know that if kids are ready to...,11.0,11.0,12
5815,The Random House Book of Poetry for Children,Classic will be the most dog-eared on your shelf.,"This treasury of 572 poems, some written but m...","Despite its no-nonsense title, THE RANDOM HOUS...",Parents need to know that this collection of p...,8.0,,5


In [15]:
df.shape

(5816, 8)

In [16]:
df.to_csv('csv/book_info.csv')

## Get All Data from the Summary box

In [17]:
data = []

for file in files:
    with open(file, 'r') as f: 
        soup = BeautifulSoup(f.read(), 'html.parser') 
        summary = soup.find(class_="shutter-summary-pane panel-pane pane-product-details")
        links = summary.find_all('li')
        d = {}
        for link in links:
            k, v = link.text.split(":")
            k = k.strip()
            v = v.strip()
            d[k] = v
        data.append(d)


In [18]:
df2 = pd.DataFrame(data)

In [19]:
df2.head()

Unnamed: 0,Author,Genre,Topics,Book type,Publisher,Publication date,Publisher's recommended age(s),Number of pages,Available on,Last updated,Illustrator,Authors,Awards,Publishers,Award,Illustrators
0,CJ Omololu,Mystery,"Adventures, Brothers and Sisters, Friendship, ...",Fiction,Delacorte Press,"February 24, 2015",12 - 18,336,"Nook, Hardback, iBooks, Kindle","June 19, 2019",,,,,,
1,Beth Kephart,Coming of Age,"Friendship, History, Horses and Farm Animals",Fiction,Philomel,"July 19, 2012",14 - 17,304,"Nook, Hardback, iBooks, Kindle","May 06, 2019",,,,,,
2,Soman Chainani,Fairy Tale,"Magic and Fantasy, Princesses, Fairies, Mermai...",Fiction,HarperCollins Children's Books,"May 14, 2013",8 - 17,496,"Nook, Audiobook (unabridged), Hardback, iBooks...","October 18, 2017",Iacopo Bruno,,,,,
3,Kami Garcia,Science Fiction,"Magic and Fantasy, Adventures, Great Boy Role ...",Fiction,Imprint,"January 3, 2017",14 - 18,320,"Nook, Audiobook (abridged), Hardback, iBooks, ...","June 19, 2019",,,,,,
4,Tonya Bolden,Historical Fiction,"Friendship, History",Fiction,Bloomsbury Children's Books,"May 30, 2017",,240,"Nook, Audiobook (unabridged), Hardback, Kindle","January 18, 2019",,,,,,


In [20]:
df2.shape

(5816, 16)

In [21]:
df2.to_csv('csv/book_info_summary.csv')

In [22]:
df2.tail()

Unnamed: 0,Author,Genre,Topics,Book type,Publisher,Publication date,Publisher's recommended age(s),Number of pages,Available on,Last updated,Illustrator,Authors,Awards,Publishers,Award,Illustrators
5811,An Na,Coming of Age,,Fiction,Putnam Juvenile,"April 10, 2008",9 - 12,192,,"July 12, 2017",,,,,,
5812,,Historical Fiction,"Adventures, Friendship, History, Horses and Fa...",Fiction,Amulet Books,"January 14, 2014",10 - 12,240,"Nook, Hardback, Kindle","June 19, 2019",,"Ying Chang Compestine, Vinson Compestine",,,,
5813,Gwenda Bond,Science Fiction,"Superheroes, Adventures, Great Girl Role Model...",Fiction,Switch Press,"May 1, 2015",12 - 18,304,"Hardback, iBooks, Kindle","June 19, 2019",,,,,,
5814,J.R.R. Tolkien,Fantasy,"Magic and Fantasy, Princesses, Fairies, Mermai...",Fiction,Houghton Mifflin Children's Books,"October 20, 1955",12 - 14,520,,"February 15, 2020",,,,,,
5815,Jack Prelutsky,Poetry,,Non-Fiction,Random House,"September 12, 1983",5 - 7,248,,"July 12, 2017",Arnold Lobel,,,,,


In [23]:
df3 = pd.concat([df, df2], axis=1)

In [24]:
df.shape

(5816, 8)

In [25]:
df2.shape

(5816, 16)

In [26]:
df3.shape

(5816, 24)

In [27]:
df3.head()

Unnamed: 0,title,description,plot,csm_review,need_to_know,par_rating,kids_rating,csm_rating,Author,Genre,...,Publisher's recommended age(s),Number of pages,Available on,Last updated,Illustrator,Authors,Awards,Publishers,Award,Illustrators
0,The Third Twin,Gripping thriller skimps on character developm...,"Twins Ava and Alexa ""Lexi"" Rios live in an aff...","THE THIRD TWIN has an interesting, compelling ...",Parents need to know that The Third Twin is a ...,17.0,14.0,12,CJ Omololu,Mystery,...,12 - 18,336,"Nook, Hardback, iBooks, Kindle","June 19, 2019",,,,,,
1,Small Damages,Luminous story of pregnant teen's summer in Sp...,"It's the summer of 1996, which 18-year-old Ken...",This could well have been a minefield of clich...,Parents need to know that Small Damages is nar...,,14.0,14,Beth Kephart,Coming of Age,...,14 - 17,304,"Nook, Hardback, iBooks, Kindle","May 06, 2019",,,,,,
2,"The School for Good and Evil, Book 1",Fractured fairy tale has plenty of twists for ...,When best friends Sophie and Agatha are stolen...,The School for Good and Evil is no run-of-the-...,Parents need to know that The School for Good ...,11.0,11.0,8,Soman Chainani,Fairy Tale,...,8 - 17,496,"Nook, Audiobook (unabridged), Hardback, iBooks...","October 18, 2017",Iacopo Bruno,,,,,
3,"Agent of Chaos: The X-Files Origins, Book 1","Series pictures Mulder as teen, captures essen...","Set in 1979, AGENT OF CHAOS follows a 17-year-...",Popular TV characters don't always make a smoo...,Parents need to know that Agent of Chaos: The ...,,,13,Kami Garcia,Science Fiction,...,14 - 18,320,"Nook, Audiobook (abridged), Hardback, iBooks, ...","June 19, 2019",,,,,,
4,Crossing Ebenezer Creek,Heartbreaking novel follows freed slaves on Sh...,CROSSING EBENEZER CREEK is a YA novel from awa...,"Beautifully written and poetically rendered, t...",Parents need to know that Crossing Ebenezer Cr...,,,13,Tonya Bolden,Historical Fiction,...,,240,"Nook, Audiobook (unabridged), Hardback, Kindle","January 18, 2019",,,,,,


In [28]:
df3.to_csv('csv/book_info_complete.csv')