In [1]:
import pandas as pd
import csv
from bs4 import BeautifulSoup
import os

In [2]:
file = "lexile/raw/book-reviews/3-willows.html"
with open(file, 'r') as f: 
        soup = BeautifulSoup(f.read(), 'html.parser')

## Get All Data

In [3]:
title = soup.h1.text
description = soup.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text()
csm_age = soup.find(class_="csm-green-age").get_text().replace("age ", "")[:-1]

In [4]:
print("Title: {}".format(title))
print()
print("Description: {}".format(description))
print()
print("CSM Recommended Age: {}".format(csm_age))

Title: 3 Willows

Description: Sisterhood author writes sweet new book about friendship.

CSM Recommended Age: 12


### Plot Summary

In [5]:
def get_summary(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    plot_sum = soup.find(class_="field field-name-field-what-is-story field-type-text-long field-label-hidden").get_text()
    for charachter in remove_charachters:
        plot_sum = plot_sum.replace(charachter, " ")
    return plot_sum

### Is It Any Good? 

In [6]:
def get_csm_review(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    csm_evaluation = soup.find(class_="field field-name-field-any-good field-type-text-long field-label-hidden").get_text()[:-1]
    for charachter in remove_charachters:
        csm_evaluation = csm_evaluation.replace(charachter, " ")
    return csm_evaluation

### Parent's rating

In [7]:
def parents_rating(soup): 
    try:
        for s in soup.find_all(class_='user-review-statistics adult'):
            adult_rating = s.find("div", attrs={'class': 'stat-wrapper age'})
        return adult_rating.text.replace("age ", "")[:-1]
    except:
        return "0"
parents_rating(soup)

'0'

### Children's rating

In [8]:
def kids_rating(soup):
    try:
        for s in soup.find_all(class_='user-review-statistics child'):
            childs_rating = s.find("div", attrs={'class': 'stat-wrapper age'})
        return childs_rating.text.replace("age ", "")[:-1]
    except:
        return '0'
kids_rating(soup)

'12'

In [9]:
def pntk(soup):
    remove_charachters = ["\xa0\n", "\xa0a", "\xa0"]
    pntk = soup.find(class_="field field-name-field-parents-need-to-know field-type-text-long field-label-hidden").get_text()
    for charachter in remove_charachters:
        pntk = pntk.replace(charachter, " ")
    return pntk
pntk(soup)

"Parents need to know that this book from the author of The Sisterhood of the Traveling Pants ultimately delivers positive messages about doing what's right, making good choices, taking chances, and accepting oneself. Along the way, the three young teen protagonists deaI with a borderline eating disorder, an alcoholic parent, a fickle cute boy, and the tribulations of fitting in with the popular crowd. There are crushes, dates, and kisses, but no sex. The book name-drops characters from the author's Traveling Pants books and may appeal most to fans of the series.\n"

## Get All Data from the Summary box

In [10]:
summary = soup.find(class_="shutter-summary-pane panel-pane pane-product-details")
links = summary.find_all('li')
data = []
d = {}
for link in links: 
    k, v = link.text.split(":")
    k = k.strip()
    v = v.strip()
    d[k] = v
data.append(d)
data

[{'Author': 'Ann Brashares',
  'Genre': 'Friendship',
  'Book type': 'Fiction',
  'Publisher': 'Delacorte Press',
  'Publication date': 'January 13, 2009',
  "Publisher's recommended age(s)": '13 - 17',
  'Number of pages': '320',
  'Last updated': 'July 13, 2017'}]

In [36]:
for link in links:
    print(link.text)

Author: Ann Brashares
Genre: Friendship
Book type: Fiction
Publisher: Delacorte Press
Publication date: January 13, 2009
Publisher's recommended age(s): 13 - 17
Number of pages: 320
Last updated: July 13, 2017


In [11]:
path = 'lexile/raw/book-reviews/'
files = os.listdir(path)
files = [path + file for file in files]

In [12]:
titles = []
descriptions = [] 
plot_sums = []
csm_evaluations = []
need_to_know = []
p_rating = []
k_rating = []
csm_ages = []

In [13]:
for file in files:
    with open(file, 'r') as f: 
        soup = BeautifulSoup(f.read(), 'html.parser')
        titles.append(soup.h1.text)
        descriptions.append(soup.find(class_='field field-name-field-one-liner field-type-text field-label-hidden').get_text())
        csm_ages.append(soup.find(class_="csm-green-age").get_text().replace("age ", "")[:-1])
        plot_sums.append(get_summary(soup))
        csm_evaluations.append(get_csm_review(soup))
        need_to_know.append(pntk(soup))
        p_rating.append(parents_rating(soup))
        k_rating.append(kids_rating(soup))

In [26]:
df = pd.DataFrame()

In [27]:
df['title'] = titles
df['description']  = descriptions
df['plot'] = plot_sums
df['csm_review'] = csm_evaluations
df['need_to_know'] = need_to_know
df['par_rating'] = p_rating
df['kids_rating'] = k_rating
df['csm_rating'] = csm_ages


In [37]:
df.head()

Unnamed: 0,title,description,plot,csm_review,need_to_know,par_rating,kids_rating,csm_rating
0,The Third Twin,Gripping thriller skimps on character developm...,"Twins Ava and Alexa ""Lexi"" Rios live in an aff...","THE THIRD TWIN has an interesting, compelling ...",Parents need to know that The Third Twin is a ...,17,14,12
1,Small Damages,Luminous story of pregnant teen's summer in Sp...,"It's the summer of 1996, which 18-year-old Ken...",This could well have been a minefield of clich...,Parents need to know that Small Damages is nar...,0,14,14
2,"The School for Good and Evil, Book 1",Fractured fairy tale has plenty of twists for ...,When best friends Sophie and Agatha are stolen...,The School for Good and Evil is no run-of-the-...,Parents need to know that The School for Good ...,11,11,8
3,"Agent of Chaos: The X-Files Origins, Book 1","Series pictures Mulder as teen, captures essen...","Set in 1979, AGENT OF CHAOS follows a 17-year-...",Popular TV characters don't always make a smoo...,Parents need to know that Agent of Chaos: The ...,0,0,13
4,Crossing Ebenezer Creek,Heartbreaking novel follows freed slaves on Sh...,CROSSING EBENEZER CREEK is a YA novel from awa...,"Beautifully written and poetically rendered, t...",Parents need to know that Crossing Ebenezer Cr...,0,0,13


In [39]:
df.to_csv('book_info.csv', index=False )