# NYRB Classics Color Analysis

### Collect All NYRB Classics into a Dataset

In [337]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [232]:
# Build the listing URLs for pages 1 through 9 of the Classics collection
base_url = 'https://www.nyrb.com/collections/classics'
url_list = [
    "{}?page={}".format(base_url, page_number)
    for page_number in range(1, 10)
]

In [234]:
# Download every listing page and collect the product <div> elements it contains.
# `stack` ends up holding one bs4 element per product, in page order.
stack = []
for url_ in url_list:
    print("Processing {}".format(url_))
    # requests applies no timeout by default, so a stalled server would hang
    # this cell indefinitely; bound each request explicitly.
    r_new = requests.get(url_, timeout=30)
    # Raise on 4xx/5xx instead of silently parsing an error page into zero products.
    r_new.raise_for_status()
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.extend(soup_new.find_all("div", class_="product"))

Processing https://www.nyrb.com/collections/classics?page=1




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Processing https://www.nyrb.com/collections/classics?page=2
Processing https://www.nyrb.com/collections/classics?page=3
Processing https://www.nyrb.com/collections/classics?page=4
Processing https://www.nyrb.com/collections/classics?page=5
Processing https://www.nyrb.com/collections/classics?page=6
Processing https://www.nyrb.com/collections/classics?page=7
Processing https://www.nyrb.com/collections/classics?page=8
Processing https://www.nyrb.com/collections/classics?page=9


In [236]:
# Turn one element of the bs4 ResultSet into a dictionary of the fields of interest
def parse_product(product):
    """Extract the listing fields from a single product element.

    Parameters
    ----------
    product
        An element exposing ``find(tag)`` and ``[attribute]`` lookup
        (here: one product ``<div>`` from the listing page).

    Returns
    -------
    dict
        ``title``  - text of the first ``<h4>``.
        ``img``    - the first ``<img>``'s ``src`` with its first two characters
                     removed (presumably the leading ``//`` of a
                     protocol-relative URL - confirm against the page markup).
        ``detail`` - ``href`` of the first ``<a>``.
        ``nyrb_pub_date`` - the ``data-pubdate`` attribute parsed by
                     ``pd.to_datetime``; used downstream to separate published
                     Classics from forthcoming ones.
    """
    record = {}
    record["title"] = product.find("h4").text

    image_src = product.find("img")['src']
    record["img"] = image_src[2:]

    record["detail"] = product.find("a")['href']
    record["nyrb_pub_date"] = pd.to_datetime(product['data-pubdate'])
    return record

In [248]:
# Parse every scraped product and build the DataFrame in one step.
#
# - Iterates over the whole of `stack`: the previous `range(1, len(stack))`
#   started at index 1 and therefore never parsed stack[0].
# - Builds the frame from a list of records instead of calling
#   DataFrame.append in a loop; that method was removed in pandas 2.0 and
#   copied the whole frame on every iteration.
# - The resulting index is 0..n-1 rather than all zeros.
all_books = pd.DataFrame([parse_product(product) for product in stack])

In [262]:
# Keep only titles whose publication date is not in the future,
# then renumber the rows from zero
already_published = all_books['nyrb_pub_date'] <= datetime.today()
books = all_books[already_published].reset_index(drop=True)

In [None]:
# Include Out of Print books
# Column-oriented record of out-of-print titles: each key maps to a list of
# seven values, one per book, in the same order across keys.
# Dict literals separate key and value with ':'; the previous '=' made this
# cell a SyntaxError.
# NOTE(review): "detail" and "nyrb_pub_date" hold the placeholder string 'NA',
# not a missing-value marker - confirm how these are meant to be merged into
# the datetime-typed `nyrb_pub_date` column.
oop = {
    "title": ['Letty Fox: Her Luck',
              'To the Finland Station',
              'The Diary of a Rapist',
              'The Man Who Watched Trains Go By',
              'The Sorrow Beyond Dreams',
              'Selected Stories of Robert Walser',
              'The Towers of Trebizond'],
    "img": ['https://images.gr-assets.com/books/1320400476l/132508.jpg',
            'https://images.gr-assets.com/books/1320440378l/694282.jpg',
            'https://i2.wp.com/i4.photobucket.com/albums/y126/paradorlounge/159017094601LZZZZZZZ.jpg',
            'https://images-na.ssl-images-amazon.com/images/I/41LLjzL%2B%2BML._SX311_BO1,204,203,200_.jpg',
            'https://images-na.ssl-images-amazon.com/images/I/41B238tikhL._SX294_BO1,204,203,200_.jpg',
            'https://images.gr-assets.com/books/1320472249l/160313.jpg',
            'https://images.gr-assets.com/books/1386748970l/192954.jpg'],
    "detail": ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'],
    "nyrb_pub_date": ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']
}

In [263]:
# Preview the first six rows of the dataset
books.head(6)

Unnamed: 0,title,img,detail,nyrb_pub_date
0,Journey into the Mind's Eye,cdn.shopify.com/s/files/1/0726/9203/products/J...,/collections/classics/products/journey-into-th...,2018-07-10
1,Sand,cdn.shopify.com/s/files/1/0726/9203/products/S...,/collections/classics/products/sand,2018-06-12
2,Havoc,cdn.shopify.com/s/files/1/0726/9203/products/H...,/collections/classics/products/havoc,2018-06-12
3,The Seventh Cross,cdn.shopify.com/s/files/1/0726/9203/products/s...,/collections/classics/products/the-seventh-cross,2018-05-22
4,Compulsory Games,cdn.shopify.com/s/files/1/0726/9203/products/9...,/collections/classics/products/compulsory-games,2018-05-08
5,Basic Black with Pearls,cdn.shopify.com/s/files/1/0726/9203/products/B...,/collections/classics/products/basic-black-wit...,2018-04-17


#### Quick Info about NYRB Classics Dataset

In [265]:
# Report how many books were kept and the span of their publication dates
pub_dates = books['nyrb_pub_date']
earliest, latest = min(pub_dates), max(pub_dates)
print("Number of books: {}".format(len(books)))
print("Publication dates range from {} to {}".format(earliest, latest))

Number of books: 486
Publication dates range from 1999-09-30 00:00:00 to 2018-07-10 00:00:00


### Scrape information from individual book pages

In [277]:
# Fetch a single book's detail page to develop the detail parser against.
# requests applies no timeout by default, so bound the call explicitly.
r = requests.get("https://www.nyrb.com/collections/classics/products/sand", timeout=30)
# Raise on 4xx/5xx rather than parsing an error page as if it were the book page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html5lib")
test = soup.find_all("div",class_="span8")   # this find_all request only captures some of the information needed

In [381]:
def parse_details(book):
    """Extract bibliographic details from a parsed book detail page.

    Parameters
    ----------
    book
        The parsed page - the object returned directly by ``BeautifulSoup()``.

    Returns
    -------
    dict
        ``title``  - text of the ``<h1>`` in the first ``div.span8``.
        ``author`` - first comma-separated segment of the byline with its
                     first three characters removed (presumably the prefix
                     "by " - confirm against the page markup).
        ``isbn``   - text of the ``variant-sku`` element.
        ``pages``  - second whitespace-separated token of the third segment
                     of the additional-information paragraph.
        ``original_language`` - fourth space-separated word of the second
                     byline segment.
        ``tags``   - text of every tag link, joined with commas.

    Raises
    ------
    IndexError
        If a container ``<div>`` is absent, or the byline / additional
        information text has fewer segments than expected.
    AttributeError
        If an expected element inside a container is absent.
    """
    # Look up the two containers once instead of re-running find_all per field.
    header = book.find_all("div", class_="span8")[0]
    additional = book.find_all("div", class_="description additional")[0]

    title = header.find("h1").text
    people = header.find("h2", class_="combined-authors").text.strip()
    isbn = additional.find(class_="variant-sku").text
    more = additional.find("p").text
    tag_links = header.find("div", class_="tags clearfix").find_all("a")

    # parse authors and language string
    # NOTE(review): a byline without a second comma-separated segment of at
    # least four words raises IndexError here - confirm every page has one.
    byline_parts = people.split(",")
    author = byline_parts[0][3:]
    language = byline_parts[1].strip().split(" ")[3]

    # parse additional information string
    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    segments = re.split(r"\s+\s+", more)
    pages = re.split(r"\s+", segments[2])[1]

    # parse tags
    # The previous loop iterated over an undefined name `a`, which only worked
    # when a leftover variable happened to exist in the kernel.
    tags = ",".join(link.text for link in tag_links)

    return {
        "title": title,
        "author": author,
        "isbn": isbn,
        "pages": pages,
        "original_language": language,
        "tags": tags
    }

In [382]:
# Example: run the detail parser on the parsed page held in `soup`
parse_details(soup)

{'author': 'Wolfgang Herrndorf',
 'isbn': '9781681372013',
 'original_language': 'German',
 'pages': '464',
 'tags': 'Available as E-Book,German Literature,Historical Fiction,Literary Fiction,Suspense & Crime',
 'title': 'Sand'}