# NYRB Classics Color Analysis

### Scrape information from NYRB Classics Pages

In [128]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [232]:
# Create list of relevant URLs
base_url = 'https://www.nyrb.com/collections/classics'
url_list = ["{}?page={}".format(base_url, str(page)) for page in range(1,10)]

In [234]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text)
    for div in soup_new.find_all("div", class_="product"):
        stack.append(div)

Processing https://www.nyrb.com/collections/classics?page=1




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Processing https://www.nyrb.com/collections/classics?page=2
Processing https://www.nyrb.com/collections/classics?page=3
Processing https://www.nyrb.com/collections/classics?page=4
Processing https://www.nyrb.com/collections/classics?page=5
Processing https://www.nyrb.com/collections/classics?page=6
Processing https://www.nyrb.com/collections/classics?page=7
Processing https://www.nyrb.com/collections/classics?page=8
Processing https://www.nyrb.com/collections/classics?page=9


In [236]:
# Parse bs4 ResultSet for information of interest - returns dictionary of values
def parse_product(product):
    
    # print(product)
    title = product.find("h4").text
    img = product.find("img")['src'][2:]
    detail = product.find("a")['href']
    nyrb_pub_date = pd.to_datetime(product['data-pubdate'])  # use this to filter published Classics from forthcoming Classics 
    
    return {
        "title": title,
        "img": img, 
        "detail": detail,
        "nyrb_pub_date": nyrb_pub_date
    }

In [248]:
# Parse information and add it to a pandas dataframe
all_books = pd.DataFrame()
for i in range(1, len(stack)):
    prod = parse_product(stack[i])
    prod = pd.DataFrame(prod, index=[0])
    all_books = all_books.append(prod)

In [250]:
# Filter to exclude forthcoming publications
books = all_books[all_books['nyrb_pub_date'] <= datetime.today()]

In [251]:
# Peep the dataset
books[0:6]

Unnamed: 0,title,img,detail,nyrb_pub_date
0,Journey into the Mind's Eye,cdn.shopify.com/s/files/1/0726/9203/products/J...,/collections/classics/products/journey-into-th...,2018-07-10
0,Sand,cdn.shopify.com/s/files/1/0726/9203/products/S...,/collections/classics/products/sand,2018-06-12
0,Havoc,cdn.shopify.com/s/files/1/0726/9203/products/H...,/collections/classics/products/havoc,2018-06-12
0,The Seventh Cross,cdn.shopify.com/s/files/1/0726/9203/products/s...,/collections/classics/products/the-seventh-cross,2018-05-22
0,Compulsory Games,cdn.shopify.com/s/files/1/0726/9203/products/9...,/collections/classics/products/compulsory-games,2018-05-08
0,Basic Black with Pearls,cdn.shopify.com/s/files/1/0726/9203/products/B...,/collections/classics/products/basic-black-wit...,2018-04-17


#### Statistics about NYRB Classics Dataset

In [252]:
print("Num of books: {}".format(len(books)))

Num of books: 486
