# NYRB Classics Color Analysis

### Collect All NYRB Classics into a Dataset

In [2]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [3]:
# Create list of relevant URLs
base_url = 'https://www.nyrb.com/collections/classics'
url_list = ["{}?page={}".format(base_url, str(page)) for page in range(1,10)]

In [234]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    for div in soup_new.find_all("div", class_="product"):
        stack.append(div)

Processing https://www.nyrb.com/collections/classics?page=1




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Processing https://www.nyrb.com/collections/classics?page=2
Processing https://www.nyrb.com/collections/classics?page=3
Processing https://www.nyrb.com/collections/classics?page=4
Processing https://www.nyrb.com/collections/classics?page=5
Processing https://www.nyrb.com/collections/classics?page=6
Processing https://www.nyrb.com/collections/classics?page=7
Processing https://www.nyrb.com/collections/classics?page=8
Processing https://www.nyrb.com/collections/classics?page=9


In [236]:
# Parse bs4 ResultSet for information of interest - returns dictionary of values
def parse_product(product):
    
    # print(product)
    title = product.find("h4").text
    img = product.find("img")['src'][2:]
    detail = product.find("a")['href']
    nyrb_pub_date = pd.to_datetime(product['data-pubdate'])  # use this to filter published Classics from forthcoming Classics 
    
    return {
        "title": title,
        "img": img, 
        "detail": detail,
        "nyrb_pub_date": nyrb_pub_date
    }

In [248]:
# Parse information and add it to a pandas dataframe
all_books = pd.DataFrame()
for i in range(0, len(stack)):
    prod = parse_product(stack[i])
    prod = pd.DataFrame(prod, index=[0])
    all_books = all_books.append(prod)

In [262]:
# Filter to exclude forthcoming publications
books = all_books[all_books['nyrb_pub_date'] <= datetime.today()]
books = books.reset_index()
books = books.drop(columns = ['index'])

In [None]:
# Include Out of Print books
oop = {
    "title" = ['Letty Fox: Her Luck',
               'To the Finland Station',
               'The Diary of a Rapist',
               'The Man Who Watched Trains Go By',
               'The Sorrow Beyond Dreams',
               'Selected Stories of Robert Walser',
               'The Towers of Trebizond'],
    "img" = ['https://images.gr-assets.com/books/1320400476l/132508.jpg',
             'https://images.gr-assets.com/books/1320440378l/694282.jpg',
             'https://i2.wp.com/i4.photobucket.com/albums/y126/paradorlounge/159017094601LZZZZZZZ.jpg', 
             'https://images-na.ssl-images-amazon.com/images/I/41LLjzL%2B%2BML._SX311_BO1,204,203,200_.jpg',
             'https://images-na.ssl-images-amazon.com/images/I/41B238tikhL._SX294_BO1,204,203,200_.jpg',
             'https://images.gr-assets.com/books/1320472249l/160313.jpg',
             'https://images.gr-assets.com/books/1386748970l/192954.jpg'],
    "detail" = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'],
    "nyrb_pub_date" = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']
}

In [263]:
# Peep the dataset
books[0:6]

Unnamed: 0,title,img,detail,nyrb_pub_date
0,Journey into the Mind's Eye,cdn.shopify.com/s/files/1/0726/9203/products/J...,/collections/classics/products/journey-into-th...,2018-07-10
1,Sand,cdn.shopify.com/s/files/1/0726/9203/products/S...,/collections/classics/products/sand,2018-06-12
2,Havoc,cdn.shopify.com/s/files/1/0726/9203/products/H...,/collections/classics/products/havoc,2018-06-12
3,The Seventh Cross,cdn.shopify.com/s/files/1/0726/9203/products/s...,/collections/classics/products/the-seventh-cross,2018-05-22
4,Compulsory Games,cdn.shopify.com/s/files/1/0726/9203/products/9...,/collections/classics/products/compulsory-games,2018-05-08
5,Basic Black with Pearls,cdn.shopify.com/s/files/1/0726/9203/products/B...,/collections/classics/products/basic-black-wit...,2018-04-17


#### Quick Info about NYRB Classics Dataset

In [265]:
print("Number of books: {}".format(len(books)))
print("Publication dates range from {} to {}".format(min(books['nyrb_pub_date']), max(books['nyrb_pub_date'])))

Num of books: 486
Publication dates range from 1999-09-30 00:00:00 to 2018-07-10 00:00:00


### Scrape information from individual book pages

In [437]:
# Create list of relevant URLs from earlier collection
base_url = 'https://www.nyrb.com'
book_urls = [books['detail'][i] for i in range(0, len(books))]
url_list = ["{}{}".format(base_url, url) for url in book_urls]

In [438]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [526]:
# Further parse BeautifulSoup for information of interest - returns dictionary of values
def parse_details(book):
    
    # "book" is a BeautifulSoup object - right after calling BeautifulSoup()
    title = book.find_all("div", class_='span8')[0].find("h1").text
    people = book.find_all("div", class_='span8')[0].find("h2", class_="combined-authors").text.strip()
    isbn = book.find_all("div", class_='description additional')[0].find(class_='variant-sku').text
    more = book.find_all("div", class_='description additional')[0].find("p").text
    tags_mess = book.find_all("div",class_="span8")[0].find("div", class_="tags clearfix").find_all("a")
    
    # parse authors and language string
    author = re.split(",", people)[0][3:]
    try:
        phrase = re.search('translated from the [a-zA-z]{1,10}', people).group(0)
        language = re.split("\s+", phrase)[-1]
    except AttributeError:
        try:
            language = re.search('translated', people).group(0)
        except AttributeError:
            language = 'English'
 
    # parse additional information string
    pages = re.split("\s+", re.split("\s+\s+", more)[2])[1]
    
    # parse tags
    l = []
    for i in range(0, len(tags_mess)):
        l.append(tags_mess[i].text)
        
    tags = ",".join(l)
    
    return {
        "title": title,
        "author": author,
        "isbn": isbn,
        "pages": pages,
        "original_language": language,
        "tags": tags
    }

In [485]:
# Example
parse_details(stack[1])

{'author': 'Wolfgang Herrndorf',
 'isbn': '9781681372013',
 'original_language': 'German',
 'pages': '464',
 'tags': 'Available as E-Book,German Literature,Historical Fiction,Literary Fiction,Suspense & Crime',
 'title': 'Sand'}

In [527]:
# Parse information and add it to a pandas dataframe
all_details = pd.DataFrame()
for i in range(0, len(stack)):
    details = parse_details(stack[i])
    details = pd.DataFrame(details, index=[0])
    all_details = all_details.append(details)

In [486]:
# Peep the dataset
all_details[0:6]

Unnamed: 0,title,author,isbn,pages,original_language,tags
0,Journey into the Mind's EyeFragments of an Aut...,Lesley Blanch,9781681371931,400,,"Available as E-Book,Biography & Memoir,Literat..."
0,Sand,Wolfgang Herrndorf,9781681372013,464,German,"Available as E-Book,German Literature,Historic..."
0,Havoc,Tom Kristensen,9781681372075,528,Danish,"Available as E-Book,International Literature,L..."
0,The Seventh Cross,Anna Seghers,9781681372129,416,German,"Available as E-Book,German Literature,Historic..."
0,Compulsory Games,Robert Aickman,9781681371894,368,,"Available as E-Book,British & Irish Literature..."
0,Basic Black with Pearls,Helen Weinzweig,9781681372167,160,,"Available as E-Book,Literary Fiction,Literatur..."


In [484]:
# need to fix titles
stack[0].find_all("div", class_='span8')[0].find("h1", class_='title')

<h1 class="title" itemprop="name">Journey into the Mind's Eye<span class="subtitle">Fragments of an Autobiography</span></h1>

#### Quick Statistics about NYRB Classics - Details Dataset

In [530]:
print("There are {} books in this dataset that were translated into English".format(str(len(all_details[all_details['original_language'] != 'English']))))

There are 194 books in this dataset that were translated into English


In [529]:
all_details['original_language'].value_counts() # need to fix unknown languages

English    292
French      50
German      31
Name: original_language, dtype: int64

In [None]:
# fix languages, including those not listed (like Maupassant)
# fix title - no subtitles
# fix tags - one tag per column?
# include author gender, country 

#### Find more information from [Publishers Weekly Translation Database](https://www.publishersweekly.com/pw/translation/search/index.html)

NOTE: Check for discrepancies, like Eileen Chang's page (she's not from Chile)

In [4]:
# Find page for NYRB specifically
url = 'https://www.publishersweekly.com/pw/translation/search/index.html?country=&language=&translator_gender=&submit=Search&author=&genre=&author_gender=&submitting=1&translator=&isbn=&title=&publisher=226'
r = requests.get(url)

In [5]:
# Parse page for book titles and authors
soup = BeautifulSoup(r.text, "html5lib")
list = soup.find("table", class_="table table-striped").find_all("tr")

In [6]:
# Collect list of relevant PW urls
pw_urls = [list[i].find("a")['href'] for i in range(0, len(list))]

In [7]:
base_url = 'https://www.publishersweekly.com'
url_list = ["{}{}".format(base_url, url) for url in pw_urls]

In [8]:
url_list[0]

'https://www.publishersweekly.com/pw/translation/search/index.html?record=692'

In [9]:
r = requests.get(url_list[0])

In [10]:
soup = BeautifulSoup(r.text, "html5lib")

In [34]:
nice = [soup.find("table").find_all("tr")[i].text for i in range(1,len(soup.find("table").find_all("tr")))]

In [35]:
len(nice)

12

In [13]:
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [26]:
def pw_parser(book):
    
    # retrieve table with information
    info = [book.find("table").find_all("tr")[i].text for i in range(1,len(book.find("table").find_all("tr")))]
    author = re.split(":", info[0])[1]
    isbn = re.split(":", info[3])[1]
    pw_genre = re.split(":", info[5])[1]
    country = re.split(":", info[7])[1]
    gender = re.split(":", info[8])[1]
    
    return {
        "author": author,
        "isbn": isbn,
        "pw_genre": pw_genre,
        "country": country,
        "gender": gender
    }

In [27]:
pw_parser(stack[10])

{'author': 'Alfred Doblin',
 'country': 'Germany',
 'gender': 'Male',
 'isbn': '9781590179734',
 'pw_genre': 'Fiction'}

In [None]:
# figure out if lengths of nice is the same

In [None]:
for i in range(0, len(stack)):
    nice = [soup.find("table").find_all("tr")[i].text for i in range(1,len(soup.find("table").find_all("tr")))]

In [31]:
def create_nice(book):
    nice = [book.find("table").find_all("tr")[i].text for i in range(1,len(book.find("table").find_all("tr")))]
    return len(nice)

In [36]:
lengths = []
for i in range(0, len(stack)):
    lengths.append(create_nice(stack[i]))

In [37]:
lengths  # LENGTHS ARE NOT ALL THE SAME!! NOT ALL THE SAME INFO!!!! fix parser

[12,
 10,
 10,
 10,
 10,
 10,
 10,
 12,
 11,
 12,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 10,
 13,
 12,
 10,
 10,
 10,
 10,
 10,
 11,
 12,
 10,
 10,
 10,
 10,
 10,
 11,
 10,
 10,
 10,
 10,
 12,
 10,
 11,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 12,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 12,
 11,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10]