# NYRB Classics Color Analysis

### Collect All NYRB Classics into a Dataset

In [7]:
import re
import requests
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from datetime import datetime

In [8]:
# Create list of relevant URLs
base_url = 'https://www.nyrb.com/collections/classics'
url_list = ["{}?page={}".format(base_url, str(page)) for page in range(1,10)]

In [9]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    for div in soup_new.find_all("div", class_="product"):
        stack.append(div)

Processing https://www.nyrb.com/collections/classics?page=1
Processing https://www.nyrb.com/collections/classics?page=2
Processing https://www.nyrb.com/collections/classics?page=3
Processing https://www.nyrb.com/collections/classics?page=4
Processing https://www.nyrb.com/collections/classics?page=5
Processing https://www.nyrb.com/collections/classics?page=6
Processing https://www.nyrb.com/collections/classics?page=7
Processing https://www.nyrb.com/collections/classics?page=8
Processing https://www.nyrb.com/collections/classics?page=9


In [10]:
# Parse bs4 ResultSet for information of interest - returns dictionary of values
def parse_product(product):
    
    # print(product)
    title = product.find("h4").text
    img = product.find("img")['src'][2:]
    detail = product.find("a")['href']
    nyrb_pub_date = pd.to_datetime(product['data-pubdate'])  # use this to filter published Classics from forthcoming Classics 
    
    return {
        "title": title,
        "img": img, 
        "detail": detail,
        "nyrb_pub_date": nyrb_pub_date
    }

In [11]:
# Parse information and add it to a pandas dataframe
all_books = pd.DataFrame()
for i in range(0, len(stack)):
    prod = parse_product(stack[i])
    prod = pd.DataFrame(prod, index=[0])
    all_books = all_books.append(prod)

In [16]:
stack[0]

<div class="product span2 adaptive-grid" data-pubdate="2019-02-19" id="product-negrophobia">
	
  
	<script>
		var saleAmount = Math.round(100 - 1495 / 0 * 100);
		if (saleAmount <= '10') $('#sale-negrophobia').remove();
	</script>

	<div class="productholder">
	  <div class="image">
		<a href="/collections/classics/products/negrophobia">
		  <img alt="Negrophobia" class="loop-image" src="//cdn.shopify.com/s/files/1/0726/9203/products/james.NEW_large.jpg?v=1532102917"/>
		</a>
	  </div>

	  <div class="details ploop">
		<a class="clearfix" href="/collections/classics/products/negrophobia">
		  <h4 class="title">Negrophobia</h4>
		  
		  <span class="author"><!-- sortauthorhandle: james collection: all --><!-- sortauthorhandle: james collection: ongiri --><!-- sortauthorhandle: james collection: james --><!-- sortauthorhandle: james collection: forthcoming --><!-- sortauthorhandle: james collection: classics --></span>
			
            
        </a>
	  </div>
	</div>
</div>

In [14]:
# Filter to exclude forthcoming publications
books = all_books[all_books['nyrb_pub_date'] <= datetime.today()]
books = books.reset_index()

In [None]:
# Include Out of Print books
oop = {
    "title" = ['Letty Fox: Her Luck',
               'To the Finland Station',
               'The Diary of a Rapist',
               'The Man Who Watched Trains Go By',
               'The Sorrow Beyond Dreams',
               'Selected Stories of Robert Walser',
               'The Towers of Trebizond'],
    "img" = ['https://images.gr-assets.com/books/1320400476l/132508.jpg',
             'https://images.gr-assets.com/books/1320440378l/694282.jpg',
             'https://i2.wp.com/i4.photobucket.com/albums/y126/paradorlounge/159017094601LZZZZZZZ.jpg', 
             'https://images-na.ssl-images-amazon.com/images/I/41LLjzL%2B%2BML._SX311_BO1,204,203,200_.jpg',
             'https://images-na.ssl-images-amazon.com/images/I/41B238tikhL._SX294_BO1,204,203,200_.jpg',
             'https://images.gr-assets.com/books/1320472249l/160313.jpg',
             'https://images.gr-assets.com/books/1386748970l/192954.jpg'],
    "detail" = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'],
    "nyrb_pub_date" = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']
}

In [49]:
# Peep the dataset
books[0:6]

Unnamed: 0,index,detail,img,nyrb_pub_date,title
0,0,/collections/classics/products/journey-into-th...,cdn.shopify.com/s/files/1/0726/9203/products/J...,2018-07-10,Journey into the Mind's Eye
1,0,/collections/classics/products/sand,cdn.shopify.com/s/files/1/0726/9203/products/S...,2018-06-12,Sand
2,0,/collections/classics/products/havoc,cdn.shopify.com/s/files/1/0726/9203/products/H...,2018-06-12,Havoc
3,0,/collections/classics/products/the-seventh-cross,cdn.shopify.com/s/files/1/0726/9203/products/s...,2018-05-22,The Seventh Cross
4,0,/collections/classics/products/compulsory-games,cdn.shopify.com/s/files/1/0726/9203/products/9...,2018-05-08,Compulsory Games
5,0,/collections/classics/products/basic-black-wit...,cdn.shopify.com/s/files/1/0726/9203/products/B...,2018-04-17,Basic Black with Pearls


#### Quick Info about NYRB Classics Dataset

In [265]:
print("Number of books: {}".format(len(books)))
print("Publication dates range from {} to {}".format(min(books['nyrb_pub_date']), max(books['nyrb_pub_date'])))

Num of books: 486
Publication dates range from 1999-09-30 00:00:00 to 2018-07-10 00:00:00


### Scrape information from individual book pages

In [24]:
# Create list of relevant URLs from earlier collection
base_url = 'https://www.nyrb.com'
book_urls = [books['detail'][i] for i in range(0, len(books))]
url_list = ["{}{}".format(base_url, url) for url in book_urls]

In [27]:
# Retrieve information and parse through BeautifulSoup
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [28]:
# Further parse BeautifulSoup for information of interest - returns dictionary of values
def parse_details(book):
    
    # "book" is a BeautifulSoup object - right after calling BeautifulSoup()
    title = book.find_all("div", class_='span8')[0].find("h1").text
    people = book.find_all("div", class_='span8')[0].find("h2", class_="combined-authors").text.strip()
    isbn = book.find_all("div", class_='description additional')[0].find(class_='variant-sku').text
    more = book.find_all("div", class_='description additional')[0].find("p").text
    tags_mess = book.find_all("div",class_="span8")[0].find("div", class_="tags clearfix").find_all("a")
    
    # parse authors and language string
    author = re.split(",", people)[0][3:]
    try:
        phrase = re.search('translated from the [a-zA-z]{1,10}', people).group(0)
        language = re.split("\s+", phrase)[-1]
    except AttributeError:
        try:
            language = re.search('translated', people).group(0)
        except AttributeError:
            language = 'English'
 
    # parse additional information string
    pages = re.split("\s+", re.split("\s+\s+", more)[2])[1]
    
    # parse tags
    l = []
    for i in range(0, len(tags_mess)):
        l.append(tags_mess[i].text)
        
    tags = ",".join(l)
    
    return {
        "title": title,
        "author": author,
        "isbn": isbn,
        "pages": pages,
        "original_language": language,
        "tags": tags
    }

In [615]:
# Example
parse_details(stack[1])

{'author': 'Wolfgang Herrndorf',
 'isbn': '9781681372013',
 'original_language': 'German',
 'pages': '464',
 'tags': 'Available as E-Book,German Literature,Historical Fiction,Literary Fiction,Suspense & Crime',
 'title': 'Sand'}

In [29]:
# Parse information and add it to a pandas dataframe
all_details = pd.DataFrame()
for i in range(0, len(stack)):
    details = parse_details(stack[i])
    details = pd.DataFrame(details, index=[0])
    all_details = all_details.append(details)

In [30]:
# Peep the dataset
all_details = all_details.reset_index()
all_details[0:6]

Unnamed: 0,index,author,isbn,original_language,pages,tags,title
0,0,Lesley Blanch,9781681371931,English,400,"Available as E-Book,Biography & Memoir,Literat...",Journey into the Mind's EyeFragments of an Aut...
1,0,Wolfgang Herrndorf,9781681372013,German,464,"Available as E-Book,German Literature,Historic...",Sand
2,0,Tom Kristensen,9781681372075,Danish,528,"Available as E-Book,International Literature,L...",Havoc
3,0,Anna Seghers,9781681372129,German,416,"Available as E-Book,German Literature,Historic...",The Seventh Cross
4,0,Robert Aickman,9781681371894,English,368,"Available as E-Book,British & Irish Literature...",Compulsory Games
5,0,Helen Weinzweig,9781681372167,English,160,"Available as E-Book,Literary Fiction,Literatur...",Basic Black with Pearls


In [57]:
# For downloading images later
books_urls = pd.concat([all_details, pd.DataFrame(book_urls)], axis=1)
books_urls.rename(columns={list(books_urls)[-1]:'detail'}, inplace=True)
imgs = pd.merge(books_urls, books, on='detail', how='inner')
slugs = [re.split("/", imgs['detail'][i])[-1] for i in range(0, len(imgs))]

In [None]:
# need to fix titles
stack[0].find_all("div", class_='span8')[0].find("h1", class_='title')

In [701]:
# Parse NYRB tags into list with corresponding titles
titles = []
nyrb_tags = []
for tag_list in range(0, len(tags_list)):
    for tag in range(0, len(tags_list[tag_list])):
        nyrb_tags.append([tags_list[tag_list][tag]])
        titles.append(all_details['title'][tag_list])

In [717]:
# Store books and tags information in dataframe
books_n_tags = pd.concat([pd.DataFrame(titles), pd.DataFrame(nyrb_tags)], axis=1)
books_n_tags.columns = ['title', 'nyrb_tag']
books_n_tags[0:6]

Unnamed: 0,title,nyrb_tag
0,Journey into the Mind's EyeFragments of an Aut...,Available as E-Book
1,Journey into the Mind's EyeFragments of an Aut...,Biography & Memoir
2,Journey into the Mind's EyeFragments of an Aut...,Literature in English
3,Sand,Available as E-Book
4,Sand,German Literature
5,Sand,Historical Fiction


In [718]:
pd.DataFrame(all_tags)[0].value_counts()[0:6]

Available as E-Book           322
International Literature      226
Literature in English         203
Literary Fiction              190
British & Irish Literature    116
American Literature           100
Name: 0, dtype: int64

#### Quick Statistics about NYRB Classics - Details Dataset

In [530]:
print("There are {} books in this dataset that were translated into English".format(str(len(all_details[all_details['original_language'] != 'English']))))

There are 194 books in this dataset that were translated into English


In [529]:
all_details['original_language'].value_counts() # need to fix unknown languages

English    292
French      50
German      31
Name: original_language, dtype: int64

In [None]:
# fix languages, including those not listed (like Maupassant)
# fix title - no subtitles
# fix tags - one tag per column?
# include author gender, country 

#### Find more information from [Publishers Weekly Translation Database](https://www.publishersweekly.com/pw/translation/search/index.html)

In [4]:
# Find page for NYRB specifically
url = 'https://www.publishersweekly.com/pw/translation/search/index.html?country=&language=&translator_gender=&submit=Search&author=&genre=&author_gender=&submitting=1&translator=&isbn=&title=&publisher=226'
r = requests.get(url)

In [5]:
# Parse page for book titles and authors
soup = BeautifulSoup(r.text, "html5lib")
list = soup.find("table", class_="table table-striped").find_all("tr")

In [6]:
# Collect list of relevant PW urls from the website itself
pw_urls = [list[i].find("a")['href'] for i in range(0, len(list))]

In [7]:
# Create URL list
base_url = 'https://www.publishersweekly.com'
url_list = ["{}{}".format(base_url, url) for url in pw_urls]

In [564]:
# Make get request and parse with BeautifulSoup for all URLs
stack = []
for url_ in url_list:
    # print("Processing {}".format(url_))
    r_new = requests.get(url_)
    soup_new = BeautifulSoup(r_new.text, "html5lib")
    stack.append(soup_new)

In [565]:
# Further parse BeautifulSoup for information of interest - returns dictionary of values
def pw_parser(book):
    
    # retrieve table with information
    info = [book.find("table").find_all("tr")[i].text for i in range(1,len(book.find("table").find_all("tr")))]
    author = re.split(":", info[0])[1]
    isbn = re.split(":", info[3])[1]
    pw_genre = re.split(":", info[5])[1]
    country = re.split(":", info[7])[1]
    gender = re.split(":", info[8])[1]
    
    return {
        "author": author,
        "isbn": isbn,
        "pw_genre": pw_genre,
        "country": country,
        "gender": gender
    }

In [566]:
# Example
pw_parser(stack[10])

{'author': 'Alfred Doblin',
 'country': 'Germany',
 'gender': 'Male',
 'isbn': '9781590179734',
 'pw_genre': 'Fiction'}

In [568]:
# Parse information and add it to a pandas dataframe
pw_details = pd.DataFrame()
for i in range(0, len(stack)):
    details = pw_parser(stack[i])
    details = pd.DataFrame(details, index=[0])
    pw_details = pw_details.append(details)

In [591]:
# Peep the dataset
pw_details[0:6]

Unnamed: 0,author,isbn,pw_genre,country,gender
0,Szilard Borbely,9781681370545,Poetry,Hungary,Male
0,Emmanuel Bove,9781590178324,Fiction,France,Male
0,Matei Calinescu,9781681371955,Fiction,Romania,Male
0,Eileen Chang,9781681371276,Fiction,Chile,Female
0,Anton Chekhov,9781590178362,Fiction,Russia,Male
0,Gabriel Chevallier,9781590177167,Fiction,France,Male


In [600]:
# Join information from PW and NYRB
more_details = pd.merge(pw_details, all_details, on='isbn', how='inner')
more_details[0:6]

Unnamed: 0,author_x,isbn,pw_genre,country,gender,title,author_y,pages,original_language,tags
0,Emmanuel Bove,9781590178324,Fiction,France,Male,Henri Duchemin and His Shadows,Emmanuel Bove,160,French,"Available as E-Book,International Literature,L..."
1,Matei Calinescu,9781681371955,Fiction,Romania,Male,The Life and Opinions of Zacharias Lichter,Matei Calinescu,160,Romanian,"Available as E-Book,International Literature,L..."
2,Eileen Chang,9781681371276,Fiction,Chile,Female,Little Reunions,Eileen Chang,352,translated,"Asian Literature,Available as E-Book,Historica..."
3,Anton Chekhov,9781590178362,Fiction,Russia,Male,The PrankThe Best of Young Chekhov,Anton Chekhov,168,English,"Available as E-Book,International Literature,L..."
4,Gabriel Chevallier,9781590177167,Fiction,France,Male,FearA Novel of World War I,Gabriel Chevallier,328,French,"Available as E-Book,French Literature,Internat..."
5,Jean-Paul Clebert,9781590179574,Fiction,France,Male,Paris Vagabond,Jean-Paul Clébert,352,English,"Available as E-Book,Biography & Memoir,French ..."


### Download Book Covers

In [69]:
# Download covers into your repo
for i in range(0, len(books_urls)):
    url = "https://{}".format(imgs['img'][i])
    filename = "{}.jpg".format(slugs[i])
    
    urllib.request.urlretrieve(url, filename)

### NEED TO FIX:
- Eileen Chang's page - country should be China?, not Chile
- Proensa volume should be by various authors
- Research and fix "translated" mark in original_language 
- fix titles - no subtitles