# NYRB Summary Parsing

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [144]:
def get_summary(url, timeout=30):
    """Download a product page and return its description as one lower-case string.

    The text of the first ``div.description`` inside the first ``div.span8`` is
    taken, everything from the "Praise" heading onwards is dropped, newlines and
    tabs are removed, and the result is stripped and lower-cased.

    Parameters
    ----------
    url : str
        Address of the product page; its last path segment is printed as a
        progress message.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout a single stalled request would block the caller indefinitely.

    Returns
    -------
    str
        The cleaned, lower-cased description text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page has no ``div.span8`` / ``div.description`` element.
    """
    print("Retrieving summary for {}".format(url.rsplit("/", 1)[-1]))
    response = requests.get(url, timeout=timeout)
    # Fail on error pages instead of trying to parse them as product pages.
    response.raise_for_status()
    # Named `soup` so the local does not reuse the `bs4` package name.
    soup = BeautifulSoup(response.text, "html5lib")
    description = soup.find_all("div", class_="span8")[0].find_all("div", class_="description")[0].text
    # Keep only the text that precedes the "Praise" section.
    blurb = description.split("\n        Praise\n", 1)[0]
    return blurb.replace("\n", "").replace("\t", "").strip().lower()

In [145]:
# Example: fetch and clean the summary of a single product page.
get_summary("https://www.nyrb.com/products/sand")

Retrieving summary for sand


'an nyrb classics original                \ufeffjune 2018 selection for the nyrb classics book club.north africa, 1972. while the world is reeling from the massacre of israeli athletes at the munich olympics, a series of mysterious events is playing out in the sahara. four people are murdered in a hippie commune, a suitcase full of money disappears, and a pair of unenthusiastic detectives are assigned to investigate. in the midst of it all, a man with no memory tries to evade his armed pursuers. who are they? what do they want from him? if he could just recall his own identity he might have a chance of working it out. . . .this darkly sophisticated literary thriller, the last novel wolfgang herrndorf completed before his untimely death in 2013, is, in the words of michael maar, “the greatest, grisliest, funniest, and wisest novel of the past decade.” certainly no reader will ever forget it.'

In [146]:
# Second spot check on a different product page.
get_summary("https://www.nyrb.com/products/stoner")

Retrieving summary for stoner


"william stoner is born at the end of the nineteenth century into a dirt-poor missouri farming family. sent to the state university to study agronomy, he instead falls in love with english literature and embraces a scholar’s life, so different from the hardscrabble existence he has known. and yet as the years pass, stoner encounters a succession of disappointments: marriage into a “proper” family estranges him from his parents; his career is stymied; his wife and daughter turn coldly away from him; a transforming experience of new love ends under threat of scandal. driven ever deeper within himself, stoner rediscovers the stoic silence of his forebears and confronts an essential solitude.john williams's\xa0luminous and deeply moving novel is a work of quiet perfection. william stoner emerges from it not only as an archetypal american, but as an unlikely existential hero, standing, like a figure in a painting by edward hopper, in stark relief against an unforgiving world.john williams, 

In [147]:
# Load the book table from disk; its `detail` column is used below to build the product URLs.
books = pd.read_csv("books_imgs.csv")

In [148]:
# Build one absolute product URL per row of `books`.
# The first 21 characters of each `detail` value are dropped -- presumably a
# fixed path prefix such as "/collections/classics"; TODO confirm against the CSV.
base_url = "https://www.nyrb.com"
details = [detail[21:] for detail in books.detail]
urls = [base_url + path for path in details]

In [None]:
# One HTTP request per URL, so this cell is slow for a long list.
summaries = [get_summary(url) for url in urls]

In [150]:
# The slug is the last path segment of each product URL.
slugs = [url.rsplit("/", 1)[-1] for url in urls]

In [151]:
# Pair every slug with its summary. Building the frame directly from the two
# lists names the columns up front and raises if the lists differ in length,
# instead of silently padding the shorter one with NaN.
dat = pd.DataFrame({"slugs": slugs, "summaries": summaries})

In [152]:
# search through summaries to find which ones are noir

In [153]:
# Slugs of every book whose summary contains the substring "noir".
noir = [
    slug
    for slug, summary in zip(dat.slugs, dat.summaries)
    if "noir" in summary
]

In [264]:
# NOTE(review): this appends the slugs a second time as an extra, unnamed column (0);
# the variable name suggests image data was meant to be joined here -- confirm intent.
imgs = pd.concat([dat, pd.DataFrame(slugs)], axis=1)

In [224]:
def search_words(word):
    """Return the slugs of every book whose summary contains `word` as a whole word.

    Parameters
    ----------
    word : str or iterable of str
        A regular-expression fragment to look for (so ``"communism|communist"``
        matches either term), or several such fragments, any one of which may
        match. The search is case-sensitive.

    Returns
    -------
    list
        Values of ``dat.slugs`` for the matching rows, in row order.
    """
    alternatives = word if isinstance(word, str) else "|".join(word)
    # Group the alternatives so both \b anchors apply to every branch; without
    # the group, "a|b" would become r"\ba|b\b" and match inside longer words.
    pattern = re.compile(r"\b(?:{})\b".format(alternatives))
    return [
        slug
        for slug, summary in zip(dat.slugs, dat.summaries)
        if pattern.search(summary)
    ]

In [257]:
# Preview only the first rows rather than dumping the whole frame.
dat.head(10)

Unnamed: 0,slugs,summaries
0,journey-into-the-minds-eye,july 2018 selection for the nyrb classics book...
1,sand,an nyrb classics original ﻿june...
2,havoc,ole jastrau is the very model of an enterprisi...
3,the-seventh-cross,an nyrb classics original the s...
4,compulsory-games,an nyrb classics original may 2...
5,basic-black-with-pearls,april 2018 selection for the nyrb classics boo...
6,the-life-and-opinions-of-zacharias-lichter,an nyrb classics original march...
7,jigsaw,sybille bedford placed the ambiguous and inesc...
8,ivory-pearl,an nyrb classics original out o...
9,kolyma-stories,an nyrb classics original in 19...


In [225]:
# Slugs of books whose summary contains "noir" as a whole word.
search_words("noir")

['in-a-lonely-place',
 'the-expendable-man',
 'the-big-clock',
 'short-letter-long-farewell',
 'clark-giffords-body']

In [229]:
# Whole-phrase search: the trailing word boundary keeps "world war ii" from matching.
search_words("world war i")

['schlump',
 'the-end-of-the-hunt',
 'grand-hotel',
 'the-world-as-i-found-it',
 'the-unrest-cure-and-other-stories',
 'the-snows-of-yesteryear',
 'the-post-office-girl',
 'the-fox-in-the-attic',
 'soul-of-wood',
 'my-century',
 'moravagine',
 'the-ermine-of-czernopol',
 'schoolboys-diary-and-other-stories',
 'a_legacy']

In [233]:
# Slugs of books whose summary mentions "mussolini".
search_words("mussolini")

['the-communist-1', 'the-moro-affair', 'kaputt', 'a_family_lexicon']

In [241]:
# Slugs of books whose summary mentions "mexico".
search_words("mexico")

['a-visit-to-don-otavio',
 'unforgiving-years',
 'memoirs-of-a-revolutionary',
 'journey-into-the-past']

In [184]:
# Keep the matches for "germany" in a variable and display them.
germany = search_words("germany")
germany

['the-seventh-cross',
 'all-for-nothing',
 'the-farm-in-the-green-mountains',
 'when-the-world-spoke-french',
 'unforgiving-years',
 'transit',
 'the-wooden-shepherdess',
 'the-goshawk',
 'soul-of-wood',
 'life-and-fate',
 'journey-into-the-past',
 'irretrievable',
 'going-to-the-dogs',
 'diary-of-a-man-in-despair',
 'a_legacy']

In [187]:
# NOTE(review): same call as an earlier cell, but with a longer recorded output and a
# lower execution count -- presumably run before whole-word matching was added, so
# "world war ii" summaries were included. Re-run to refresh.
search_words("world war i")

['ivory-pearl',
 'nothing',
 'schlump',
 'izas-ballad',
 'the-end-of-the-hunt',
 'grand-hotel',
 'paris-vagabond',
 'more-was-lost',
 'houses',
 'war-and-the-iliad',
 'the-world-as-i-found-it',
 'the-use-of-man',
 'the-unrest-cure-and-other-stories',
 'the-three-christs-of-ypsilanti',
 'the-snows-of-yesteryear',
 'the-slaves-of-solitude',
 'the-singapore-grip',
 'the-siege-of-krishnapur',
 'the-selected-works-of-cesare-pavese',
 'the_prince_of_minor_writers_the_selected_essays_of_max_beerbohm',
 'the-post-office-girl',
 'the_little_town_where_time_stood_still',
 'the-gallery',
 'the-fox-in-the-attic',
 'the-complete-fiction',
 'the-book-of-ebenezer-le-page',
 'soul-of-wood',
 'school-for-love',
 'pedigree',
 'niki',
 'names-on-the-land',
 'my-century',
 'moravagine',
 'life-and-fate',
 'kaputt',
 'boredom',
 'the-ermine-of-czernopol',
 'a-time-to-keep-silence',
 'schoolboys-diary-and-other-stories',
 'a_legacy']

In [250]:
# "|" is regex alternation, so this looks for either term.
search_words("communism|communist")

['havoc',
 'the-kremlin-ball',
 'the-communist-1',
 'izas-ballad',
 'houses',
 'a-memoir-of-the-warsaw-uprising',
 'the-book-of-blam',
 'unforgiving-years',
 'the_little_town_where_time_stood_still',
 'the-door',
 'peking-story',
 'niki',
 'memories-of-the-future',
 'letters-from-russia',
 'ice-trilogy']

In [255]:
# Slugs of books whose summary mentions "soviet".
search_words("soviet")

['kolyma-stories',
 'the-kremlin-ball',
 'earthly-signs',
 'katalin-street',
 'a-memoir-of-the-warsaw-uprising',
 'the-queue',
 'the-letter-killers-club',
 'the-case-of-comrade-tulayev',
 'soul',
 'my-century',
 'moura',
 'memories-of-the-future',
 'memoirs-of-a-revolutionary',
 'life-and-fate',
 'happy-moscow',
 'everything-flows',
 'envy',
 'an-armenian-sketchbook']

Candidate search terms:
- world war i/ii -- mussolini, etc
- france/germany/etc 
- feminist/woman/women 
- prison/crime
- colonialism/colonial/colony
- love/sex
- europe/asia/south america/etc
- philosophers/philosophy/religion


In [169]:
# NOTE(review): search_words already wraps its argument in \b...\b, so the explicit
# anchors here are redundant; the empty recorded output presumably comes from an
# earlier version of the function -- re-run to confirm.
search_words(r"\bworld war i\b")

[]

In [193]:
# In an ordinary string literal "\b" is a backspace character (\x08), so the
# pattern could never match; the raw string makes it the regex word-boundary anchor.
boundary_match = re.search(r"\bin\b", "things in")
boundary_match

In [190]:
# str.find is a plain substring search: "in" is found inside "things" at index 2,
# which is why whole-word matching needs a regex with word boundaries.
"things".find("in")

2

In [211]:
# Sanity check: with a raw-string template the \b anchors are real word
# boundaries and the phrase is found in the sample text.
if re.search(r"\b{}\b".format("world war ii"), "world war ii and world war i"):
    print("yes")

yes
