# NY TIMES scraping

Source page: <https://www.nytimes.com/books/best-sellers/>

## 1. Import libraries

In [1]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup as bs
import lxml
import re

## 2. Set base url

In [2]:
# Base address of the NYT best-sellers pages; a dated list lives at
# <base><YYYY/MM/DD>/.
BASE_URL = "https://www.nytimes.com/books/best-sellers/"
# Publication date of the list to scrape -- change this to target another week.
LIST_DATE = "2020/06/21"
url = BASE_URL + LIST_DATE + "/"

## 3. Make a request 

In [3]:
# Fetch the page. requests applies no timeout by default, so set one
# explicitly to keep a stalled connection from hanging the notebook.
res = rq.get(url, timeout=10)
# Fail here on a 4xx/5xx answer instead of letting later cells parse an
# error page and silently produce empty lists.
res.raise_for_status()
# Display the status code (200 = OK).
res.status_code

200

## 4. Create our soup object

In [4]:
# Parse the downloaded HTML with the lxml parser.
lasoupe = bs(res.text, features="lxml")
# Show the page <title> as a quick check that the expected page was fetched.
lasoupe.title.get_text()

'Best Sellers - June 21, 2020 - The New York Times'

## 5. Find the data

- [x] find the sections (genres and types of books)
- [x] for each section find the top books
    - [x] position (implicit: rows keep the page order, five books per section)
    - [x] title
    - [x] author
    - [x] how many weeks in the list
    - [x] synopsis
    - [x] cover

In [5]:
section_class = "css-nzgijy"
# Repeat each section name five times so this list has one entry per book and
# can sit next to the per-book lists in the DataFrame built later.
# NOTE(review): assumes every section on the page shows exactly five books.
sections = [
    section_link.text
    for section_link in lasoupe.find_all("a", class_=section_class)
    for _ in range(5)
]

In [6]:
title_class = "css-i1z3c1"

# A "word" is a run of letters that may contain apostrophes (straight or
# typographic) between letters, so "DOG'S" is capitalized as a single unit.
_WORD_RE = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)*")


def _title_case(text):
    """Return ``text`` with the first letter of every word capitalized.

    Unlike ``str.title()``, an apostrophe is not treated as a word boundary,
    so "THE HANDMAID'S TALE" becomes "The Handmaid's Tale" rather than
    "The Handmaid'S Tale". Hyphens, digits and spaces still separate words.
    """
    return _WORD_RE.sub(lambda match: match.group(0).capitalize(), text)


titles = [_title_case(x.text) for x in lasoupe.find_all("h3", title_class)]

In [7]:
author_class = "css-1nxjbfc"
# Strip only the leading "by " byline. An unanchored "by " pattern would also
# eat the same letters inside names (e.g. "by Abby Jimenez" -> "AbJimenez").
authors = [
    re.sub(r"^\s*by\s+", "", x.text)
    for x in lasoupe.find_all("p", author_class)
]

In [8]:
synopsis_class = "css-5yxv3r"
# Keep each description's text; an empty description is stored as the
# placeholder string "null".
synopsis = [
    paragraph.text or "null"
    for paragraph in lasoupe.find_all("p", class_=synopsis_class)
]

In [9]:
cover_class = "css-35otwa"
# Collect the image URL (the "src" attribute) of every cover <img>.
covers = [
    image["src"]
    for image in lasoupe.find_all("img", class_=cover_class)
]

In [10]:
week_class = "css-t7cods"
# Pattern for the digits in the "weeks on the list" paragraph.
n_weeks = r"(\d{1,4})"
# Join every digit group found in the paragraph; a paragraph with no digits
# yields an empty string.
weeks = [
    "".join(re.findall(n_weeks, paragraph.text))
    for paragraph in lasoupe.find_all("p", class_=week_class)
]

## 6. Create the Data Frame object

In [11]:
# Assemble one row per book from the parallel lists scraped above
# (pandas raises ValueError if the lists do not all have the same length).
data = pd.DataFrame({
    "title": titles,
    "author": authors,
    "synopsis": synopsis,
    "cover_url": covers,
    "common_genre": sections,
    "weeks_in_ranking": weeks})

# An empty weeks string means no number was found in the ranking paragraph;
# count it as 1 week. The replacement is limited to this column so an empty
# title, author or cover URL is never turned into 1, and the column is cast
# to int so it holds one consistent numeric type.
data["weeks_in_ranking"] = data["weeks_in_ranking"].replace("", "1").astype(int)
data.head()

Unnamed: 0,title,author,synopsis,cover_url,common_genre,weeks_in_ranking
0,The Vanishing Half,Brit Bennett,The lives of twin sisters who run away from a ...,https://s1.nyt.com/du/books/images/97805255362...,Combined Print & E-Book Fiction,1
1,The Guest List,Lucy Foley,A wedding between a TV star and a magazine pub...,https://s1.nyt.com/du/books/images/97800628689...,Combined Print & E-Book Fiction,1
2,Where The Crawdads Sing,Delia Owens,In a quiet town on the North Carolina coast in...,https://s1.nyt.com/du/books/images/97807352190...,Combined Print & E-Book Fiction,91
3,The Lies That Bind,Emily Giffin,When the new man in her life disappears on 9/1...,https://s1.nyt.com/du/books/images/97803991789...,Combined Print & E-Book Fiction,1
4,Hideaway,Nora Roberts,"A child star escapes her abductors, gathers he...",https://s1.nyt.com/du/books/images/97812502071...,Combined Print & E-Book Fiction,2


## 7. Export csv file  

In [12]:
# Write the scraped table to disk; index=False leaves the row index out.
data.to_csv(path_or_buf="../data/ny_times.csv", index=False)

## 8. Import csv file

In [13]:
# Read the exported file back and show its first five rows as a sanity check.
pd.read_csv(filepath_or_buffer="../data/ny_times.csv").head(5)

Unnamed: 0,title,author,synopsis,cover_url,common_genre,weeks_in_ranking
0,The Vanishing Half,Brit Bennett,The lives of twin sisters who run away from a ...,https://s1.nyt.com/du/books/images/97805255362...,Combined Print & E-Book Fiction,1
1,The Guest List,Lucy Foley,A wedding between a TV star and a magazine pub...,https://s1.nyt.com/du/books/images/97800628689...,Combined Print & E-Book Fiction,1
2,Where The Crawdads Sing,Delia Owens,In a quiet town on the North Carolina coast in...,https://s1.nyt.com/du/books/images/97807352190...,Combined Print & E-Book Fiction,91
3,The Lies That Bind,Emily Giffin,When the new man in her life disappears on 9/1...,https://s1.nyt.com/du/books/images/97803991789...,Combined Print & E-Book Fiction,1
4,Hideaway,Nora Roberts,"A child star escapes her abductors, gathers he...",https://s1.nyt.com/du/books/images/97812502071...,Combined Print & E-Book Fiction,2
