Code modified from the below two repositories:

https://github.com/suewoon/nyt-api-wrapper/tree/master

https://github.com/susannapaoli/web-scraper-nyt/tree/main

In [1]:
import os
import argparse
import logging
from datetime import datetime, timedelta
from requests import get
import json
import concurrent.futures
import pandas as pd
from pymongo import MongoClient, UpdateOne
from pymongo.errors import PyMongoError
from bs4 import BeautifulSoup
from time import sleep
from random import randint, random
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium_stealth import stealth

In [2]:
# Define the sectors you're interested in



# Define the start and end dates for the 4-year time period
begin_date = "20180101"
end_date = "20231031"

# Replace with your New York Times API key
API_KEY = "your_api_key"
DBNAME = 'nytimes'

# keep track to avoid rate limit
request_count = 0
    
# track api keys
api_key_count = 0

api_key_list = ["your_api_keys"]

dates =[]
ENDPOINT = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'

# initialize Chrome driver and modify it
chromedriver_path = "C:/Users/zacha/projects/capstone/chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")

options.add_argument("--disable-extensions")
options.add_argument("--incognito")
options.add_experimental_option('debuggerAddress', 'localhost:8992')
driver = webdriver.Chrome(options=options, executable_path=chromedriver_path)

stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )


In [3]:
def generate_month_ranges(start_date, end_date):
    current_date = start_date

    while current_date <= end_date:
        # Calculate the last day of the current month
        last_day_of_month = (current_date.replace(day=1) + timedelta(days=31)).replace(day=1) - timedelta(days=1)

        # Yield the start and end dates for the current month
        yield current_date.strftime("%Y%m%d"), last_day_of_month.strftime("%Y%m%d")

        # Move to the next month
        current_date = last_day_of_month + timedelta(days=1)

# Define the start and end dates
start_date = datetime(2018, 1, 1)
end_date = datetime(2023, 10, 31)

# Generate and each month to dates
for start, end in generate_month_ranges(start_date, end_date):
    dates.append([start, end])


In [4]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
class Scraper(object):

    def __init__(self, dbname, collection_name, endpoint):
        self.dbname = dbname
        self.columns = ['_id', 'web_url', 'pub_date', 'document_type',
                        'type_of_material', 'word_count', 'keywords', 'query', 'text']
        self.collection_name = collection_name
        self.endpoint = endpoint
        
    def set_collection_name(self, new_val):
        self.collection_name = new_val
        
    def scrape_and_save(self, params):
        '''
        extract data into pandas dataframe
        @param params: parameters for API
        @return: None
        '''
        
        global request_count
        response = get(self.endpoint, params=params)
        request_count += 1
        response_json = response.json()
        
        # page units are bucketed in 10
        # hits shows the total number of matched articles
        max_pages = int(response_json['response']['meta']['hits']/10) + 1
        
        page = 0
        while page < max_pages:
            
            params['page'] = page
            response = get(self.endpoint, params=params)
            response_json = response.json()
            request_count += 1

            if 'response' in response_json:
                metadata = self.query_metadata(response_json['response'])
                df = pd.DataFrame(metadata, columns=self.columns[:-2])

                web_urls = df['web_url'].values.tolist()
                page_contents_data = self.scrap_pages(web_urls)
                
                
                logger.info(f'page contents data: {page_contents_data}')
                
                
                df[self.columns[-2]] = params['q']
                df[self.columns[-1]] = page_contents_data

                logger.debug(f'Extracted records for page {page + 1}..')

                # write records into mongodb
                client = MongoClient()
                db = client[self.dbname]
                coll = db[self.collection_name]
                
                try:
                    if self.collection_name in db.list_collection_names():
                        logger.info(f'Continue writing records in pre-existing collection for page {page + 1} to DB...')
                        # Update existing document or add new ones
                        records = json.loads(df.T.to_json()).values()
                        req = [UpdateOne({'_id': record['_id']}, {'$set': record}, upsert=True) for record in records]
                        result = coll.bulk_write(req)
                        logger.info(f'Successfully done inserting records for page {page + 1}!')

                    else:
                        # Add to the records if it doesn't exist
                        records = json.loads(df.T.to_json()).values()
                        result = coll.insert_many(records)
                    
                except PyMongoError as e:
                    logger.error(f'Failed to insert records for page {page + 1} of {self.collection_name}:')
                    logger.error(e)
            
            else:
                logger.error(f'Error in API response for page {page + 1}: {response_json}')
                
            page += 1

            if page == max_pages:
                logger.info("ALL DONE!")
                return ValueError("Stop the program!")

    def query_metadata(self, response_json):
        '''
        query metadata
        @param response_json: api response in json format
        @return: a list of metadata row
        '''

        # columns to extract from the response
        selected_column = self.columns[:-2]

        logging.info(f'selected column: {selected_column}')
        
        # extract the data
        metadata = []
        logger.info('Extract metadata using API')
        for row in response_json.get('docs', []):
            selected_row = []
            for item in selected_column:
                selected_row.append(row.get(item, None))
            metadata.append(selected_row)
        logger.info('Finish extracting metadata')
        return metadata

    def scrap_pages(self, web_urls):
        '''
        :loop over response in order to get web_url of each object.
        @param web_urls: web url list to scrap article
        @return: a list of page contents string
        '''
        n_contents = len(web_urls)
        page_contents = []

        logger.info('Scraping pages ... ')
        page_contents = list(page_contents)
        
        logger.info(f'page contents:{page_contents}')
        for i in range(n_contents):
            body_text_str = self.scrap_page(web_urls[i])
            page_contents.append(body_text_str)
        logger.info('Finish scraping pages')
        return page_contents

    def scrap_page(self, web_url):
        '''
        : parse only body text from response json and return a list of body texts
        @param web_url: web_url for parsing html
        @return: a page contents text, string
        '''
        logging.info(f'web url: {web_url}')
        
        driver.get(web_url)
        text = ''
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        paragraph = soup.find_all('p')
        
        for p in paragraph:
            x = p.get_text()
            if x != 'Advertisement' and x != 'Supported by' and x != 'Send any friend a story' and x != 'As a subscriber, you have 10 gift articles to give each month. Anyone can read what you share.' and not x.startswith("By") and not x.startswith("More About") and x != "Check out with card" and x != "©2023 The New York Times Company":
                text += x
                text += " "
        
        sleep(random() + randint(1, 2))
        return text

In [6]:
print(dates)

[['20180101', '20180131'], ['20180201', '20180228'], ['20180301', '20180331'], ['20180401', '20180430'], ['20180501', '20180531'], ['20180601', '20180630'], ['20180701', '20180731'], ['20180801', '20180831'], ['20180901', '20180930'], ['20181001', '20181031'], ['20181101', '20181130'], ['20181201', '20181231'], ['20190101', '20190131'], ['20190201', '20190228'], ['20190301', '20190331'], ['20190401', '20190430'], ['20190501', '20190531'], ['20190601', '20190630'], ['20190701', '20190731'], ['20190801', '20190831'], ['20190901', '20190930'], ['20191001', '20191031'], ['20191101', '20191130'], ['20191201', '20191231'], ['20200101', '20200131'], ['20200201', '20200229'], ['20200301', '20200331'], ['20200401', '20200430'], ['20200501', '20200531'], ['20200601', '20200630'], ['20200701', '20200731'], ['20200801', '20200831'], ['20200901', '20200930'], ['20201001', '20201031'], ['20201101', '20201130'], ['20201201', '20201231'], ['20210101', '20210131'], ['20210201', '20210228'], ['20210301'

In [7]:
begin_date = "20180101"
end_date = "20231031"
sector = "financial"
dates = ['20220701', '20220731']

['20220701', '20220731']


In [9]:
params = {'begin_date': dates[0], 'end_date': dates[1], 'q': sector}
params['api-key'] = API_KEY

response = get(ENDPOINT, params=params)
response_json = response.json()
max_pages_x = int(response_json['response']['meta']['hits']/10) + 1

print(sector, max_pages_x)

financial 45


energy 2861
materials 2332
------------------consumer goods 336
------------------healthcare 181
financial 3658
------------------semiconductors 151
------------------utilities 517
real estate 1901
------------------telecommunications 170
industrial 968
------------------consumer discretionary 18

In [10]:
if __name__ == '__main__':    
    # initialize a Scrpaer instance
    scraper = Scraper(DBNAME, sector, ENDPOINT)
    request_count += 1

    while request_count < 500:
        API_KEY = api_key_list[api_key_count]
        if request_count >= 475:
            api_key_count += 1

            # no more API keys availble
            if api_key_count >= len(api_key_list):
                break 
                
        params = {'begin_date': dates[0], 'end_date': dates[1], 'q': sector}
        params['api-key'] = API_KEY

        x = scraper.scrape_and_save(params)
        if x == ValueError:
            break
        break


INFO:root:selected column: ['_id', 'web_url', 'pub_date', 'document_type', 'type_of_material', 'word_count', 'keywords']
INFO:__main__:Extract metadata using API
INFO:__main__:Finish extracting metadata
INFO:__main__:Scraping pages ... 
INFO:__main__:page contents:[]
INFO:root:web url: https://www.nytimes.com/2022/07/18/business/second-quarter-bank-earnings.html
INFO:root:web url: https://www.nytimes.com/2022/07/14/business/watchdog-absolves-fed-officials.html
INFO:root:web url: https://www.nytimes.com/2022/07/28/business/west-virginia-fossil-fuel-banks.html
INFO:root:web url: https://www.nytimes.com/2022/07/29/business/strong-profits-shaky-economy.html
INFO:root:web url: https://www.nytimes.com/2022/07/27/business/berkshire-hathaway-trident-mortgage-redlining-20-million.html
INFO:root:web url: https://www.nytimes.com/2022/07/27/business/credit-suisse-earnings.html
INFO:root:web url: https://www.nytimes.com/2022/07/27/opinion/italy-draghi-populism.html
INFO:root:web url: https://www.ny

Run 2

In [7]:
# get db client 
import pymongo 
from pymongo import MongoClient
client = client = MongoClient("localhost:27017")
# load to csv file
db = client['nytimes']
collection = db[f'{sector}']
cursor = collection.find()
df = pd.DataFrame(list(cursor))


In [8]:
df.head()

Unnamed: 0,_id,web_url,pub_date,document_type,type_of_material,word_count,keywords,query,text
0,nyt://article/7593cf27-bbe5-5ac7-b692-fe3ad8c8...,https://www.nytimes.com/2018/12/17/business/de...,2018-12-17T21:32:39+0000,article,News,375,"[{'name': 'subject', 'value': 'Health Insuranc...",healthcare,Get the DealBook newsletter to make sense of m...
1,nyt://article/a965ed73-f872-582c-8254-707cbd04...,https://www.nytimes.com/2018/12/07/arts/design...,2018-12-07T21:39:47+0000,article,News,374,"[{'name': 'subject', 'value': 'Art', 'rank': 1...",healthcare,The Smithsonian Institution announced on Thurs...
2,nyt://article/be30d64d-ba67-5061-b50e-8b67922e...,https://www.nytimes.com/2018/10/12/business/ec...,2018-10-12T17:23:12+0000,article,News,1335,"[{'name': 'subject', 'value': 'Renting and Lea...",healthcare,"LOS ANGELES — From pulpits across Los Angeles,..."
3,nyt://article/1d1ce1bd-ca27-55d2-adf7-25221255...,https://www.nytimes.com/2018/06/10/business/de...,2018-06-11T00:22:01+0000,article,News,356,"[{'name': 'subject', 'value': 'Mergers, Acquis...",healthcare,"K.K.R., the private equity giant, is near a de..."
4,nyt://article/c05d980b-d7a5-5060-a96a-df9d36a0...,https://www.nytimes.com/2018/07/16/health/fda-...,2018-07-16T22:00:36+0000,article,News,603,"[{'name': 'subject', 'value': 'Drugs (Pharmace...",healthcare,The Food and Drug Administration has announced...


In [1]:
# save to csv
csv_save_path = f"/Users/zacha/projects/capstone/text_data/{sector}/{sector}.csv"

df.to_csv(csv_save_path, index=False)

NameError: name 'sector' is not defined

In [12]:
client.close()