In [None]:
import pandas as pd
import numpy as np
import requests
from time import sleep
import random 
from bs4 import BeautifulSoup
from collections import defaultdict
import re
import math

#### Use Beautiful Soup to get the number of articles per year
One of the parameters of an NYTimes API request is `page`. The page number has to be less than 200, so the search needs to be broken down by year.

In [None]:
# Calendar years to collect: 1985 through 2018 (the stop value 2019 is exclusive).
years = np.arange(1985, 2019)

In [None]:
# Define a function that inserts a year into the search URL and returns the
# number of articles found for the search query in that year.
def num_of_pages(year):
    """Return how many NYTimes search results 'gun control' has for ``year``.

    Fetches the public search page restricted to 1 Jan - 31 Dec of ``year``
    and reads the count out of the "Showing N results" status text.

    Parameters
    ----------
    year : int or str
        Four-digit year interpolated into the search URL.

    Returns
    -------
    int
        The result count shown on the page, with thousands separators removed.

    Raises
    ------
    requests.HTTPError
        If the search page responds with an HTTP error status.
    ValueError
        If no "Showing N results" text is found in the status element.
    """
    url = f'https://www.nytimes.com/search/gun%20control/newest/{year}0101/{year}1231'
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    # NOTE(review): this looks like a build-generated CSS class name; presumably
    # it changes when the site is redeployed - verify if the lookup stops matching.
    search_result = soup.find_all(class_='SearchForm-searchStatus--2Z3Tw')
    search_result_str = str(search_result)
    # Accept any count: a single digit, plain digits, or comma-grouped digits
    # of any length. `search` is used so surrounding markup or newlines do not matter.
    found = re.search(r'Showing\s(\d[\d,]*)\sresults', search_result_str)
    if found is None:
        raise ValueError(f'could not find the result count on the search page for {year}')
    return int(found.group(1).replace(',', ''))

In [None]:
# Map each year to its article count. A plain dict is enough here: the
# original defaultdict had no default factory, so it behaved like a dict.
news_per_year = {}
for year in years:
    # Fetch once per year and reuse the value for both the dict and the log
    # line, instead of issuing a second identical HTTP request just to print.
    article_count = num_of_pages(year)
    news_per_year[year] = article_count
    print(year, ': ', article_count)

#### Connect to the NYTimes API to get article metadata

In [None]:
# Endpoint of the NYTimes article search API (v2, JSON) that requests are sent to.
API_ROOT = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Reference page for API keys (going by the constant's name and URL).
API_SIGNUP_PAGE = "http://developer.nytimes.com/docs/reference/keys"

In [None]:
# API keys to pick from when sending requests. The entry below is a
# placeholder - substitute real key(s) locally and do not commit them.
keys = [
    '*my_key*_here*',
]

From 01/01/1985 to date, 19,294 articles came up for the 'gun control' search term.
At 10 articles per search page, this means I have to iterate through 1,930 pages to collect the URLs.

In [None]:
# Define a function that calls the NYTimes API and returns a JSON object with
# the search page results.
# Note: returns metadata about 10 articles (1 search page).
def create_request(start_date, end_date, page_num, query="gun control"):
    """Fetch one page of article-search results and return the decoded JSON.

    Parameters
    ----------
    start_date, end_date : str
        Passed as 'begin_date' / 'end_date'; callers in this notebook use
        'YYYYMMDD' strings.
    page_num : int
        Passed as the 'page' request parameter.
    query : str, optional
        Search term sent as 'q'. Defaults to "gun control", the value that
        was previously hard-coded.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    ValueError
        If the module-level ``keys`` list is empty.
    requests.HTTPError
        If the API answers with an HTTP error status. The error is raised
        here instead of handing an error payload to the parser.
    """
    if not keys:
        raise ValueError('no API keys configured in `keys`')
    # Pick one of the configured keys at random for each request.
    api_key = random.choice(keys)
    resp = requests.get(
        API_ROOT,
        params={
            'api-key': api_key,
            'q': query,
            'begin_date': start_date,
            'end_date': end_date,
            'sort': "newest",
            'page': page_num,
        },
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    resp.raise_for_status()
    return resp.json()

In [None]:
# Smoke test: request page 20 of the 2013 results and show the raw response.
test = create_request(start_date='20130101', end_date='20131231', page_num=20)
print(test)

In [None]:
# Define a function that takes the JSON output and parses data from it.
# This returns article URLs (plus metadata) for further scraping of the articles' text.
def parse_search_res(json_file):
    """Flatten one search response into a list of per-article dicts.

    Parameters
    ----------
    json_file : dict
        Decoded API response; articles are read from
        ``json_file['response']['docs']``.

    Returns
    -------
    list of dict
        One dict per article with the keys 'id', 'headline', 'date', 'score',
        'url' and 'word_count', plus 'snippet', 'source' and 'type' when the
        corresponding field is present in the article. Text values are kept
        as ``str``.

    Raises
    ------
    KeyError
        If the response has no 'response'/'docs' entry, or an article lacks
        one of the always-copied fields.
    """
    news = []
    for doc in json_file['response']['docs']:
        record = {}
        record['id'] = doc['_id']
        # Keep text as str: encoding to UTF-8 bytes here would later be
        # written to CSV as the literal text b'...'.
        record['headline'] = doc['headline']['main']
        # First 10 characters of the publication timestamp
        # (presumably the YYYY-MM-DD part - confirm against the API output).
        record['date'] = doc['pub_date'][0:10]
        record['score'] = doc['score']
        # Optional fields: copy only when available. `.get` also covers a
        # missing 'snippet' key, in line with the membership checks below.
        snippet = doc.get('snippet')
        if snippet is not None:
            record['snippet'] = snippet
        if 'source' in doc:
            record['source'] = doc['source']
        if 'type_of_material' in doc:
            record['type'] = doc['type_of_material']
        record['url'] = doc['web_url']
        record['word_count'] = doc['word_count']
        news.append(record)
    return news

In [None]:
# For every year, page through the API results for that year's date range and
# write the metadata of all articles matching the query to one CSV file per year.
for k, v in news_per_year.items():
    data = []
    start_date = f'{k}0101'
    end_date = f'{k}1231'
    # Upper bound on the pages to request: 10 articles per page, plus one
    # spare page kept from the original code.
    pages = math.ceil(v/10)+1
    for page in range(pages):
        page_records = parse_search_res(create_request(start_date, end_date, page))
        sleep(2)  # throttle between consecutive API calls
        if not page_records:
            # An empty page means the results are exhausted, so stop here
            # rather than requesting further pages.
            break
        data.extend(page_records)
    # `data` is a list of row dicts, which the DataFrame constructor accepts directly.
    df = pd.DataFrame(data)
    df.to_csv(f'nytimes_meta_{k}.csv')
    print('Collected:', k)