In [1]:
import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

In [5]:

def _write_json(path, payload):
    """Write ``payload`` to ``path`` as JSON with two-space indentation."""
    with open(path, 'w') as f:
        print("Writing to", path)

        # re-serialize it for pretty indentation
        f.write(json.dumps(payload, indent=2))


def get_news_by_date_range(query_terms, query_fields, show_fields, start_date, end_date, subdir):
    """Download Guardian search results one day at a time and cache them on disk.

    For every day from ``start_date`` to ``end_date`` (both inclusive) the
    search endpoint is queried page by page, and two JSON files are written
    under ``data/guardian/<subdir>/``:

    * ``YYYY-MM-DD.json``   -- list of the non-empty ``headline`` fields
    * ``YYYY-MM-DD-b.json`` -- list of the non-empty ``body`` fields

    A day is skipped only when BOTH of its files already exist, so a day whose
    body file went missing is fetched again.

    Parameters
    ----------
    query_terms : str
        Sent as the ``q`` request parameter.
    query_fields : str
        Sent as the ``query-fields`` request parameter.
    show_fields : str
        Sent as the ``show-fields`` request parameter. It has to include
        ``headline`` and ``body`` for anything to be collected.
    start_date, end_date : datetime.date
        First and last day to download. Nothing happens if ``end_date`` is
        before ``start_date``.
    subdir : str
        Name of the directory created below ``data/guardian``.

    Raises
    ------
    OSError
        If ``creds_guardian.txt`` cannot be read or an output file cannot be
        written.
    requests.HTTPError
        If the API answers with an error status (raised by
        ``raise_for_status``).
    """
    REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the run

    ARTICLES_DIR = join('data', 'guardian', subdir)
    makedirs(ARTICLES_DIR, exist_ok=True)

    # Read the key through a context manager so the file handle is closed.
    with open("creds_guardian.txt") as creds:
        MY_API_KEY = creds.read().strip()

    # HTTPS, because the API key travels in the query string.
    API_ENDPOINT = 'https://content.guardianapis.com/search'
    base_params = {
        'order-by': "newest",
        'show-fields': show_fields,
        'q': query_terms,
        'query-fields': query_fields,
        # NOTE(review): `sectionName` looks like a response field name; the
        # documented filter parameter appears to be `section`. Kept unchanged
        # so the downloaded data does not silently change -- TODO confirm.
        'sectionName': 'politics',
        'page-size': 200,
        'api-key': MY_API_KEY
    }

    # day iteration from here:
    # http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
    for daycount in range((end_date - start_date).days + 1):
        dt = start_date + timedelta(days=daycount)
        datestr = dt.strftime('%Y-%m-%d')
        fname_h = join(ARTICLES_DIR, datestr + '.json')
        fname_b = join(ARTICLES_DIR, datestr + '-b.json')

        # Only a complete pair of files counts as "already downloaded".
        if exists(fname_h) and exists(fname_b):
            continue

        print("Downloading", datestr)
        all_results_h = []
        all_results_b = []

        day_params = dict(base_params)
        day_params['from-date'] = datestr
        day_params['to-date'] = datestr

        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            day_params['page'] = current_page
            resp = requests.get(API_ENDPOINT, params=day_params, timeout=REQUEST_TIMEOUT)
            # Fail with the HTTP error instead of a KeyError on the payload below.
            resp.raise_for_status()
            response = resp.json()['response']

            for result in response['results']:
                # A result may lack `fields`, or either of the two fields.
                fields = result.get('fields') or {}
                headline = fields.get('headline')
                if headline:
                    all_results_h.append(headline)
                body = fields.get('body')
                if body:
                    all_results_b.append(body)

            # the first response tells us whether there is more than one page
            current_page += 1
            total_pages = response['pages']

        _write_json(fname_h, all_results_h)
        _write_json(fname_b, all_results_b)

In [7]:
# Same search for every election cycle; only the date window and the output
# sub-directory change. Rows are listed in the order they are downloaded.
ELECTION_QUERY = 'Presidential AND Election'
ELECTION_QUERY_FIELDS = 'headline,body'
ELECTION_SHOW_FIELDS = 'headline,body,sectionName'

ELECTION_WINDOWS = [
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 7), date(2020, 6, 7), '2020'),
]

for window_start, window_end, window_dir in ELECTION_WINDOWS:
    get_news_by_date_range(ELECTION_QUERY, ELECTION_QUERY_FIELDS, ELECTION_SHOW_FIELDS,
                           window_start, window_end, window_dir)

Downloading 2008-08-04
...page 1
Writing to data/guardian/2008/2008-08-04.json
Writing to data/guardian/2008/2008-08-04-b.json
Downloading 2008-08-05
...page 1
Writing to data/guardian/2008/2008-08-05.json
Writing to data/guardian/2008/2008-08-05-b.json
Downloading 2008-08-06
...page 1
Writing to data/guardian/2008/2008-08-06.json
Writing to data/guardian/2008/2008-08-06-b.json
Downloading 2008-08-07
...page 1
Writing to data/guardian/2008/2008-08-07.json
Writing to data/guardian/2008/2008-08-07-b.json
Downloading 2008-08-08
...page 1
Writing to data/guardian/2008/2008-08-08.json
Writing to data/guardian/2008/2008-08-08-b.json
Downloading 2008-08-09
...page 1
Writing to data/guardian/2008/2008-08-09.json
Writing to data/guardian/2008/2008-08-09-b.json
Downloading 2008-08-10
...page 1
Writing to data/guardian/2008/2008-08-10.json
Writing to data/guardian/2008/2008-08-10-b.json
Downloading 2008-08-11
...page 1
Writing to data/guardian/2008/2008-08-11.json
Writing to data/guardian/2008/20

Writing to data/guardian/2008/2008-10-08.json
Writing to data/guardian/2008/2008-10-08-b.json
Downloading 2008-10-09
...page 1
Writing to data/guardian/2008/2008-10-09.json
Writing to data/guardian/2008/2008-10-09-b.json
Downloading 2008-10-10
...page 1
Writing to data/guardian/2008/2008-10-10.json
Writing to data/guardian/2008/2008-10-10-b.json
Downloading 2008-10-11
...page 1
Writing to data/guardian/2008/2008-10-11.json
Writing to data/guardian/2008/2008-10-11-b.json
Downloading 2008-10-12
...page 1
Writing to data/guardian/2008/2008-10-12.json
Writing to data/guardian/2008/2008-10-12-b.json
Downloading 2008-10-13
...page 1
Writing to data/guardian/2008/2008-10-13.json
Writing to data/guardian/2008/2008-10-13-b.json
Downloading 2008-10-14
...page 1
Writing to data/guardian/2008/2008-10-14.json
Writing to data/guardian/2008/2008-10-14-b.json
Downloading 2008-10-15
...page 1
Writing to data/guardian/2008/2008-10-15.json
Writing to data/guardian/2008/2008-10-15-b.json
Downloading 2008-

Writing to data/guardian/2004/2004-09-08.json
Writing to data/guardian/2004/2004-09-08-b.json
Downloading 2004-09-09
...page 1
Writing to data/guardian/2004/2004-09-09.json
Writing to data/guardian/2004/2004-09-09-b.json
Downloading 2004-09-10
...page 1
Writing to data/guardian/2004/2004-09-10.json
Writing to data/guardian/2004/2004-09-10-b.json
Downloading 2004-09-11
...page 1
Writing to data/guardian/2004/2004-09-11.json
Writing to data/guardian/2004/2004-09-11-b.json
Downloading 2004-09-12
...page 1
Writing to data/guardian/2004/2004-09-12.json
Writing to data/guardian/2004/2004-09-12-b.json
Downloading 2004-09-13
...page 1
Writing to data/guardian/2004/2004-09-13.json
Writing to data/guardian/2004/2004-09-13-b.json
Downloading 2004-09-14
...page 1
Writing to data/guardian/2004/2004-09-14.json
Writing to data/guardian/2004/2004-09-14-b.json
Downloading 2004-09-15
...page 1
Writing to data/guardian/2004/2004-09-15.json
Writing to data/guardian/2004/2004-09-15-b.json
Downloading 2004-

Writing to data/guardian/2000/2000-08-16.json
Writing to data/guardian/2000/2000-08-16-b.json
Downloading 2000-08-17
...page 1
Writing to data/guardian/2000/2000-08-17.json
Writing to data/guardian/2000/2000-08-17-b.json
Downloading 2000-08-18
...page 1
Writing to data/guardian/2000/2000-08-18.json
Writing to data/guardian/2000/2000-08-18-b.json
Downloading 2000-08-19
...page 1
Writing to data/guardian/2000/2000-08-19.json
Writing to data/guardian/2000/2000-08-19-b.json
Downloading 2000-08-20
...page 1
Writing to data/guardian/2000/2000-08-20.json
Writing to data/guardian/2000/2000-08-20-b.json
Downloading 2000-08-21
...page 1
Writing to data/guardian/2000/2000-08-21.json
Writing to data/guardian/2000/2000-08-21-b.json
Downloading 2000-08-22
...page 1
Writing to data/guardian/2000/2000-08-22.json
Writing to data/guardian/2000/2000-08-22-b.json
Downloading 2000-08-23
...page 1
Writing to data/guardian/2000/2000-08-23.json
Writing to data/guardian/2000/2000-08-23-b.json
Downloading 2000-

Writing to data/guardian/2000/2000-10-20.json
Writing to data/guardian/2000/2000-10-20-b.json
Downloading 2000-10-21
...page 1
Writing to data/guardian/2000/2000-10-21.json
Writing to data/guardian/2000/2000-10-21-b.json
Downloading 2000-10-22
...page 1
Writing to data/guardian/2000/2000-10-22.json
Writing to data/guardian/2000/2000-10-22-b.json
Downloading 2000-10-23
...page 1
Writing to data/guardian/2000/2000-10-23.json
Writing to data/guardian/2000/2000-10-23-b.json
Downloading 2000-10-24
...page 1
Writing to data/guardian/2000/2000-10-24.json
Writing to data/guardian/2000/2000-10-24-b.json
Downloading 2000-10-25
...page 1
Writing to data/guardian/2000/2000-10-25.json
Writing to data/guardian/2000/2000-10-25-b.json
Downloading 2000-10-26
...page 1
Writing to data/guardian/2000/2000-10-26.json
Writing to data/guardian/2000/2000-10-26-b.json
Downloading 2000-10-27
...page 1
Writing to data/guardian/2000/2000-10-27.json
Writing to data/guardian/2000/2000-10-27-b.json
Downloading 2000-

Writing to data/guardian/2020/2020-04-22.json
Writing to data/guardian/2020/2020-04-22-b.json
Downloading 2020-04-23
...page 1
Writing to data/guardian/2020/2020-04-23.json
Writing to data/guardian/2020/2020-04-23-b.json
Downloading 2020-04-24
...page 1
Writing to data/guardian/2020/2020-04-24.json
Writing to data/guardian/2020/2020-04-24-b.json
Downloading 2020-04-25
...page 1
Writing to data/guardian/2020/2020-04-25.json
Writing to data/guardian/2020/2020-04-25-b.json
Downloading 2020-04-26
...page 1
Writing to data/guardian/2020/2020-04-26.json
Writing to data/guardian/2020/2020-04-26-b.json
Downloading 2020-04-27
...page 1
Writing to data/guardian/2020/2020-04-27.json
Writing to data/guardian/2020/2020-04-27-b.json
Downloading 2020-04-28
...page 1
Writing to data/guardian/2020/2020-04-28.json
Writing to data/guardian/2020/2020-04-28-b.json
Downloading 2020-04-29
...page 1
Writing to data/guardian/2020/2020-04-29.json
Writing to data/guardian/2020/2020-04-29-b.json
Downloading 2020-