In [1]:
import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

In [2]:

def get_news_by_date_range(query_terms, query_fields, show_fields, start_date, end_date, subdir):

    ARTICLES_DIR = join('data', 'guardian', subdir)
    makedirs(ARTICLES_DIR, exist_ok=True)

    MY_API_KEY = open("creds_guardian.txt").read().strip()
    API_ENDPOINT = 'http://content.guardianapis.com/search'
    my_params = {
        'from-date': "",
        'to-date': "",
        'order-by': "newest",
        'show-fields': show_fields,
        'q': query_terms,
        'query-fields': query_fields,
        'sectionName': 'politics',
        'page-size': 200,
        'api-key': MY_API_KEY
    }

    # day iteration from here:
    # http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
    
    dayrange = range((end_date - start_date).days + 1)
    
    for daycount in dayrange:
        dt = start_date + timedelta(days=daycount)
        datestr = dt.strftime('%Y-%m-%d')
        fname_h = join(ARTICLES_DIR, datestr + '.json')
        fname_b = join(ARTICLES_DIR, datestr + '-b.json')
        
        if not exists(fname_h):
            # then let's download it
            print("Downloading", datestr)
            all_results_h = []
            all_results_b = []
            
            my_params['from-date'] = datestr
            my_params['to-date'] = datestr
            current_page = 1
            total_pages = 1
            while current_page <= total_pages:
                print("...page", current_page)
                my_params['page'] = current_page
                resp = requests.get(API_ENDPOINT, my_params)
                data = resp.json()
                #all_results.extend(data['response']['results'])
                for result in data['response']['results']:

                    if(len(result['fields']['headline']) > 0):
                        #print(result['fields'][get_field])
                        all_results_h.append(result['fields']['headline'])
                        
                    if(len(result['fields']['body']) > 0):
                        #print(result['fields'][get_field])
                        all_results_b.append(result['fields']['body'])

                # if there is more than one page
                current_page += 1
                total_pages = data['response']['pages']

            with open(fname_h, 'w') as f:
                print("Writing to", fname_h)

                # re-serialize it for pretty indentation
                f.write(json.dumps(all_results_h, indent=2))
                
            with open(fname_b, 'w') as f:
                print("Writing to", fname_b)

                # re-serialize it for pretty indentation
                f.write(json.dumps(all_results_b, indent=2))

In [7]:
get_news_by_date_range('Presidential AND Election', 'headline,body', 'headline,body,sectionName',
                       date(2012, 8, 8), date(2012, 11, 8), '2016')

get_news_by_date_range('Presidential AND Election', 'headline,body', 'headline,body,sectionName',
                       date(2012, 8, 6), date(2012, 11, 6), '2012')

get_news_by_date_range('Presidential AND Election', 'headline', 'headline,body', 'headline,body,sectionName',
                       date(2008, 8, 4), date(2008, 11, 4), '2008')

get_news_by_date_range('Presidential AND Election', 'headline', 'headline,body', 'headline,body,sectionName',
                       date(2004, 8, 2), date(2004, 11, 2), '2004')

get_news_by_date_range('Presidential AND Election', 'headline,body', 'headline,body,sectionName',
                       date(2012, 8, 6), date(2012, 11, 6), '2012')

get_news_by_date_range('Presidential AND Election', 'headline', 'headline,body', 'headline,body,sectionName',
                       date(2000, 8, 7), date(2000, 11, 7), '2000')

Downloading 2012-08-06
...page 1
Writing to data/guardian/2012/2012-08-06.json
Downloading 2012-08-07
...page 1
Writing to data/guardian/2012/2012-08-07.json
Downloading 2012-08-08
...page 1
Writing to data/guardian/2012/2012-08-08.json
Downloading 2012-08-09
...page 1
Writing to data/guardian/2012/2012-08-09.json
Downloading 2012-08-10
...page 1
Writing to data/guardian/2012/2012-08-10.json
Downloading 2012-08-11
...page 1
Writing to data/guardian/2012/2012-08-11.json
Downloading 2012-08-12
...page 1
Writing to data/guardian/2012/2012-08-12.json
Downloading 2012-08-13
...page 1
Writing to data/guardian/2012/2012-08-13.json
Downloading 2012-08-14
...page 1
Writing to data/guardian/2012/2012-08-14.json
Downloading 2012-08-15
...page 1
Writing to data/guardian/2012/2012-08-15.json
Downloading 2012-08-16
...page 1
Writing to data/guardian/2012/2012-08-16.json
Downloading 2012-08-17
...page 1
Writing to data/guardian/2012/2012-08-17.json
Downloading 2012-08-18
...page 1
Writing to data/gua