# Collecting "Economy" Articles from NYT API


In [1]:
!pip install nytimesarticle

^C


In [7]:
import time
import urllib.request
import json
from nytimesarticle import articleAPI

In [48]:
api_key='Ap8BsTKvo5PXpvcQFUeEdjocSL6GPRTA'
api = articleAPI(api_key)
trainingFolder = 'C:\\tmp\\'
sampleSize = 5 #Number of pages included. Normally each page contains 10 articles.

def get_nytarticles(sampleSize,query,category,year):
    all_articles = []

    #Set date range (all year) change
    #1Q equal 0101 to 0331
    #2Q equal 0401 to 0630
    #3Q equal 0701 to 0930
    #4Q equal 1001 to 1231
    dt_from = year+"1001" #change for each back for different time period 
    dt_to = year+"1231" #change for each back for different time period
    
    #Set basic URL
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"
    
    #Adding parameters
    url=url+"api-key="+api_key
    url=url+"&fq=news_desk='"+category+"'"
    url=url+"&begin_date="
    url=url+dt_from
    url=url+"&end_date="+dt_to
    url=url+"&sort:newest"
    #print (url)
    
    #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
    for i in range(1,sampleSize+1):
        print ('generating %s batch of articles [%s] ...' % (category,i))
        
        #Adding paging information
        response = urllib.request.urlopen(url+"&page="+str(i))
        articles = json.loads(response.read())
        articles = parse_articles(articles)
        all_articles = all_articles + articles
        
        # Sleep 1 second to avoid "Exceeded Request Quota" error
        time.sleep(6)
    
    return(all_articles)


def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    '''
    news = []
    #print(articles)
    for i in articles['response']['docs']:
        #print(i)
        dic = {}
        dic['id'] = i['_id']
        
        if 'abstract' in i and i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        
        if 'headline' in i and i['headline'] is not None:
            dic['headline'] = i['headline']['main'].encode("utf8")
        
        if 'desk' in i and i['desk'] is not None:
            dic['desk'] = i['news_desk']
        
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        
        if ('section_name' in i):
            dic['section'] = i['section_name']
       
        if 'snippet' in i and ['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        
        if 'source' in i and i['source'] is not None:
            dic['source'] = i['source']

        if 'type_of_material' in i and i['type_of_material'] is not None:
            dic['type'] = i['type_of_material']

        if 'web_url' in i and i['web_url'] is not None:
            dic['url'] = i['web_url']
        
        if ('word_count' in i):
            dic['word_count'] = i['word_count']
        
        # locations
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects   
        news.append(dic)
    return(news) 

def get_articles(sampleSize,query,category,year):
    '''
    This function accepts a year in string format (e.g.'1980')
    and a query (e.g.'Amnesty International') and it will 
    return a list of parsed articles (in dictionaries)
    for that year.
    '''
    all_articles = []
    for i in range(0,sampleSize): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        print ('generating %s block ...' % i)
        articles = api.search(q = query,
                              fq = {'news_desk':category},
                              begin_date = year + '1001', #change for each time period
                              end_date = year + '1231', #change for each time period
                              sort ='newest',
                              page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
        # Sleep 1 second to avoid "Exceeded Request Quota" error
        time.sleep(1)
    return(all_articles)

def testAPI(query,category):
    content = []
    economy =  get_nytarticles(1,query,category,'2013') #change year when applicable
    print (economy)

Testing the API

In [49]:
testAPI("the","economy")

generating economy batch of articles [1] ...
[{'id': 'nyt://article/6f65ee43-6866-5b9e-8a79-67792c6ab306', 'abstract': b'A couple return to the city where they met to be married before old friends and colleagues.', 'headline': b'Love, Blown on Course by a Storm', 'date': '2013-11-22', 'section': 'Fashion & Style', 'snippet': b'A couple return to the city where they met to be married before old friends and colleagues.', 'source': 'The New York Times', 'type': 'News', 'url': 'https://www.nytimes.com/2013/11/24/fashion/weddings/love-blown-on-course-by-a-storm.html', 'word_count': 1577, 'locations': [], 'subjects': ['Weddings and Engagements', 'Dating and Courtship', 'Vows (Times Column)']}, {'id': 'nyt://article/a6cce509-b0a5-5653-99e0-0123023997cc', 'abstract': b'Prime Minister Antonis Samaras, meeting with Chancellor Angela Merkel of Germany, is seeking to ease the terms for securing more rescue funds.', 'headline': b'Greece Says It Is on Track to Repay Bailout', 'date': '2013-11-22', '

Utility Methods to generate training sets

In [50]:
def generateTrainingSet(size,category,year):
    articles = get_nytarticles(size,"the",category,year)
    fileName = "C:\\tmp\\4Q2013training_"+category.lower()  #change file name each time run. Using Quarterly dates.
    f = open(fileName, 'w')
    text = ''
    
    for doc in articles:
        try:
            if 'abstract' in doc:
                # Get the abstract
                text = doc['abstract']
            elif 'snippet' in doc:
                text = doc['snippet']

            #Remove tabs
            text = str(text,'utf-8')
            text=text.replace('\t',' ')
            f.write('%s\n' % text)
        except:
            print("Error Occured: ignoring ...")
    
    f.close()

Generate Training sets for Economy

In [51]:
generateTrainingSet(20,"Economy","2013") #change year based on query

generating Economy batch of articles [1] ...
generating Economy batch of articles [2] ...
generating Economy batch of articles [3] ...
generating Economy batch of articles [4] ...
generating Economy batch of articles [5] ...
generating Economy batch of articles [6] ...
generating Economy batch of articles [7] ...
generating Economy batch of articles [8] ...
generating Economy batch of articles [9] ...
generating Economy batch of articles [10] ...
generating Economy batch of articles [11] ...
generating Economy batch of articles [12] ...
generating Economy batch of articles [13] ...
generating Economy batch of articles [14] ...
generating Economy batch of articles [15] ...
generating Economy batch of articles [16] ...
generating Economy batch of articles [17] ...
generating Economy batch of articles [18] ...
generating Economy batch of articles [19] ...
generating Economy batch of articles [20] ...


In [139]:
#Combining all files together
filenames = "C:\\tmp\\1Q2013training_economy","C:\\tmp\\2Q2013training_economy","C:\\tmp\\3Q2013training_economy","C:\\tmp\\4Q2013training_economy","C:\\tmp\\1Q2014training_economy","C:\\tmp\\2Q2014training_economy","C:\\tmp\\3Q2014training_economy","C:\\tmp\\4Q2014training_economy","C:\\tmp\\1Q2015training_economy","C:\\tmp\\2Q2015training_economy","C:\\tmp\\3Q2015training_economy","C:\\tmp\\4Q2015training_economy","C:\\tmp\\1Q2016training_economy", "C:\\tmp\\2Q2016training_economy","C:\\tmp\\3Q2016training_economy","C:\\tmp\\4Q2016training_economy","C:\\tmp\\1Q2017training_economy","C:\\tmp\\2Q2017training_economy","C:\\tmp\\3Q2017training_economy","C:\\tmp\\4Q2017training_economy","C:\\tmp\\1Q2018training_economy","C:\\tmp\\2Q2018training_economy","C:\\tmp\\3Q2018training_economy","C:\\tmp\\4Q2018training_economy","C:\\tmp\\1Q2019training_economy","C:\\tmp\\2Q2019training_economy","C:\\tmp\\3Q2019training_economy","C:\\tmp\\4Q2019training_economy"
with open("C:\\tmp\\EconomyArticles 2013-2019", 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)