In [5]:
# pip install news-please

In [6]:
# pip install --upgrade twisted

In [7]:
# pip install cchardet

In [None]:
import requests
import time

### Test out News-Please (an API for web scraping)

In [1]:
from newsplease import NewsPlease
article = NewsPlease.from_url('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp')
print(article.title)

Stephen Bannon Reassures Conservatives Uneasy About Trump


In [5]:
for method in dir(article):
    if method[0] != '_':
        print(method)
        print(getattr(article, method))
        print()

authors
['Jeremy W. Peters']

date_download
2020-10-29 13:33:06

date_modify
None

date_publish
2017-02-24 02:07:06

description
The president’s chief strategist vowed that the “deconstruction of the administrative state” has begun, in a speech at the Conservative Political Action Conference.

filename
https%3A%2F%2Fwww.nytimes.com%2F2017%2F02%2F23%2Fus%2Fpolitics%2Fcpac-stephen-bannon-reince-priebus.html%3Fhp.json

get_dict
<bound method NewsArticle.get_dict of <NewsArticle.NewsArticle object at 0x11ad88198>>

get_serializable_dict
<bound method NewsArticle.get_serializable_dict of <NewsArticle.NewsArticle object at 0x11ad88198>>

image_url
https://static01.nyt.com/images/2017/02/24/us/24cpac1/24cpac1-facebookJumbo.jpg?year=2017&h=549&w=1050&sig=0x874aeb1455847f4591bcc7575e2ee78b

language
en

localpath
None

maintext
Ms. Conway said the stories of disarray in the White House, including recent accounts that she has been sidelined lately, were nothing more than tiresome palace intrigue

## Access the NYTimes API (fairly successful)
https://developer.nytimes.com/get-started

In [40]:
month_dict = dict()
for month in range(1,11):
    response = requests.get(f'https://api.nytimes.com/svc/archive/v1/2020/{month}.json?api-key=ZA6aVE1hA4uC83loGmtDNU7faHJAVNNv')
    jsonResponse = response.json()
    month_dict[month] = jsonResponse

In [41]:
doc_dict.keys()

dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])

### Load economy and covid keywords

In [50]:
with open('economy_keywords.txt','r') as f:
    economy_keywords = f.read().split('\n')
with open('covid_keywords.txt','r') as f:
    covid_keywords = f.read().split('\n')

### Search each article text for keywords from both categories

In [55]:
economy_and_covid = []

for month in month_dict.keys():
    json_response = month_dict[month]
    for doc_dict in jsonResponse['response']['docs']:
        text = doc_dict['abstract'] + doc_dict['snippet'] + doc_dict['lead_paragraph']
        economy_key_found = False
        covid_key_found = False
        for word in economy_keywords:
            if word in text:
                economy_key_found = True
                break
        for word in covid_keywords:
            if word in text:
                covid_key_found = True
                break
        if economy_key_found and covid_key_found:
            economy_and_covid.append((doc_dict, month))

In [139]:
print('there are',len(economy_and_covid),'total COVIDxEcon articles in the NYTimes archives from 2020.')

there are 2570 total COVIDxEcon articles in the NYTimes archives from 2020.


In [61]:
with open('URLS.txt', 'w') as f:
    for doc_dict, month in economy_and_covid:
        f.write(doc_dict['web_url'] + ',' + str(month) + '\n')

## Google News API (not as successful)
https://newsapi.org/s/google-news-api

In [80]:
pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.6-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.6
You should consider upgrading via the '/Users/paigelee/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [115]:
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key='679b374b638e4a17bcb0a7f43f4a8806')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          language='en')

# /v2/everything
covid = newsapi.get_everything(q='covid',
                                      from_param='2020-09-29',
                                      to='2020-10-28',
                                      language='en',
                                      sort_by='relevancy')
# coronavirus = newsapi.get_everything(q='coronavirus',
#                                       from_param='2020-09-29',
#                                       to='2020-10-28',
#                                       language='en',
#                                       sort_by='relevancy',
#                                       page=2)

# /v2/sources
sources = newsapi.get_sources()

### Covid

In [131]:
all_covid_articles = []
for covid_key in covid_keywords:
    articles = newsapi.get_everything(q=covid_key,
                                      from_param='2020-10-01',
                                      to='2020-10-28',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)
    for article in articles['articles']:
        all_covid_articles.append(article['url'])
print(len(all_covid_articles), 'covid articles in the last month')

160 covid articles in the last month


In [138]:
all_econ_articles = []
for econ_key in economy_keywords:
    articles = newsapi.get_everything(q=econ_key,
                                      from_param='2020-10-01',
                                      to='2020-10-28',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)
    for article in articles['articles']:
        all_econ_articles.append(article['url'])
print(len(all_econ_articles), 'econ articles in the last month')

NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

### Intersection of the COVID and Econ sets

In [136]:
articles['articles'][0]['url']

'https://www.bbc.co.uk/news/business-53731404'