# Web Scraping through API: Capturing Covid-19 data

## Registering use of API

In [1]:
import os

os.listdir("./auth")

['guardian-api-key.txt']

In [2]:
api_key = open("./auth/guardian-api-key.txt", "r").read()

api_key

'a8f926a6-a91e-4f45-812b-a5045f04b836'

## Requesting data

In [3]:
import os
import requests
import json
from datetime import datetime

In [4]:
baseurl = "http://content.guardianapis.com/search?"
searchterm = "covid-19"
auth = {"api-key":api_key}
webadd = baseurl + "q=" + searchterm

print(webadd)

response = requests.get(webadd, headers=auth)
response.status_code

http://content.guardianapis.com/search?q=covid-19


200

In [5]:
data = response.json()

data

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 51858,
  'startIndex': 1,
  'pageSize': 10,
  'currentPage': 1,
  'pages': 5186,
  'orderBy': 'relevance',
  'results': [{'id': 'world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'type': 'article',
    'sectionId': 'world',
    'sectionName': 'World news',
    'webPublicationDate': '2021-11-01T14:10:03Z',
    'webTitle': 'Global Covid-19 death toll passes 5m',
    'webUrl': 'https://www.theguardian.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'apiUrl': 'https://content.guardianapis.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'isHosted': False,
    'pillarId': 'pillar/news',
    'pillarName': 'News'},
   {'id': 'sport/2021/nov/03/aaron-rodgers-covid-19-vaccine-status-reports-green-bay-packers-nfl',
    'type': 'article',
    'sectionId': 'sport',
    'sectionName': 'Sport',
    'webPublicationDate': '2021-11-03T16:31:09Z',
    'webTitle': 'Aaron Rodgers reportedly unva

## Navigating a dictionary (JSON) variable

In [6]:
data.keys()

dict_keys(['response'])

In [7]:
data["response"].keys()

dict_keys(['status', 'userTier', 'total', 'startIndex', 'pageSize', 'currentPage', 'pages', 'orderBy', 'results'])

In [8]:
search_results = data["response"]["results"]

search_results

[{'id': 'world/2021/nov/01/global-covid-19-death-toll-passes-5m',
  'type': 'article',
  'sectionId': 'world',
  'sectionName': 'World news',
  'webPublicationDate': '2021-11-01T14:10:03Z',
  'webTitle': 'Global Covid-19 death toll passes 5m',
  'webUrl': 'https://www.theguardian.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
  'apiUrl': 'https://content.guardianapis.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
  'isHosted': False,
  'pillarId': 'pillar/news',
  'pillarName': 'News'},
 {'id': 'sport/2021/nov/03/aaron-rodgers-covid-19-vaccine-status-reports-green-bay-packers-nfl',
  'type': 'article',
  'sectionId': 'sport',
  'sectionName': 'Sport',
  'webPublicationDate': '2021-11-03T16:31:09Z',
  'webTitle': 'Aaron Rodgers reportedly unvaccinated after testing positive for Covid-19',
  'webUrl': 'https://www.theguardian.com/sport/2021/nov/03/aaron-rodgers-covid-19-vaccine-status-reports-green-bay-packers-nfl',
  'apiUrl': 'https://content.guardianapis.co

In [9]:
type(search_results)

list

In [10]:
len(search_results)

10

In [11]:
# view the values of certain keys in each element in  the list
for result in search_results:
    print(result["type"])
    print(result["sectionName"])
    print(result["webPublicationDate"])
    print("\r")
    print("-------------")
    print("\r")

article
World news
2021-11-01T14:10:03Z

-------------

article
Sport
2021-11-03T16:31:09Z

-------------

article
Australia news
2021-12-15T01:31:11Z

-------------

article
World news
2021-10-19T11:59:00Z

-------------

article
Music
2021-10-24T18:24:40Z

-------------

article
World news
2021-10-19T18:21:41Z

-------------

article
US news
2021-09-21T10:14:08Z

-------------

article
Sport
2021-12-17T16:25:11Z

-------------

article
Sport
2021-10-22T11:39:23Z

-------------

article
Business
2021-09-03T05:00:19Z

-------------



## Saving results

In [12]:
try:
    os.mkdir("./downloads")
except:
    print("Unable to create folder: already exists")

Unable to create folder: already exists


In [13]:
# Write the results to a JSON file
date = datetime.now().strftime("%Y-%m-%d")
print(data)

outfile = "./downloads/guardian-api-covid-19-search-" + date + ".json"

with open(outfile, "w") as f:
    json.dump(data, f)

{'response': {'status': 'ok', 'userTier': 'developer', 'total': 51858, 'startIndex': 1, 'pageSize': 10, 'currentPage': 1, 'pages': 5186, 'orderBy': 'relevance', 'results': [{'id': 'world/2021/nov/01/global-covid-19-death-toll-passes-5m', 'type': 'article', 'sectionId': 'world', 'sectionName': 'World news', 'webPublicationDate': '2021-11-01T14:10:03Z', 'webTitle': 'Global Covid-19 death toll passes 5m', 'webUrl': 'https://www.theguardian.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m', 'apiUrl': 'https://content.guardianapis.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m', 'isHosted': False, 'pillarId': 'pillar/news', 'pillarName': 'News'}, {'id': 'sport/2021/nov/03/aaron-rodgers-covid-19-vaccine-status-reports-green-bay-packers-nfl', 'type': 'article', 'sectionId': 'sport', 'sectionName': 'Sport', 'webPublicationDate': '2021-11-03T16:31:09Z', 'webTitle': 'Aaron Rodgers reportedly unvaccinated after testing positive for Covid-19', 'webUrl': 'https://www.theguardi

In [14]:
os.listdir("./downloads")

['guardian-api-covid-19-search-2021-12-18.json']

In [15]:
with open(outfile, "r") as f:
    data = json.load(f)
    
data

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 51858,
  'startIndex': 1,
  'pageSize': 10,
  'currentPage': 1,
  'pages': 5186,
  'orderBy': 'relevance',
  'results': [{'id': 'world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'type': 'article',
    'sectionId': 'world',
    'sectionName': 'World news',
    'webPublicationDate': '2021-11-01T14:10:03Z',
    'webTitle': 'Global Covid-19 death toll passes 5m',
    'webUrl': 'https://www.theguardian.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'apiUrl': 'https://content.guardianapis.com/world/2021/nov/01/global-covid-19-death-toll-passes-5m',
    'isHosted': False,
    'pillarId': 'pillar/news',
    'pillarName': 'News'},
   {'id': 'sport/2021/nov/03/aaron-rodgers-covid-19-vaccine-status-reports-green-bay-packers-nfl',
    'type': 'article',
    'sectionId': 'sport',
    'sectionName': 'Sport',
    'webPublicationDate': '2021-11-03T16:31:09Z',
    'webTitle': 'Aaron Rodgers reportedly unva

## Refining Covid-19 data collection

In [16]:
import os
import requests
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup as soup

date = datetime.now().strftime("%Y-%m-%d")

api_key = open("./auth/guardian-api-key.txt", "r").read()

In [18]:
baseurl = "http://content.guardianapis.com/search?"
searchterms = "covid-19 OR coronavirus"
auth = {"api-key":api_key}

webadd = baseurl + "q=" + searchterms
print(webadd)

response = requests.get(webadd, headers
                        =auth)
response.status_code

http://content.guardianapis.com/search?q=covid-19 OR coronavirus


200

In [19]:
data = response.json()
data["response"]["total"]

48973

## Dealing with multiple pages of results

In [24]:
baseurl = "http://content.guardianapis.com/search?"
searchterms = "covid-19 OR coronavirus"
auth = {"api-key":api_key}
numresults = "50"

webadd = baseurl + "q=" + searchterms + "&page-size=" + numresults
print(webadd)

response = requests.get(webadd, headers=auth)
response.status_code

http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50


200

In [25]:
data = response.json()
data["response"]["pageSize"]

50

In [26]:
total_pages = data["response"]["pages"]
total_pages += 1

total_pages

981

In [29]:
for pagenum in range(1, 21):
    webadd = baseurl + "q=" + searchterms + "&page-size=" + numresults \
             + "&page=" + str(pagenum)
    print(webadd)
    
    response = requests.get(webadd, headers=auth)
    data = response.json()
    
    outfile = "./downloads/guardian-api-covid-19-search-page-" \
              + str(pagenum) + "-" + date + ".json"
    
    with open(outfile, "w") as f:
        json.dump(data, f)

http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=1
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=2
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=3
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=4
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=5
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=6
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=7
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=8
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=9
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=10
http://content.guardianapis.com/search?q=covid-19 OR coronavirus&page-size=50&page=11
http://content.guardianapis.com/search?q=covid-19 OR coronaviru

In [30]:
os.listdir("./downloads")

['guardian-api-covid-19-search-page-1-2021-12-18.json',
 'guardian-api-covid-19-search-page-10-2021-12-18.json',
 'guardian-api-covid-19-search-page-11-2021-12-18.json',
 'guardian-api-covid-19-search-page-12-2021-12-18.json',
 'guardian-api-covid-19-search-page-13-2021-12-18.json',
 'guardian-api-covid-19-search-page-14-2021-12-18.json',
 'guardian-api-covid-19-search-page-15-2021-12-18.json',
 'guardian-api-covid-19-search-page-16-2021-12-18.json',
 'guardian-api-covid-19-search-page-17-2021-12-18.json',
 'guardian-api-covid-19-search-page-18-2021-12-18.json',
 'guardian-api-covid-19-search-page-19-2021-12-18.json',
 'guardian-api-covid-19-search-page-2-2021-12-18.json',
 'guardian-api-covid-19-search-page-20-2021-12-18.json',
 'guardian-api-covid-19-search-page-3-2021-12-18.json',
 'guardian-api-covid-19-search-page-4-2021-12-18.json',
 'guardian-api-covid-19-search-page-5-2021-12-18.json',
 'guardian-api-covid-19-search-page-6-2021-12-18.json',
 'guardian-api-covid-19-search-page-7

In [31]:
search_results = data["response"]["results"]
article = search_results[0]

article

{'id': 'politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
 'type': 'article',
 'sectionId': 'politics',
 'sectionName': 'Politics',
 'webPublicationDate': '2021-07-07T15:27:35Z',
 'webTitle': 'England’s reopening plan is a ‘dangerous experiment’, ministers told',
 'webUrl': 'https://www.theguardian.com/politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
 'apiUrl': 'https://content.guardianapis.com/politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
 'isHosted': False,
 'pillarId': 'pillar/news',
 'pillarName': 'News'}

In [32]:
baseurl = article["apiUrl"]
auth = {"api-key":api_key}
field = "body"

webadd = baseurl + "?show-fields=" + field

response = requests.get(webadd, headers=auth)
response.status_code

200

In [33]:
data = response.json()

data

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 1,
  'content': {'id': 'politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
   'type': 'article',
   'sectionId': 'politics',
   'sectionName': 'Politics',
   'webPublicationDate': '2021-07-07T15:27:35Z',
   'webTitle': 'England’s reopening plan is a ‘dangerous experiment’, ministers told',
   'webUrl': 'https://www.theguardian.com/politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
   'apiUrl': 'https://content.guardianapis.com/politics/2021/jul/07/england-covid-reopening-plan-dangerous-experiment-ministers-told',
   'isHosted': False,
   'pillarId': 'pillar/news',
   'pillarName': 'News'}}}

In [34]:
data["response"]["content"]["fields"].keys()

dict_keys(['body'])

In [35]:
text = data["response"]["content"]["fields"]["body"]
text



In [36]:
from bs4 import BeautifulSoup as soup


soup_text = soup(text, "html.parser")
type(soup_text)

bs4.BeautifulSoup

In [37]:
links = soup_text.find_all("a")

links



In [38]:
len(links)

1

In [39]:
for link in links:
    print("--------")
    print(link.get("href"))
    print("--------")
    print("\r")

--------
--------



In [40]:
article_id = data["response"]["content"]["id"].replace("/", "-")

outfile = "./downloads/" + article_id + ".json"

with open(outfile, "w") as f:
    json.dump(data, f)
    
outfile

'./downloads/politics-2021-jul-07-england-covid-reopening-plan-dangerous-experiment-ministers-told.json'