In [7]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json


pd.set_option('display.max_rows', 500)

![CRSIP_DM](..\reports\figures\CRISP_DM.png)

# Data Understanding

* Robert Koch-Institut (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API servies to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [4]:
data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw=pd.read_csv(data_path)

In [5]:
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/14/20,7/15/20,7/16/20,7/17/20,7/18/20,7/19/20,7/20/20,7/21/20,7/22/20,7/23/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,34740,34994,35070,35229,35301,35475,35526,35615,35727,35928
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3667,3752,3851,3906,4008,4090,4171,4290,4358,4466
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,20216,20770,21355,21948,22549,23084,23691,24278,24872,25484
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,861,862,877,880,880,880,884,884,889,889
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,541,576,607,638,687,705,749,779,812,851
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,74,74,74,76,76,76,76,76,76,76
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,106910,111146,114783,119301,122524,126755,130774,136118,141900,148027
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,32490,33005,33559,34001,34462,34877,34981,35254,35693,36162
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,113,113
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3517,3527,3535,3550,3568,3588,3599,3614,3633,3640


# WebScraping

In [10]:
rki_page_link ="https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html"
rki_page = requests.get(rki_page_link)

In [11]:
rki_soup = BeautifulSoup(rki_page.content, 'html.parser')

In [12]:
html_table = rki_soup.find('table')

In [13]:
all_table_rows = html_table.find_all('tr')

In [18]:
table_data_rows = []
for pos,rows in enumerate(all_table_rows):
    col_data = [each_row.get_text(strip=True) for each_row in rows.find_all('td')]
    table_data_rows.append(col_data)


In [23]:
column_headers = {0:'state', 1:'cases', 2:'changes', 3:'cases_per_100k', 4:'fatal', 5:'comment'}
daily_status_rki = pd.DataFrame(table_data_rows).dropna().rename(columns = column_headers)

In [24]:
daily_status_rki.head()

Unnamed: 0,state,cases,changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,36.731,166,401,36,1.839
3,Bayern,50.112,63,442,34,2.619
4,Berlin,8.894,34,148,39,222.0
5,Branden­burg,3.523,13,50,20,168.0
6,Bremen,1.733,6,32,47,55.0


# REST API calls

In [25]:
npgeo_query_url = "https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
npgeo_data = requests.get(npgeo_query_url)

In [26]:
npgeo_json = json.loads(npgeo_data.content)

In [27]:
npgeo_json.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'serverGens', 'geometryType', 'spatialReference', 'fields', 'features'])

In [29]:
npgeo_full_list = []
for pos,each_dict in enumerate(npgeo_json['features'][:]):
    npgeo_full_list.append(each_dict['attributes'])

In [31]:
pd_npgeo_full_list = pd.DataFrame(npgeo_full_list)
pd_npgeo_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,3299,1595541600000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,113.887746,45737310000.0,2881496.0,155
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,5275,1595541600000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,286.501204,2089396000.0,418800.2,261
2,3,3,Niedersachsen,Land,7982448,9,14185,1595541600000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,177.702379,129983600000.0,4008988.0,647
3,4,4,Bremen,Freie Hansestadt,682986,5,1733,1595541600000,4,4132268b-54de-4327-ac1e-760e915112f1,253.73873,1119157000.0,335717.7,55
4,5,5,Nordrhein-Westfalen,Land,17932651,10,46956,1595541600000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,261.846394,87829360000.0,2648673.0,1726


# API access via REST service, e.g. USA data

www.smartable.ai

In [32]:
url_endpoint = 'https://api.smartable.ai/coronavirus/stats/US'
headers = {
# Request headers
'Cache-Control': 'no-cache',
'Subscription-Key': '92b06c0e995d4612976a2124793e953d',
}

response = requests.get(url_endpoint, headers=headers)

In [33]:
print(response)

<Response [200]>


In [35]:
US_dict = json.loads(response.content)
with open('../data/raw/SMARTABLE/us_data.txt', 'w') as outfile:
    json.dump(US_dict, outfile, indent = 2)