In [0]:
from IPython.display import clear_output

In [0]:
!pip install geopandas
!pip install gitpython
clear_output()

In [0]:
import geopandas
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import urllib.request
import pathlib

from zipfile import ZipFile
import git
import os

% matplotlib inline

Clone the Repository with the Covid Data

In [0]:
# Fetch the coronaDAT repository and pin it to a fixed commit so that every
# run of the notebook reads the same data.
covid_repo_dir = "/content/covid-data"

if os.path.isdir(os.path.join(covid_repo_dir, ".git")):
  # Already cloned by an earlier run of this cell: reuse the checkout instead
  # of failing on the existing directory.
  repo = git.Repo(covid_repo_dir)
else:
  os.makedirs(covid_repo_dir, exist_ok=True)
  repo = git.Repo.clone_from("https://github.com/statistikat/coronaDAT.git", covid_repo_dir, no_checkout=True)

repo.git.checkout("8f9ffd3b1159172a331bd42ed08a5d2afd24cd8d")

Have a look at the downloaded data. How is it structured? What kind of data does it contain?

Start with PiePlot of gender data

In [0]:
gender = pd.read_csv("/content/covid-data/latest/geschlecht.csv", sep=";")

In [0]:
# Label the rows by gender; the 'geschlecht' column itself is kept.
gender = gender.set_index('geschlecht', drop=False)

In [0]:
gender

In [0]:
# Pie chart of the 'freq' column, one slice per index label, with whole-number
# percentages drawn inside the slices.
plot = gender.plot.pie(
    y='freq',
    figsize=(5, 5),
    autopct='%1.0f%%',
    pctdistance=0.5,
    labeldistance=1.2,
)

PiePlot or BarPlot for age data?

In [0]:
age = pd.read_csv("/content/covid-data/latest/alter.csv", sep=";")

In [0]:
# Label the rows by age group; the 'altersgruppe' column itself is kept.
age = age.set_index('altersgruppe', drop=False)

In [0]:
age

In [0]:
# Pie chart of the 'freq' column, one slice per age group, with whole-number
# percentages drawn inside the slices.
plot = age.plot.pie(
    y='freq',
    figsize=(5, 5),
    autopct='%1.0f%%',
    pctdistance=0.5,
    labeldistance=1.2,
)

In [0]:
# Same data as a bar chart: one bar per age group, tick labels kept horizontal.
plot = age.plot.bar(
    x='altersgruppe',
    y='freq',
    rot=0,
)

Show the data for the individual Bezirke (districts) on a map

In [0]:
cov_bz = pd.read_csv("/content/covid-data/latest/bezirke.csv", sep=";")

In [0]:
cov_bz.head()

In [0]:
# Download the zipped shapefile from data.statistik.gv.at
# and store it as /content/Shapefile.zip.
url = "http://data.statistik.gv.at/data/OGDEXT_GEM_1_STATISTIK_AUSTRIA_20200101.zip"
urllib.request.urlretrieve(url, filename='/content/Shapefile.zip')

In [0]:
# Unpack the downloaded archive. The absolute target directory keeps the result
# independent of the current working directory and matches the location the
# shapefile is read from afterwards (/content/Shapefile/...).
with ZipFile('/content/Shapefile.zip', 'r') as shapefile_zip:
  shapefile_zip.extractall('/content/Shapefile')

In [0]:
austria = geopandas.read_file('/content/Shapefile/STATISTIK_AUSTRIA_GEM_20200101Polygon.shp', encoding='utf-8')

Look at the shapefile dataframe, plot it

In [0]:
austria.head()

In [0]:
austria.plot()

In [0]:
# Plot only the rows whose 'name' is "Linz".
austria.loc[austria['name'] == "Linz"].plot()

Load additional Data about Bezirke

In [0]:
# Clone the repository that holds the additional district table. Skip the clone
# when the target directory already exists, so re-running the cell does not fail.
if not os.path.isdir("/content/covid-19-analysis"):
  git.Git("/content/").clone("https://github.com/sauarampfa1/covid-19-analysis.git")

In [0]:
bezirke = pd.read_csv("/content/covid-19-analysis/data/Wohnbezirke.csv")

In [0]:
# Replace the CSV's column headers with short identifiers used in the rest of
# the notebook (positional: the list must match the file's column order).
bezirke.columns = [
    'bzk_id',
    'bzk_name',
    'bundesland',
    'Kfz_Kz',
    'area',
    'inhabitants',
    'population_denseness',
    'amount',
    'region_name',
]

In [0]:
bezirke.head()

In [0]:
def to_numeric(series):
  """Convert a Series of dot-grouped number strings (e.g. "1.234") to numbers.

  Every "." is removed from string entries before the values are passed to
  ``pd.to_numeric`` (presumably the dots are thousands separators - confirm
  against the source table). Entries that are not strings, such as missing
  values or values that are already numeric, are passed through unchanged
  instead of raising ``AttributeError``.

  Args:
    series: pandas Series to convert.

  Returns:
    A numeric Series as produced by ``pd.to_numeric``.

  Raises:
    ValueError: if an entry still cannot be parsed as a number after the dots
      have been removed.
  """
  without_dots = series.map(
      lambda value: value.replace(".", "") if isinstance(value, str) else value
  )
  return pd.to_numeric(without_dots)


def clean_bezirke(bezirke_df):
  """Return a cleaned copy of the district table.

  Steps:
    * cut each ``bzk_name`` at the first "(" and at the first " *",
    * drop the row labelled 21,
    * convert ``area``, ``inhabitants`` and ``population_denseness`` with
      ``to_numeric``.

  The input frame is left untouched; all changes are made on a copy.

  Args:
    bezirke_df: DataFrame with at least the columns ``bzk_name``, ``area``,
      ``inhabitants`` and ``population_denseness``.

  Returns:
    The cleaned DataFrame.

  Raises:
    KeyError: if the frame has no row labelled 21.
  """
  # Work on a copy so the caller's frame is not modified as a side effect.
  cleaned = bezirke_df.copy()

  cleaned['bzk_name'] = cleaned['bzk_name'].apply(
      lambda name: name.split("(")[0].split(" *")[0]
  )

  # NOTE(review): the reason for dropping row 21 is not visible here - confirm
  # which record this is.
  cleaned = cleaned.drop(21)

  for column in ("area", "inhabitants", "population_denseness"):
    cleaned[column] = to_numeric(cleaned[column])

  return cleaned

In [0]:
bezirke = clean_bezirke(bezirke)

In [0]:
bezirke

In [0]:
# Collapse rows that share a `bzk_name` into one row per name: identifiers and
# labels keep the first value, the numeric measures are summed.
# Grouping by the column name (instead of passing the Series
# `bezirke['bzk_name']`) makes pandas return `bzk_name` as a regular column, so
# the reindex below no longer fills it with NaN and one run of the cell is enough.
# NOTE(review): summing `population_denseness` over merged rows is probably not
# a meaningful density - consider recomputing it from inhabitants and area.
aggregation_functions = {'bzk_id': 'first',
                         'bundesland': 'first', 'Kfz_Kz': 'first',
                         'area': 'sum', 'inhabitants': 'sum',
                         'population_denseness': 'sum', 'amount': 'first', 'region_name': 'first'}
bezirke_new = (
    bezirke.groupby('bzk_name', as_index=False)
    .aggregate(aggregation_functions)
    .reindex(columns=bezirke.columns)
)

In [0]:
bezirke_new

In [0]:
bezirke = bezirke_new

In [0]:
sum(bezirke['inhabitants'])

Assign a Bezirk to each entry in the shapefile

In [0]:
austria.head()

In [0]:
bezirke.head()

In [0]:
# Derive one district id per row of `austria`, in row order:
#   1. the `bzk_id` of the first entry in `bezirke` that is a prefix of the row's 'id',
#   2. otherwise '900' when the 'id' starts with '9' (presumably Vienna - confirm),
#   3. otherwise '0' as the "no match" marker.
# Exactly one value is appended per row, so `bzk` always has the same length as
# `austria` even if several prefixes match the same id.
known_bezirk_ids = list(bezirke['bzk_id'])

bzk = []
for gem_id in austria['id']:
  matched_id = next(
      (bzk_id for bzk_id in known_bezirk_ids if gem_id.startswith(bzk_id)),
      None,
  )
  if matched_id is not None:
    bzk.append(matched_id)
  elif gem_id.startswith('9'):
    bzk.append('900')
  else:
    bzk.append('0')


In [0]:
austria['bezirk_id'] = bzk

Correct the cov_bz data so that it fits our ids - we need it to assign the casenumbers to the shapefile

In [0]:
# Fold two district codes of the case table into other ids
# (323 -> 304, 319 -> 302) so that they line up with the ids used here.
cov_bz['bkz'] = cov_bz['bkz'].replace({323: 304, 319: 302})

In [0]:
# After the remapping several rows can share a `bkz`; merge them by summing the
# case counts and keeping the first date.
# Grouping by the column name (instead of the Series `cov_bz['bkz']`) makes
# pandas return `bkz` as a regular column, so the reindex cannot blank it out.
# Columns that are not aggregated are re-added by the reindex and hold NaN.
aggregation_functions = {'freq': 'sum', 'date': 'first'}
cov_bz_new = (
    cov_bz.groupby('bkz', as_index=False)
    .aggregate(aggregation_functions)
    .reindex(columns=cov_bz.columns)
)
cov_bz = cov_bz_new

In [0]:
cov_bz

Assign case-frequencies of cov_bz and number of inhabitants of bezirke to the shapefile

In [0]:
# Lookup table: district code (`bkz`) -> case count (`freq`).
bezirk_cases_dict = cov_bz.set_index('bkz')['freq'].to_dict()

In [0]:
# Start from the numeric district id of each row.
austria['cases'] = pd.to_numeric(austria['bezirk_id'])

# Not every district id has an entry in the case table: register the missing
# ones with 0 cases so that every id resolves to a count. Iterating the unique
# ids avoids rebuilding the key list once per row.
for bezirk_id in austria['cases'].unique():
  bezirk_cases_dict.setdefault(int(bezirk_id), 0)

In [0]:
# Look the case count up from the district id of each row. Mapping from
# `bezirk_id` (rather than replacing values inside `cases`) gives the same
# result when the cell is re-run, and an id without an entry counts as 0 cases
# instead of keeping the id itself as a count.
austria['cases'] = pd.to_numeric(austria['bezirk_id']).map(bezirk_cases_dict).fillna(0)

In [0]:
# Lookup table: district id (`bzk_id`) -> number of inhabitants.
bezirk_inhabitants_dict = bezirke.set_index('bzk_id')['inhabitants'].to_dict()

In [0]:
# Translate each row's district id into its number of inhabitants, then turn a
# population of 0 into 1 so the later division by `inhabitants` is defined.
# The result is assigned back explicitly: an in-place `replace` on the selected
# column is not guaranteed to write through to the frame.
# NOTE(review): ids without an entry in the lookup keep their id value and are
# then parsed as a population - confirm that this is intended.
austria['inhabitants'] = pd.to_numeric(
    austria['bezirk_id'].replace(bezirk_inhabitants_dict)
).replace(0, 1)

In [0]:
#fig, ax = plt.subplots(1, 1)
austria.plot(column='cases', legend=True) # ax = ax

Show the relative cases 

In [0]:
# Cases per inhabitant for every row.
austria['relative_cases'] = austria['cases'].div(austria['inhabitants'])

In [0]:
#austria['cases_per_10000'] = austria['relative_cases'] * 10000

In [0]:
austria.head()

In [0]:
# Keep the returned axes under its own name: assigning it to `plt` would
# overwrite the `matplotlib.pyplot` module imported as `plt` at the top.
relative_cases_ax = austria.plot(column='relative_cases', legend=True)

Lineplot of general data

In [0]:
general = pd.read_csv("/content/covid-data/latest/allgemein.csv", sep=";")

In [0]:
general

In [0]:
# Build the time series of the general figures from the archive: every
# sub-folder of `archive_folder` contributes its ts/allgemein.csv, read in
# sorted folder-name order.
archive_folder = "/content/covid-data/archive/"
general_columns = ['erkrankungen', 'hospitalisiert', 'intensivstation', 'nr_tests', 'date']

folders = sorted(os.listdir(archive_folder))
snapshots = [
    pd.read_csv(os.path.join(archive_folder, folder, "ts", "allgemein.csv"), sep=";")
    for folder in folders
]

if snapshots:
  # Concatenate once instead of appending per file: `DataFrame.append` was
  # removed in pandas 2.0 and copied the whole frame on every iteration.
  general = pd.concat(snapshots)
  # Keep the expected columns first; any additional columns follow.
  extra_columns = [column for column in general.columns if column not in general_columns]
  general = general.reindex(columns=general_columns + extra_columns)
else:
  # Empty archive: fall back to an empty frame with the expected columns.
  general = pd.DataFrame(columns=general_columns)

In [0]:
general

In [0]:
lines = general.plot.line(x="date", rot=45)

In [0]:
lines = general.plot.line(subplots=True, x="date")

In [0]:
lines = general.plot.line(x="date", y=["erkrankungen", "hospitalisiert", 'intensivstation'], rot=45)

In [0]:
def load_and_clean_data(filename, aggregation_functions):
  """Load a semicolon-separated time-series CSV and reduce it to one row per day.

  The rows are first aggregated per distinct ``date`` value with
  ``aggregation_functions``. The ``date`` is then cut down to its first
  whitespace-separated token, and for each resulting date only the first row
  (in sorted order of the original ``date`` values) is kept.

  Args:
    filename: path of the CSV file; it must contain the columns ``date`` and
      ``nuts2``.
    aggregation_functions: mapping of column name -> aggregation, as accepted
      by ``DataFrameGroupBy.aggregate``. Columns of the file that are not
      listed are present in the result but hold NaN.

  Returns:
    DataFrame indexed by the shortened ``date``, without the ``nuts2`` column.

  Raises:
    KeyError: if the file has no ``date`` or ``nuts2`` column.
  """
  raw = pd.read_csv(filename, sep=";")

  # Group by the column name rather than by the Series `raw['date']`, so that
  # `date` is returned as a regular column and survives the reindex.
  per_timestamp = (
      raw.groupby('date', as_index=False)
      .aggregate(aggregation_functions)
      .reindex(columns=raw.columns)
  )
  per_timestamp['date'] = per_timestamp['date'].apply(lambda stamp: stamp.split()[0])

  per_day = per_timestamp.set_index('date').drop(columns=['nuts2'])
  return per_day.loc[~per_day.index.duplicated(keep='first')]

In [0]:
# One per-day table for each time-series file. Count columns are summed per
# timestamp, while 'nuts2' and 'nr_tests' take the first value.
healthy = load_and_clean_data(
    "/content/covid-data/ts/gesundungen_bl.csv",
    {'nuts2': 'first', 'gesundungen': 'sum'},
)
hospital = load_and_clean_data(
    "/content/covid-data/ts/hospitalisierungen_bl.csv",
    {
        'nuts2': 'first',
        'hospitalisiert': 'sum',
        'intensivstation': 'sum',
        'nr_tests': 'first',
    },
)
dead = load_and_clean_data(
    "/content/covid-data/ts/sterbefaelle_bl.csv",
    {'nuts2': 'first', 'todesfaelle': 'sum'},
)

In [0]:
# Shorten the timestamps to their first whitespace-separated token.
# NOTE: this rewrites the 'date' column of `general` itself, not of a copy.
general['date'] = general['date'].apply(lambda stamp: stamp.split()[0])

# Per-day table of the remaining columns: index by date, drop the hospital and
# test columns, and keep only the first row of each date.
sick = general.set_index('date').drop(
    columns=['hospitalisiert', 'intensivstation', 'nr_tests']
)
sick = sick.loc[~sick.index.duplicated(keep='first')]

In [0]:
result = pd.concat([sick, healthy, hospital, dead], axis=1, sort=True)

In [0]:
result['aktuell_krank'] = result['erkrankungen'] - result['gesundungen'] - result['todesfaelle']

In [0]:
lines = result.plot.line(y=['erkrankungen', 'gesundungen', 'aktuell_krank', 'hospitalisiert', 'intensivstation', 'todesfaelle'])

growth rate