In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from azureml.core import Workspace, Dataset

In [None]:
# Coordinates of the Azure ML workspace. `subscription_id` is left blank in
# this notebook.
subscription_id = ''
resource_group = 'ai-in-cloud-workshop-rg'
workspace_name = 'ai-in-cloud-workspace'

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the registered dataset by name; its description is the cell output.
dataset = Dataset.get_by_name(workspace, name='covid19-spread')
dataset.description

In [None]:
# Load the registered dataset into a pandas DataFrame and show ten random rows.
data = dataset.to_pandas_dataframe()
data.sample(n=10)

In [None]:
# Print the column list with dtypes and non-null counts for the loaded frame.
data.info()

| var\_name                | var\_def                                                                                                                                                                                                  |
| :----------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| iso3c                    | Country name                                                                                                                                                                                              |
| country                  | ISO3c country code as defined by ISO 3166-1 alpha-3                                                                                                                                                       |
| date                     | Calendar date                                                                                                                                                                                             |
| confirmed                | Confirmed Covid-19 cases as reported by JHU CSSE (accumulated)                                                                                                                                            |
| deaths                   | Covid-19-related deaths as reported by JHU CSSE (accumulated)                                                                                                                                             |
| recovered                | Covid-19 recoveries as reported by JHU CSSE (accumulated)                                                                                                                                                 |
| ecdc\_cases              | Covid-19 cases as reported by ECDC (accumulated)                                                                                                                                                          |
| ecdc\_deaths             | Covid-19-related deaths as reported by ECDC (accumulated)                                                                                                                                                 |
| total\_tests             | Accumulated test counts as reported by Our World in Data                                                                                                                                                  |
| tests\_units             | Definition of what constitutes a ‘test’                                                                                                                                                                   |
| soc\_dist                | Number of social distancing measures reported up to date by ACAPS, net of lifted restrictions                                                                                                             |
| mov\_rest                | Number of movement restrictions reported up to date by ACAPS, net of lifted restrictions                                                                                                                  |
| pub\_health              | Number of public health measures reported up to date by ACAPS, net of lifted restrictions                                                                                                                 |
| gov\_soc\_econ           | Number of social and economic measures reported up to date by ACAPS, net of lifted restrictions                                                                                                           |
| lockdown                 | Number of lockdown measures reported up to date by ACAPS, net of lifted restrictions                                                                                                                      |
| apple\_mtr\_driving      | Apple Maps usage for driving directions, as percentage\*100 relative to the baseline of Jan 13, 2020                                                                                                      |
| apple\_mtr\_walking      | Apple Maps usage for walking directions, as percentage\*100 relative to the baseline of Jan 13, 2020                                                                                                      |
| apple\_mtr\_transit      | Apple Maps usage for public transit directions, as percentage\*100 relative to the baseline of Jan 13, 2020                                                                                               |
| gcmr\_retail\_recreation | Google Community Mobility Reports data for the frequency that people visit retail and recreation places expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020         |
| gcmr\_grocery\_pharmacy  | Google Community Mobility Reports data for the frequency that people visit grocery stores and pharmacies expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020        |
| gcmr\_parks              | Google Community Mobility Reports data for the frequency that people visit parks expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020                                |
| gcmr\_transit\_stations  | Google Community Mobility Reports data for the frequency that people visit transit stations expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020                     |
| gcmr\_workplaces         | Google Community Mobility Reports data for the frequency that people visit workplaces expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020                           |
| gcmr\_residential        | Google Community Mobility Reports data for the frequency that people visit residential places expressed as a percentage\*100 change relative to the baseline period Jan 3 - Feb 6, 2020                   |
| gtrends\_score           | Google search volume for the term ‘coronavirus’, relative across time with the country maximum scaled to 100                                                                                              |
| gtrends\_country\_score  | Country-level Google search volume for the term ‘coronavirus’ over a period starting Jan 1, 2020, relative across countries with the country having the highest search volume scaled to 100 (time-stable) |
| region                   | Country region as classified by the World Bank (time-stable)                                                                                                                                              |
| income                   | Country income group as classified by the World Bank (time-stable)                                                                                                                                        |
| population               | Country population as reported by the World Bank (original identifier ‘SP.POP.TOTL’, time-stable)                                                                                                         |
| land\_area\_skm          | Country land mass in square kilometers as reported by the World Bank (original identifier ‘AG.LND.TOTL.K2’, time-stable)                                                                                  |
| pop\_density             | Country population density as reported by the World Bank (original identifier ‘EN.POP.DNST’, time-stable)                                                                                                 |
| pop\_largest\_city       | Population in the largest metropolitan area of the country as reported by the World Bank (original identifier ‘EN.URB.LCTY’, time-stable)                                                                 |
| life\_expectancy         | Average life expectancy at birth of country citizens in years as reported by the World Bank (original identifier ‘SP.DYN.LE00.IN’, time-stable)                                                           |
| gdp\_capita              | Country gross domestic product per capita, measured in 2010 US-$ as reported by the World Bank (original identifier ‘NY.GDP.PCAP.KD’, time-stable)                                                        |
| timestamp                | Date and time where data has been collected from authoritative sources                                                                                                                                    |

Source: https://github.com/joachim-gassen/tidycovid19/

In [None]:
# Keep only Russia's rows, ordered by date (ascending).
# Chaining the filter and the sort produces a fresh DataFrame rather than
# sorting a filtered slice in place, which avoids SettingWithCopyWarning here
# and on later column assignments. The original index labels are kept as-is.
data_ru = data[data.iso3c == 'RUS'].sort_values(by=['date'])

data_ru[['country', 'date', 'confirmed', 'deaths', 'recovered']].tail(10)

In [None]:
# Cumulative confirmed / deaths / recovered counts over time for Russia.
cumulative_counts = data_ru[['date', 'confirmed', 'deaths', 'recovered']].set_index('date')
cumulative_counts.plot()

In [None]:
# Pairwise correlations between the numeric columns of the Russia subset.
data_ru_numeric = data_ru.select_dtypes(include=np.number)
corr_matrix = data_ru_numeric.corr()
tick_positions = range(data_ru_numeric.shape[1])
column_labels = data_ru_numeric.columns

f = plt.figure(figsize=(9, 9))
plt.matshow(corr_matrix, fignum=f.number)

# Label both axes with the column names.
plt.xticks(tick_positions, column_labels, fontsize=14, rotation=45)
plt.yticks(tick_positions, column_labels, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
data_ru['label'] = data.confirmed

data_ru['confirmed_prev'] = data.confirmed.shift(-1)
data_ru['recovered_prev'] = data.recovered.shift(-1)
data_ru['deaths_prev'] = data.deaths.shift(-1)

data_ru[['date', 'label', 'confirmed_prev', 'recovered_prev', 'deaths_prev']].tail(7)

In [None]:
data_ru['confirmed_lag2d'] = data.confirmed.shift(-2)
data_ru['confirmed_lag3d'] = data.confirmed.shift(-3)
data_ru['confirmed_lag4d'] = data.confirmed.shift(-4)
data_ru['confirmed_lag5d'] = data.confirmed.shift(-5)

data_ru[['date', 'label', 'confirmed_prev', 'confirmed_lag2d', 'confirmed_lag3d', 'confirmed_lag4d', 'confirmed_lag5d']].tail(7)

In [None]:
data_ru.drop(columns=['confirmed', 'recovered', 'deaths'], inplace=True)

In [None]:
data_ru['confirmed_diff1d'] = data_ru['confirmed_prev'].diff()
data_ru['confirmed_rate1d'] = data_ru['confirmed_prev'].pct_change()

data_ru[['date', 'label', 'confirmed_prev', 'confirmed_diff1d', 'confirmed_rate1d']].tail(7)

In [None]:
data_ru.shape

In [None]:
data_ru = data_ru[:-5]
data_ru.dropna(subset=['label'], inplace=True)

data_ru.shape

In [None]:
# Show the last seven rows of the prepared frame.
data_ru.tail(7)

In [None]:
# Write the prepared frame to a local parquet file, upload the folder to the
# workspace's default datastore and register the file as a tabular dataset.
os.makedirs('buffer', exist_ok=True)
data_ru.to_parquet('buffer/data_ru.parquet')

# overwrite=True: on a re-run, replace any previously uploaded copy so the
# datastore does not keep serving an outdated file.
dataref = workspace.get_default_datastore().upload('buffer', overwrite=True)
dataset = Dataset.Tabular.from_parquet_files(path = dataref.path('data_ru.parquet'))

# create_new_version=True: allows registering again under a name that is
# already taken by an earlier run.
dataset.register(workspace, name = 'covid19-spread-russia', create_new_version=True)
