# Analysis of COVID19 Time Series Summary - Workbook 2

In [15]:
import pandas as pd
import numpy as np

# Suppress warnings for the rest of the session. Note that this filter is not
# limited to pandas: with no category/module given it silences every warning,
# including deprecation warnings from other libraries.
import warnings
warnings.filterwarnings('ignore')

# Load the time series from COVID19_time_series_country.csv.
df_country_grouped = pd.read_csv("COVID19_time_series_country.csv")

## Further aggregation

The original CSV files contain cumulative totals of confirmed cases, recoveries and deaths. We now need to derive the daily increase (if any) in each of these. The resulting dataframe at the end of the aggregation should look as follows:

![Dataframe with countrywise daily cases](./img/Dataframe_daily_cases_countrywise.png)

*Note:* This dataframe is filtered for Australian data only, but our final dataframe should have data from every country available in the original dataset.

Let's create a temporary dataframe `df_temp` as follows
* Group the dataframe to retrieve the `confirmed`, `deaths` and `recovered` columns with respect to `country` and `date` in that specific order.
* For each of the columns, find the sum (`DataFrame.sum`) and then calculate the difference (`DataFrame.diff`) between each row and the previous row. This way, we can find the daily changes, with some caveats as we will discuss.

In [12]:
# Sum the cumulative counts per (country, date), then take the row-to-row
# difference to obtain daily changes.
# The columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated in pandas 1.0 and
# raises an error in pandas 2.0.
# NOTE: `diff` runs over the whole frame, so the first row of each country is
# differenced against the last row of the preceding country; those boundary
# rows do not hold a valid daily change.
df_temp = df_country_grouped.groupby(['country', 'date'])[['confirmed', 'deaths', 'recovered']]
df_temp = df_temp.sum().diff().reset_index()

# Spot-check two consecutive rows, then preview the start of the frame.
display(df_temp.loc[[102, 103]])

df_temp.head()

Unnamed: 0,country,date,confirmed,deaths,recovered
102,Afghanistan,2020-05-03,235.0,13.0,14.0
103,Afghanistan,2020-05-04,190.0,5.0,52.0


Unnamed: 0,country,date,confirmed,deaths,recovered
0,Afghanistan,2020-01-22,,,
1,Afghanistan,2020-01-23,0.0,0.0,0.0
2,Afghanistan,2020-01-24,0.0,0.0,0.0
3,Afghanistan,2020-01-25,0.0,0.0,0.0
4,Afghanistan,2020-01-26,0.0,0.0,0.0


The previous step creates a discrepancy in the rows where the country changes: `diff` subtracts the last row of one country from the first row of the next, as shown below:

![Dataframe value discrepancy](./img/Dataframe_discrepancy.png)

We can reset the value of these rows by comparing rows such as these, where the country names change. You may find the `DataFrame.shift` method useful here.

In [13]:
# A row begins a new country when its name differs from the row above it
# (the very first row compares against NaN and is therefore flagged too).
# The difference stored on such a row is not a daily change, so blank it
# in all three count columns at once.
mask = df_temp['country'].ne(df_temp['country'].shift())
df_temp.loc[mask, ['confirmed', 'deaths', 'recovered']] = np.nan

Let's rename the columns of `df_temp` more appropriately.

In [14]:
# Old name -> new name: the differenced columns now hold daily changes,
# not cumulative totals.
renamed_columns = {
    "confirmed": "new_cases",
    "deaths": "new_deaths",
    "recovered": "new_recovered",
}

# Apply the mapping along the column axis and preview the result.
df_temp = df_temp.rename(renamed_columns, axis="columns")

df_temp.head()

Unnamed: 0,country,date,new_cases,new_deaths,new_recovered
0,Afghanistan,2020-01-22,,,
1,Afghanistan,2020-01-23,0.0,0.0,0.0
2,Afghanistan,2020-01-24,0.0,0.0,0.0
3,Afghanistan,2020-01-25,0.0,0.0,0.0
4,Afghanistan,2020-01-26,0.0,0.0,0.0


Merge `df_temp` into the previous dataframe, matching rows on `country` and `date`.

In [16]:
# Left-join the daily-change columns onto the cumulative frame, keeping every
# row of `df_country_grouped` and matching on (country, date).
# Run this once per fresh load: the result is assigned back to the same name,
# so a second run would merge against a frame that already has `new_*` columns.
df_country_grouped = pd.merge(
    df_country_grouped,
    df_temp,
    on=['country', 'date'],
    how="left",
)

In [24]:
# Count the missing values in each column.
df_country_grouped.isna().sum()

date               0
country            0
lat                0
long               0
confirmed          0
deaths             0
recovered          0
active             0
new_cases        186
new_deaths       186
new_recovered    186
dtype: int64

In [26]:
# Only the daily-change columns are expected to contain NaN (the first row of
# each country has no previous day to difference against). Fill just those
# columns, so that a missing value appearing in any other column is not
# silently turned into 0.
df_country_grouped = df_country_grouped.fillna(
    {'new_cases': 0, 'new_deaths': 0, 'new_recovered': 0}
)

In [27]:
# Re-count the missing values in each column to confirm none remain.
df_country_grouped.isna().sum()

date             0
country          0
lat              0
long             0
confirmed        0
deaths           0
recovered        0
active           0
new_cases        0
new_deaths       0
new_recovered    0
dtype: int64

Let's convert all the case-count columns to integers and make sure the daily new cases are never negative (although there should not be any such rows here).

In [45]:
# Convert every case-count column to an integer dtype (the `new_*` columns
# are floats after `diff` and `fillna`).
cols = ['confirmed', 'deaths', 'recovered', 'active', 'new_cases', 'new_deaths', 'new_recovered']
df_country_grouped[cols] = df_country_grouped[cols].astype('int')

# Floor negative daily new cases at 0. `clip` is the vectorised equivalent of
# applying `lambda x: 0 if x < 0 else x` to every element.
# Only `new_cases` is clamped; `new_deaths` and `new_recovered` are left as computed.
df_country_grouped['new_cases'] = df_country_grouped['new_cases'].clip(lower=0)

In [46]:
# Preview ten random rows. The seed is fixed so that re-running the notebook
# shows the same rows every time.
df_country_grouped.sample(10, random_state=42)

Unnamed: 0,date,country,lat,long,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
23342,2020-05-26,Kuwait,29.5,47.75,22575,172,7306,15097,608,7,685
9644,2020-03-13,Sudan,12.8628,30.2176,1,1,0,0,1,1,0
12842,2020-03-31,Australia,-255.9695,1129.8623,4559,18,358,4183,198,1,101
2130,2020-02-02,Italy,43.0,12.0,2,0,0,2,0,0,0
11747,2020-03-25,Cabo Verde,16.5388,-23.0418,4,1,0,3,1,0,0
17159,2020-04-23,Denmark,189.8634,-40.0143,8271,394,5573,2304,163,10,297
26853,2020-06-14,Guatemala,15.7835,-90.2308,9845,384,1886,7575,354,17,82
20747,2020-05-12,Lithuania,55.1694,23.8813,1491,50,850,591,6,0,17
13065,2020-04-01,Cyprus,35.1264,33.4299,320,9,28,283,58,1,5
1072,2020-01-27,Saint Vincent and the Grenadines,12.9843,-61.2872,0,0,0,0,0,0,0


In [47]:
# Write the enriched frame to CSV, leaving out the DataFrame index.
df_country_grouped.to_csv('COVID19_time_series_global.csv', index=False)

# Visualising COVID19 data


In [84]:
import altair as alt
# Load the country/state-level series with `date` parsed as datetimes, then
# keep only the rows whose country is Australia for plotting.
country_grouped = pd.read_csv(
    'COVID19_time_series_country_state.csv',
    parse_dates=['date'],
)
country = country_grouped.loc[country_grouped['country'].eq('Australia')]

In [96]:
# Shared base chart: one line per state, with dates on an ordinal x axis.
base = alt.Chart(country).mark_line().encode(
    x='monthdate(date):O',  # ordinal value
    color="state:N",  # nominal value
).properties(width=300)

# Parenthesised chaining is used instead of a trailing backslash: a backslash
# line continuation cannot be followed by a comment, so writing
# `\ # quantitative value` is a SyntaxError.
# Despite their names, both charts plot cumulative totals.
new_cases_chart = (
    base.encode(y="confirmed:Q")  # quantitative value
    .properties(title="Total Confirmed")
)

new_deaths_chart = (
    base.encode(y="deaths:Q")
    .properties(title="Total Deaths")
)

# Show the two charts side by side.
new_cases_chart | new_deaths_chart

In [98]:
# red = alt.value('#f54242')
# (base.encode(y="new_cases").properties(title="Daily New Cases") & \
#  base.encode(y="new_deaths", color=red).properties(title="Daily new deaths")) | \
# (base.encode(y="confirmed").properties(title="Total confirmed") \
#  & base.encode(y="deaths", color=red).properties(title="Total deaths"))

In [99]:
# alt.Chart(aus).mark_bar().encode(
#     x='monthdate(date):O',
#     y='new_cases:Q'
# ).configure_mark(
#     opacity=0.5,
#     color='red'
# ) & \
# alt.Chart(aus).mark_bar().encode(
#     x='monthdate(date):O',
#     y='new_cases:Q'
# ).configure_mark(
#     opacity=0.5,
#     color='red'
# )