In [None]:
# warnings
import warnings
warnings.filterwarnings('ignore')

# import modules
import os
import glob
import json
import random
import requests
import pandas as pd
import numpy as np

from scipy import stats
from datetime import timedelta
from collections import Counter

# plots
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns; sns.set()

%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this install provides.
matplotlib.style.use(
    'seaborn-ticks'
    if 'seaborn-ticks' in matplotlib.style.available
    else 'seaborn-v0_8-ticks'
)

### `load data`

`COVID data`

In [None]:
# read the raw COVID-19 observations
covid_path = './data/covid_19_data.csv'
covid_data = pd.read_csv(covid_path, encoding='utf-8')

# normalise every header to lower case
covid_data.columns = list(map(str.lower, covid_data.columns))

# counters: missing values -> 0, then cast the three columns to INT in one pass
cols = ['confirmed', 'deaths', 'recovered']
covid_data[cols] = covid_data[cols].fillna(0).astype(int)

# parse the observation date into a datetime column
covid_data['date'] = pd.to_datetime(covid_data['observationdate'])

# collapse repeated / leading / trailing whitespace in country names
covid_data['country/region'] = covid_data['country/region'].map(
    lambda name: ' '.join(name.split())
)

# sample
covid_data.head(1)

`Group data`

In [None]:
# Collapse the rows into one record per (country, date) by summing the counters.
# The aggregation is named by string ('sum') instead of passing the Python
# builtin `sum`: pandas >= 2.1 emits a FutureWarning for the builtin and will
# eventually stop mapping it onto the optimised groupby sum.
df = (
    covid_data
    .groupby(['country/region', 'date'])
    .agg({'confirmed': 'sum', 'deaths': 'sum', 'recovered': 'sum'})
    .reset_index()
)

df.shape

In [None]:
# fix names
df.loc[df['country/region'] == 'US', 'country/region'] = 'United States'
df.loc[df['country/region'] == 'Mainland China', 'country/region'] = 'China'
df.loc[df['country/region'] == 'UK', 'country/region'] = 'United Kingdom'
df.loc[df['country/region'] == "('St. Martin',)", 'country/region'] = 'St. Martin'

In [None]:
# peek at the first row of the grouped frame
df.head(1)

`Create Mex dataset`

In [None]:
# keep only the rows labelled 'Mexico', re-indexed from zero, as an independent copy
covid_mex = (
    df.loc[df['country/region'].eq('Mexico')]
    .reset_index(drop=True)
    .copy()
)

# size
covid_mex.shape

In [None]:
def fix_date_and_values(data, drop_index=15, cutoff_date='2020-03-12'):
    '''
    Return a corrected copy of ``data``; the input frame is not modified.

    Two corrections are applied, in this order:

    1. the row labelled ``drop_index`` is removed and the index is rebuilt
       as 0..n-1;
    2. every value of the ``date`` column whose calendar day is strictly
       after ``cutoff_date`` is moved one day back.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``date`` column.
    drop_index : index label or None, default 15
        Label of the row to discard. ``None`` skips the removal.
    cutoff_date : str or datetime-like, default '2020-03-12'
        Last calendar day that is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the corrections applied.

    Raises
    ------
    KeyError
        If ``drop_index`` is not a label of ``data``.
    '''
    cutoff = pd.Timestamp(cutoff_date)

    # drop() already returns a new frame, so the caller's data stays intact
    if drop_index is None:
        fixed = data.copy()
    else:
        fixed = data.drop(drop_index).reset_index(drop=True)

    # compare calendar days as timestamps (time of day ignored) rather than
    # relying on the lexical order of the dates' string form
    late = fixed['date'].dt.normalize() > cutoff
    fixed.loc[late, 'date'] = fixed.loc[late, 'date'] - pd.Timedelta(days=1)
    return fixed

In [None]:
# fix confirmed values in Mex
covid_mex.loc[(covid_mex['country/region'] == 'Mexico') & (covid_mex['date'] == '2020-02-28'), 'confirmed'] = 3
covid_mex.loc[(covid_mex['country/region'] == 'Mexico') & (covid_mex['date'] == '2020-03-11'), 'confirmed'] = 11
covid_mex.loc[(covid_mex['country/region'] == 'Mexico') & (covid_mex['date'] == '2020-03-12'), 'confirmed'] = 15

In [None]:
# apply the row removal and date shift implemented by fix_date_and_values
covid_mex = fix_date_and_values(covid_mex)
covid_mex.shape

*sort values*

In [None]:
# order the rows by country, then chronologically, and rebuild the index
covid_mex = (
    covid_mex
    .sort_values(by=['country/region', 'date'])
    .reset_index(drop=True)
)

# size
covid_mex.shape

In [None]:
# Fix dates -> adding missing intervals
default_date = '2020-01-22'

# create time frame
default = pd.date_range(
    start=pd.to_datetime(default_date),
    end=sorted(covid_mex['date'].tolist())[-1],
    freq='D'
).values


# temp dataframe
tmp = pd.DataFrame(
    {
        'date': default
    }
)
tmp['country/region'] = 'Mexico'


covid_mex = tmp.merge(covid_mex, how='left') \
        .fillna(0) \
        .reset_index(drop=True)

# astype int
covid_mex['confirmed'] = covid_mex['confirmed'].astype(int)
covid_mex['deaths'] = covid_mex['deaths'].astype(int)
covid_mex['recovered'] = covid_mex['recovered'].astype(int)

# add outbreak
covid_mex['outbreak'] = 'COVID-19'

In [None]:
# first rows of the Mexico frame
covid_mex.head()

`add new data  ---> manually`

*verify status*

In [None]:
# last (date, confirmed) pair currently stored for Mexico
covid_mex.loc[covid_mex['country/region'] == 'Mexico', ['date', 'confirmed']].tail(1)

In [None]:
# single hand-entered record for 2020-04-05, built column by column
new_data = pd.DataFrame(
    {
        'date': [pd.to_datetime('2020-04-05')],
        'country/region': ['Mexico'],
        'confirmed': [2143],
        'deaths': [94],
        'recovered': [633],
        'outbreak': ['COVID-19'],
    }
)

In [None]:
# append the manual record and renumber the rows from zero
covid_mex = pd.concat([covid_mex, new_data], sort=True, ignore_index=True)

# size
covid_mex.shape

`get diff and cumsum`

In [None]:
# row-over-row difference of the confirmed count (first row -> 0), as INT
covid_mex['cases'] = (
    covid_mex['confirmed']
    .diff()
    .fillna(0)
    .astype(int)
)

In [None]:
# running total of the 'cases' column
covid_mex['cumsum'] = covid_mex['cases'].cumsum()

`confirmed and cases pct change`

In [None]:
# relative row-over-row change of 'confirmed'; infinite and missing values -> 0
covid_mex['confirmed_pct_change'] = (
    covid_mex['confirmed']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [None]:
# magnitude of the relative row-over-row change of 'cases';
# infinite and missing values -> 0
covid_mex['cases_pct_change'] = (
    covid_mex['cases']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .abs()
)

`get week`

In [None]:
# ISO week number of each date. `Series.dt.week` was deprecated in pandas 1.1
# and removed in 2.0, so the week is taken from each timestamp's
# `isocalendar()` tuple, which is available on every pandas version.
covid_mex['week'] = covid_mex['date'].map(lambda day: day.isocalendar()[1])

`get number of days`

In [None]:
# running count of rows with confirmed > 0: stays at 0 until the first such
# row, then grows by one for every row that has confirmed cases
n_days = covid_mex['confirmed'].gt(0).cumsum().tolist()

covid_mex['day_number'] = covid_mex.index + 1
covid_mex['day_number_country'] = n_days

# sample
covid_mex.shape

In [None]:
# first two rows, now including the week and day-number columns
covid_mex.head(2)

`get logs`

In [None]:
def get_logs(values):
    '''
    Natural logarithm of ``values`` with undefined results replaced by 0.

    ``np.ma.log`` masks every entry whose logarithm is undefined or not
    finite (zero, negative, NaN); those entries are filled with 0 instead
    of producing ``-inf`` / ``nan``.

    Parameters
    ----------
    values : pandas.Series or any object exposing ``.tolist()``
        Numbers to transform.

    Returns
    -------
    pandas.Series
        Float series of the same length. When ``values`` is a Series its
        index is kept, so assigning the result back to the source frame
        aligns row by row even if the frame is not indexed 0..n-1; any
        other input gets a default 0..n-1 index.
    '''
    logs = np.ma.log(values.tolist())
    index = values.index if isinstance(values, pd.Series) else None
    return pd.Series(logs.filled(0), index=index)

#### `diff() - logratio: alternative`

    covid_mex['test'] = np.log(covid_mex['confirmed'] / covid_mex['confirmed'].shift()) \
        .replace([np.inf, -np.inf], np.nan) \
        .fillna(0)
        
    ...
    
    covid_mex['cases_logratio'] = abs(covid_mex['cases_log'].diff()).fillna(0) \
    .astype(float)

_by new cases_

In [None]:
# natural log of 'cases'; entries with no valid log are stored as 0 (see get_logs)
covid_mex['cases_log'] = get_logs(covid_mex['cases'])

In [None]:
# absolute row-over-row change of 'cases_log' (first row -> 0)
covid_mex['cases_logratio'] = (
    covid_mex['cases_log']
    .diff()
    .abs()
    .fillna(0)
    .astype(float)
)

_by confirmed cases_

In [None]:
# natural log of 'confirmed'; entries with no valid log are stored as 0 (see get_logs)
covid_mex['confirmed_log'] = get_logs(covid_mex['confirmed'])

In [None]:
# absolute row-over-row change of 'confirmed_log' (first row -> 0)
covid_mex['confirmed_logratio'] = (
    covid_mex['confirmed_log']
    .diff()
    .abs()
    .fillna(0)
    .astype(float)
)

_by confirmed pct change_

In [None]:
# magnitude of the log of 'confirmed_pct_change' (entries with no valid log stay 0)
covid_mex['confirmed_pct_change_log'] = get_logs(covid_mex['confirmed_pct_change']).abs()

In [None]:
# absolute row-over-row change of 'confirmed_pct_change_log' (first row -> 0)
covid_mex['confirmed_pct_change_logratio'] = (
    covid_mex['confirmed_pct_change_log']
    .diff()
    .abs()
    .fillna(0)
    .astype(float)
)

_by cases pct change_

In [None]:
# magnitude of the log of 'cases_pct_change' (entries with no valid log stay 0)
covid_mex['cases_pct_change_log'] = get_logs(covid_mex['cases_pct_change']).abs()

In [None]:
# absolute row-over-row change of 'cases_pct_change_log' (first row -> 0)
covid_mex['cases_pct_change_logratio'] = (
    covid_mex['cases_pct_change_log']
    .diff()
    .abs()
    .fillna(0)
    .astype(float)
)

`test plot`

In [None]:
# list every column now available for plotting
print (covid_mex.columns.tolist())

In [None]:
# fields to compare
x_field, y_field = 'day_number_country', 'cases_pct_change_logratio'

# params
_dates, _type = False, 'line'

# temp df: only the rows with confirmed cases
tmp = covid_mex.loc[covid_mex['confirmed'] > 0].copy()

# dimensions
fig, ax = plt.subplots(figsize=(16, 8))

# plot: scatter of the two fields with a fitted regression line
sns.regplot(data=tmp, x=x_field, y=y_field, ax=ax)

# cosmetics: outward ticks, blank axis labels, no frame
ax.tick_params(direction='out', color='#070733', labelsize=15, labelrotation=0, pad=15)
ax.set(xlabel='', ylabel='')
plt.box(False);

`save dataset`

In [None]:
# write the Mexico frame to an Excel workbook, without the index column
covid_mex.to_excel('./data/covid_mex.xlsx', index=False)

## `Plots`

**`New daily cases`**

In [None]:
# tmp: only the rows whose country day counter has started
tmp = covid_mex.loc[covid_mex['day_number_country'] > 0].copy()

# dimensions
fig, ax = plt.subplots(figsize=(16, 8))

# plot: one bar per counted day
tmp.plot.bar(x='day_number_country', y='cases', ax=ax, label='Daily Cases in Mexico')

# cosmetics: outward ticks, labelled axes, no frame, larger legend text
ax.tick_params(direction='out', color='#070733', labelsize=15, labelrotation=0, pad=15)
ax.set_xlabel('Number of days since 1st case in Mexico', labelpad=35, fontsize=18)
ax.set_ylabel('COVID-19 daily cases', labelpad=35, fontsize=18)
plt.box(False)
plt.legend(prop={'size': 15});