In [None]:
# warnings
import warnings
warnings.filterwarnings('ignore')

# import modules
import pandas as pd
import numpy as np
import datetime
import glob
import os
from datetime import timedelta

# sklearn
from sklearn.preprocessing import minmax_scale


# plots
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so use whichever spelling
# the installed version actually provides.
matplotlib.style.use(
    'seaborn-ticks'
    if 'seaborn-ticks' in matplotlib.style.available
    else 'seaborn-v0_8-ticks'
)

## `read COVID-19 datasets`

In [None]:
# Collect every CSV file that lives in the local ./data folder.
path = './data/*.csv'
files = glob.glob(path)
print(files)

In [None]:
# Load every CSV into a dict keyed by the file's base name (the text before
# the first dot), e.g. './data/covid_19_data.csv' -> 'covid_19_data'.
datasets = {}
for f in files:
    # os.path.basename understands the platform's path separator; splitting on
    # '\\' only worked on Windows and produced an empty key everywhere else,
    # so every file overwrote the same '' entry.
    filename = os.path.basename(f).split('.')[0]
    datasets[filename] = pd.read_csv(f, encoding='utf-8')

In [None]:
# Name (file base name) of the dataset every following cell works on.
key = 'covid_19_data'
# Preview the first rows of that raw dataset.
datasets[key].head()

In [None]:
# Work with lower-case column names from here on.
datasets[key].columns = datasets[key].columns.str.lower()

In [None]:
# A missing count is treated as zero, and the three counters are stored as
# whole numbers.
cols = ['confirmed', 'deaths', 'recovered']
datasets[key][cols] = datasets[key][cols].fillna(0).astype(int)

datasets[key].head(1)

In [None]:
# build date
# Parse the 'observationdate' column into pandas datetimes and keep the result
# in a new 'date' column (the original column is left untouched).
datasets[key]['date'] = pd.to_datetime(datasets[key]['observationdate'])

In [None]:
# Collapse repeated, leading and trailing whitespace in the country names so
# that differently spaced spellings end up in the same group.
datasets[key]['country/region'] = datasets[key]['country/region'].map(
    lambda name: ' '.join(name.split())
)

In [None]:
# One row per country and date: add up the counters of all rows that share
# the same country/date pair.
df = (
    datasets[key]
    .groupby(['country/region', 'date'], as_index=False)[
        ['confirmed', 'deaths', 'recovered']
    ]
    .sum()
)

df.shape

In [None]:
# Preview the per-country, per-date totals built in the previous cell.
df.head()

In [None]:
# fix names
df['country/region'] = df['country/region'].replace(
    {
        'US': 'United States',
        'Mainland China': 'China',
        'UK': 'United Kingdom',
        "('St. Martin',)": 'St. Martin',
    }
)

# Renaming happens after the aggregation, so two source spellings may now
# share one name and leave several rows for the same (country, date). Add
# them together so that every later cell can rely on exactly one row per
# country and day.
df = df.groupby(['country/region', 'date'], as_index=False)[
    ['confirmed', 'deaths', 'recovered']
].sum()

In [None]:
# How many distinct country names the dataset contains.
n_countries = len(df['country/region'].unique())
print(f"Number of countries: {n_countries}")

In [None]:
# Order the rows by country and, within a country, chronologically; the old
# index is discarded.
df = (
    df
    .sort_values(by=['country/region', 'date'])
    .reset_index(drop=True)
)
df.shape

In [None]:
# Preview the frame after renaming and sorting.
df.head()

In [None]:
# Fix dates -> adding missing intervals
#
# Every country ends up with one row per calendar day of the observed period:
#   * days before its first report get zeros,
#   * later gaps repeat the last reported values,
#   * 'confirmed' is forced to be non-decreasing.
value_cols = ['confirmed', 'deaths', 'recovered']
store_frames = []

# create time frame (a list of Timestamps; its length is reused by later cells)
default = list(
    pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
)

# iterate over countries
for country in df['country/region'].unique():
    d = df[df['country/region'] == country]
    first_report = d['date'].min()

    # Left-merge onto the full calendar. This always produces a fresh frame,
    # so the assignments below never write into a slice of `df`, and it is a
    # no-op for countries that already have every day.
    d = pd.DataFrame({'date': default, 'country/region': country}) \
        .merge(d, how='left', on=['date', 'country/region']) \
        .reset_index(drop=True)

    # no cases before the first report (single .loc write instead of chained
    # `d[col].iloc[row] = 0`, which is silently lost under copy-on-write)
    d.loc[d['date'] < first_report, value_cols] = 0

    # carry the last known totals across missing days
    d[value_cols] = d[value_cols].ffill().astype(int)

    # Fix confirmed cases: a cumulative count can never go down, so each value
    # is raised to the running maximum
    d['confirmed'] = d['confirmed'].cummax()

    # store frame
    store_frames.append(d)

# concat frames
df = pd.concat(store_frames, sort=True) \
    .sort_values(by=['country/region', 'date']) \
    .reset_index(drop=True)

# add outbreak
df['outbreak'] = 'COVID-19'
df.shape

In [None]:
# Preview the gap-filled frame.
df.head()

### `Get cases and cumsum`

In [None]:
# Per country, derive:
#   cases              - new confirmed cases per day (first day: its whole total)
#   cumsum             - running total of 'cases' (equals 'confirmed')
#   day_number         - 1-based position of the row inside the country series
#   day_number_country - number of earlier days on which the country already
#                        had at least one confirmed case
store_frames = []
for country in df['country/region'].unique():
    d = df[df['country/region'] == country] \
        .sort_values(by='date') \
        .reset_index(drop=True)
    confirmed = d['confirmed']

    # diff() leaves the first row empty; it has no predecessor, so everything
    # confirmed on that day counts as new
    d['cases'] = confirmed.diff().fillna(confirmed.iloc[0]).astype(int)
    d['cumsum'] = d['cases'].cumsum()

    # sized from the frame itself rather than from `default` of another cell,
    # so a country with an unexpected number of rows cannot raise here
    d['day_number'] = range(1, len(d) + 1)

    # shift by one row: the counter only advances on the day *after* a day
    # with confirmed cases
    d['day_number_country'] = (confirmed > 0).cumsum().shift(1, fill_value=0)
    store_frames.append(d)

# concat data
df = pd.concat(store_frames, sort=True) \
    .reset_index(drop=True)

df.shape

In [None]:
# ISO week number of every date. `Series.dt.week` was deprecated in pandas 1.1
# and removed in 2.0; it is only used as a fallback on releases that do not
# have `dt.isocalendar()` yet.
try:
    df['week'] = df['date'].dt.isocalendar().week.astype('int64')
except AttributeError:
    df['week'] = df['date'].dt.week

In [None]:
# Restore the country/date ordering (the per-country frames were concatenated
# in the order countries were encountered) and renumber the index.
df = df.sort_values(by=['country/region', 'date']) \
    .reset_index(drop=True)
df.shape

In [None]:
# First two rows of the enriched frame.
df.head(2)

In [None]:
# Last two rows of the enriched frame.
df.tail(2)

## `group by country`

In [None]:
# Ten countries with the largest total of daily new cases.
(
    df.groupby('country/region')['cases']
    .sum()
    .reset_index()
    .sort_values(by='cases', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['country/region', 'cases']]
    .head(10)
)

In [None]:
# Same ranking restricted to a hand-picked list of Latin American countries.
latam = [
    'Brazil', 'Chile', 'Mexico', 'Colombia',
    'Peru', 'Ecuador', 'Panama', 'Argentina',
    'Venezuela', 'Costa Rica',
]
(
    df[df['country/region'].isin(latam)]
    .groupby('country/region')['cases']
    .sum()
    .reset_index()
    .sort_values(by='cases', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['country/region', 'cases']]
    .head(10)
)

## `add new data  ---> manually`

In [None]:
# Show a tuple with the last 'confirmed' value stored for Colombia and the
# last 'date' stored for Ecuador.
# NOTE(review): the two elements come from different countries - confirm that
# mixing Colombia and Ecuador here is intended.
df[df['country/region'] == 'Colombia']['confirmed'].iloc[-1],df[df['country/region'] == 'Ecuador']['date'].iloc[-1]

In [None]:
def add_new_data(d, main):
    '''
    Append manually collected observations to the per-country time series.

    Parameters
    ----------
    d : list of dict (anything accepted by ``pd.DataFrame``)
        One record per country and date with the keys 'date',
        'country/region', 'confirmed', 'deaths' and 'recovered'. Several
        dates per country are allowed.
    main : pandas.DataFrame
        Existing series holding, besides the columns above, 'cases',
        'cumsum', 'day_number' and 'day_number_country'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``main`` plus the new rows, sorted by country and date.

    Raises
    ------
    IndexError
        If a record refers to a country that has no rows in ``main``.
    '''
    m = main.copy()
    d = pd.DataFrame(d)

    # nothing to add: hand back the (sorted) copy instead of failing on the
    # missing 'date' column
    if d.empty:
        return m.sort_values(by=['country/region', 'date']) \
                .reset_index(drop=True)

    d['date'] = pd.to_datetime(d['date'])
    d['outbreak'] = 'COVID-19'

    # `Series.dt.week` was removed in pandas 2.0; only fall back to it on
    # releases that predate `dt.isocalendar()`
    try:
        d['week'] = d['date'].dt.isocalendar().week.astype('int64')
    except AttributeError:
        d['week'] = d['date'].dt.week

    frames = []
    for c in d['country/region'].unique():

        # existing history of the country, oldest first
        data = m[m['country/region'] == c].sort_values(by='date')
        if data.empty:
            raise IndexError(
                f"no existing rows for {c!r} in main; cannot extend its series"
            )

        # explicit copy so the column assignments below do not write into a
        # slice of `d`
        temp = d[d['country/region'] == c].sort_values(by='date').copy()

        # confirmed total of the day before each new row
        confirmed = temp['confirmed'].tolist()
        previous = [data['confirmed'].iloc[-1]] + confirmed[:-1]

        # cumsum, cases
        temp['cases'] = [cur - prev for cur, prev in zip(confirmed, previous)]
        temp['cumsum'] = confirmed

        # consecutive day numbers continuing the existing series
        last_day = data['day_number'].iloc[-1]
        temp['day_number'] = [last_day + k for k in range(1, len(temp) + 1)]

        # same rule as the main pipeline: the counter only advances when the
        # previous day already had confirmed cases
        counter = data['day_number_country'].iloc[-1]
        day_number_country = []
        for prev in previous:
            if prev > 0:
                counter += 1
            day_number_country.append(counter)
        temp['day_number_country'] = day_number_country

        frames.append(temp)

    d = pd.concat(frames, sort=True) \
        .reset_index(drop=True)

    return pd.concat([m, d], sort=True) \
            .sort_values(by=['country/region', 'date']) \
            .reset_index(drop=True)

In [None]:
# Hand-entered observations, one record per country, in the shape expected by
# add_new_data: date, country name and the three counters.
new_data = [
    dict(zip(
        ('date', 'country/region', 'confirmed', 'deaths', 'recovered'),
        record,
    ))
    for record in (
        ('2020-03-27', 'Mexico', 585, 8, 4),
        ('2020-03-27', 'Colombia', 491, 6, 8),
    )
]

In [None]:
# Append the hand-entered rows to the main frame.
# NOTE: this rebinds `df` to the enlarged frame, so running the cell a second
# time appends the same rows again.
df = add_new_data(new_data, df)

In [None]:
# Size of the frame after the manual rows were added.
df.shape

## `new confirmed cases`

In [None]:
# Countries shown in the per-country panels below.
c1 = ['Mexico', 'Colombia']

In [None]:
# One panel per selected country with its daily new cases over time; every
# panel gets its own y axis.
tmp = df.loc[df['country/region'].isin(c1)]

g = sns.FacetGrid(
    tmp,
    col='country/region',
    hue='country/region',
    col_wrap=2,
    sharey=False,
    height=6,
    aspect=1.7,
).map(plt.plot, 'date', 'cases')
g.set_xticklabels(rotation=90);

In [None]:
# Reminder of the available columns.
df.head(1)

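`compare countries by days since their first confirmed case (originally against Italy; the cells below compare Mexico with Colombia)`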

In [None]:
# Confirmed totals of two countries, indexed by the number of days since each
# country's first confirmed case (day 0 and earlier are left out).
x1, y1 = 'Mexico', 'Colombia'
after_first_case = df['day_number_country'] > 0

x = df[(df['country/region'] == x1) & after_first_case] \
    .groupby('day_number_country')[['confirmed']] \
    .sum()
y = df[(df['country/region'] == y1) & after_first_case] \
    .groupby('day_number_country')[['confirmed']] \
    .sum()

In [None]:
# Confirmed cases of both countries against days since the first case, on a
# symmetric-log y axis.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(x, label=x1, linewidth=1.5, alpha=0.8, color='#0BBA17')
ax.plot(y, label=y1, linewidth=1.5, alpha=0.8, color='yellow')
ax.tick_params(direction='in', color='#070733', labelsize=10, labelrotation=0, pad=10)
ax.set_ylabel('Casos confirmados', labelpad=35)
ax.set_xlabel('Numero de dias', labelpad=35)
# 'nonposy' belongs to the plain 'log' scale, not to 'symlog' (which already
# handles zero), and current matplotlib raises TypeError for the unknown
# keyword - so it is dropped.
ax.set_yscale('symlog')
plt.box(False)
ax.legend();

In [None]:
# Bar chart of Colombia's confirmed total for day numbers 1 to 21 after its
# first confirmed case.
is_colombia = df['country/region'].eq('Colombia')
country_day = df['day_number_country']

df[is_colombia & country_day.gt(0) & country_day.lt(22)].plot(
    x='day_number_country', y='confirmed', kind='bar'
)

In [None]:
# Mean daily new cases per date for Mexico (g1) and Colombia (g2), indexed by
# date so the plotting cells can use matplotlib's date locators/formatters.
#
# Both series are taken from `df`: no executable cell defines `MEX`, and `df`
# only has lower-case column names ('country/region', 'date', 'cases'), so the
# previous 'Country/Region' / 'Date' / 'Cases' lookups could not succeed.
g1 = df[df['country/region'] == 'Mexico'].groupby('date') \
    .agg(
        {
            'cases': 'mean'
        }
    )
g1.index = pd.DatetimeIndex(g1.index)

g2 = df[df['country/region'] == 'Colombia'].groupby('date') \
    .agg(
        {
            'cases': 'mean'
        }
    )
g2.index = pd.DatetimeIndex(g2.index)

In [None]:
# Latest rows for Mexico. Taken from `df` because no executable cell of this
# notebook defines a separate `MEX` frame.
df[df['country/region'] == 'Mexico'].tail()

In [None]:
# Daily new cases in Mexico over time.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(
    g1,
    label='Mexico',
    color='#09799C',
    marker='.',
    linewidth=2,
    alpha=0.7,
)

# x axis: one major tick every five days, labelled like "Mar 05"
ax.xaxis.set_major_locator(mdates.DayLocator(interval=5))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

# cosmetics: outward ticks, large labels, legend, no axis title, no frame
ax.tick_params(direction='out', color='#070733', labelsize=16, rotation=0, pad=20)
ax.legend(loc='best', fontsize=16)
ax.set_xlabel('')
plt.box(False);

In [None]:
# Daily new cases in Colombia over time.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(
    g2,
    label='Colombia',
    color='#FDA50F',
    marker='.',
    linewidth=2,
    alpha=0.7,
)

# x axis: one major tick every five days, labelled like "Mar 05"
ax.xaxis.set_major_locator(mdates.DayLocator(interval=5))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

# cosmetics: outward ticks, large labels, legend, no axis title, no frame
ax.tick_params(direction='out', color='#070733', labelsize=16, rotation=0, pad=20)
ax.legend(loc='best', fontsize=16)
ax.set_xlabel('')
plt.box(False);

    # Archived snippet: builds a Mexico-only frame `MEX` with capitalised
    # column names, adds one hand-entered observation for 2020-03-20, fills
    # the calendar from 2020-01-22 to 2020-03-20 and min-max scales the
    # confirmed counts.
    # NOTE(review): `MEX` is read on the first statement but is not created
    # anywhere in the visible part of the notebook - confirm where it is
    # supposed to come from before running this.
    # NOTE(review): `DataFrame.append` no longer exists in pandas 2.x;
    # `pd.concat` is the replacement.
    MEX = MEX.append(pd.Series(
        {
            'SNo': 0,
            'ObservationDate': '03/20/2020',
            'Province/State': None,
            'Country/Region': 'Mexico',
            'Last Update': '2020-03-20T10:53:02',
            'Confirmed': 164,
            'Deaths': 1,
            'Recovered': 4,
            'Date': '2020-03-20'
        }
    ), ignore_index=True)

    # 'Date' is rebuilt for every row from 'ObservationDate', which overwrites
    # the 'Date' value given in the appended record.
    MEX['Date'] = pd.to_datetime(MEX['ObservationDate'])

    MEX = MEX.sort_values(by=['Country/Region', 'Date']) \
        .reset_index(drop=True)
    MEX.shape

    # One row per country and date; only the three counters survive.
    MEX = MEX.groupby(['Country/Region', 'Date']) \
        .agg(
            {
                'Confirmed': sum,
                'Deaths': sum,
                'Recovered': sum
            }
        ) \
        .reset_index()

    MEX.shape

    # Fix dates -> adding missing intervals
    store_frames = []

    # create time frame
    # Fixed calendar, one Timestamp per day; this rebinds the name `default`.
    default = pd.date_range(
        start=pd.to_datetime('2020-01-22'),
        end=pd.to_datetime('2020-03-20'),
        freq='D'
    ).values
    default = [pd.to_datetime(i) for i in default]


    # Position of the first reported date inside the calendar.
    serie = sorted(MEX['Date'].tolist())
    s1 = serie[0]
    idx = default.index(s1)

    # check if series match in length
    match = len(serie) == len(default)
    if not match:

        # temporal dataframe
        tmp = pd.DataFrame(
            {
                'Date': default
            }
        )
        # NOTE(review): `country` is not assigned in this snippet, so it holds
        # whatever an earlier cell left in it; presumably 'Mexico' was meant -
        # confirm, because the merge below also matches on this column.
        tmp['Country/Region'] = country

        # merge frames
        MEX = tmp.merge(MEX, how='left') \
            .reset_index(drop=True)

        # iterate rows
        # Rows before the first reported date get zero counts.
        for row in range(MEX.shape[0]):
            if row < idx:
                MEX['Confirmed'].iloc[row] = 0
                MEX['Deaths'].iloc[row] = 0
                MEX['Recovered'].iloc[row] = 0
            else:
                break

        # confirmed
        MEX['Confirmed'] = MEX['Confirmed'].fillna(method='ffill')
        MEX['Confirmed'] = MEX['Confirmed'].astype(int)

        # deaths
        MEX['Deaths'] = MEX['Deaths'].fillna(method='ffill')
        MEX['Deaths'] = MEX['Deaths'].astype(int)

        # recovered
        MEX['Recovered'] = MEX['Recovered'].fillna(method='ffill')
        MEX['Recovered'] = MEX['Recovered'].astype(int)

        # Fix confirmed cases
        # Each value is raised to at least the previous stored value, so the
        # series never decreases.
        values = MEX['Confirmed'].tolist()
        store_values = []
        i = 0
        N = MEX.shape[0]
        store_values.append(values[i])
        for j in values:
            if i + 1 != N:
                v1 = store_values[i]
                v2 = values[i + 1]
                if v2 < v1:
                    v2 = v1

                # store value
                store_values.append(v2)
                i += 1

        # add values
        MEX['Confirmed'] = store_values

        # Min Max scale
        # This still sits inside `if not match:`, so 'Confirmed_scale' only
        # exists when the calendar had to be filled.
        MEX['Confirmed_scale'] = minmax_scale(MEX['Confirmed'])


    MEX['Outbreak'] = 'COVID-19'
    MEX.shape